In [None]:
from pathlib import Path
import json
from typing import List, Tuple
from transformers import AutoTokenizer

# Module-level tokenizer, loaded once and shared by spans_to_bio(), which asks
# it for character offset mappings (return_offsets_mapping=True).
# NOTE(review): offset mappings are presumably only available from the "fast"
# tokenizer implementation — confirm AutoTokenizer resolves to one here.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def spans_to_bio(text: str, spans: List[dict]) -> List[Tuple[str, str]]:
    """Convert character-level span annotations into subword-level BIO tags.

    The text is tokenized with the module-level ``tokenizer`` (no special
    tokens) and each subword is matched against the spans through its
    character offsets. Subwords that are empty, or that are neither
    alphanumeric nor ``##`` continuation pieces (e.g. punctuation), are
    removed from the result.

    Args:
        text: Raw text to tokenize.
        spans: Annotation dicts, each providing ``"start"`` and ``"end"``
            character offsets (a subword starting at or after ``end`` is
            outside the span) and a ``"label"`` string.

    Returns:
        ``(subword, tag)`` pairs for the retained subwords, in text order.
        Tags are ``"O"``, ``"B-<label>"`` or ``"I-<label>"``. When spans
        overlap, the span listed later in ``spans`` wins.
    """
    # The tokenizer may warn that the sequence exceeds the model's maximum
    # length; the ids are only mapped back to token strings here and are never
    # fed to a model, so long inputs are handled in full.
    encoding = tokenizer(
        text,
        return_offsets_mapping=True,
        add_special_tokens=False,
        return_attention_mask=False,
    )
    offsets = encoding["offset_mapping"]
    subwords = tokenizer.convert_ids_to_tokens(encoding["input_ids"])

    # Decide which subwords survive the cleaning filter *before* tagging, so
    # the "B-" tag always lands on the first subword that is actually emitted.
    # Tagging first and filtering afterwards would drop the "B-" together with
    # a leading punctuation token and leave an orphan "I-" in the output.
    kept = [
        bool(token.strip()) and (token.isalnum() or token.startswith("##"))
        for token in subwords
    ]
    tags = ["O"] * len(offsets)

    for span in spans:
        start, end, label = span["start"], span["end"], span["label"]
        in_span = False
        for i, (tok_start, tok_end) in enumerate(offsets):
            if tok_end <= start:
                # Subword ends before the span begins.
                continue
            if tok_start >= end:
                # Subwords come back in text order, so nothing further along
                # can overlap this span.
                break
            if not kept[i]:
                # Filtered out of the output; must not consume the "B-" tag.
                continue
            if in_span:
                tags[i] = f"I-{label}"
            else:
                tags[i] = f"B-{label}"
                in_span = True

    return [
        (token, tag)
        for token, tag, keep in zip(subwords, tags, kept)
        if keep
    ]

def process_jsonl_directory(input_dir: str, output_path: str) -> None:
    """Convert every ``*.jsonl`` file in a directory into one BIO-tagged file.

    Each non-blank line of each input file is parsed as a JSON object with a
    ``"text"`` field and an optional ``"spans"`` list, and is converted with
    ``spans_to_bio``. The output holds one ``"<token> <tag>"`` line per
    retained subword, with an empty line after every record.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.jsonl`` files.
        output_path: Destination text file; overwritten if it exists.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"text"`` field.
    """
    all_bio_lines = []

    # Path.glob makes no ordering guarantee; sorting keeps the output
    # reproducible across runs and machines.
    for jsonl_path in sorted(Path(input_dir).glob("*.jsonl")):
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                # Blank lines (e.g. a trailing newline at end of file) are not
                # records; json.loads would raise on them.
                if not line.strip():
                    continue
                obj = json.loads(line)
                bio_tags = spans_to_bio(obj["text"], obj.get("spans", []))
                all_bio_lines.extend(
                    f"{token} {tag}" for token, tag in bio_tags
                )
                # Empty line separates consecutive records.
                all_bio_lines.append("")

    with open(output_path, "w", encoding="utf-8") as f_out:
        f_out.write("\n".join(all_bio_lines))


# Directory scanned for *.jsonl annotation files, and the BIO text file to write.
# NOTE(review): absolute, machine-specific Windows paths — adjust before
# running this anywhere else.
input_folder = r"C:\Users\Administrator\Desktop\Project\Resource\annotations_new"
output_file = r"C:\Users\Administrator\Desktop\Project\bio_dataset_cleaned(1).txt"

# Runs the conversion immediately when this cell executes.
process_jsonl_directory(input_folder, output_file)


  from .autonotebook import tqdm as notebook_tqdm
Token indices sequence length is longer than the specified maximum sequence length for this model (588 > 512). Running this sequence through the model will result in indexing errors
