In [32]:
from datasets import load_dataset
from transformers import AutoTokenizer
import datasets
from transformers import DataCollatorForTokenClassification
from transformers import TFAutoModelForTokenClassification
from transformers import create_optimizer
import tensorflow as tf
import evaluate
import numpy as np
from seqeval.metrics import classification_report as seqeval_classification_report
import pandas as pd
from collections import Counter
import random
import os
import re

# Name of the pretrained checkpoint handed to the tokenizer loader below.
model_checkpoint = "bert-base-cased"
# Tokenizer built from that checkpoint via AutoTokenizer.from_pretrained.
# NOTE(review): not referenced by the preprocessing cells visible here —
# presumably consumed further down the notebook; confirm before removing.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [41]:
# Corpus locations, all derived from the corpus root directory:
#   original_path -> directory searched for the ".ann" annotation files
#   text_path     -> directory listed for the text files
#   output_path   -> token/label file written by the processing loop
base_path = "cadecv2"
original_path, text_path, output_path = (
    os.path.join(base_path, leaf) for leaf in ("original", "text", "train.txt")
)

# Function to parse annotations with semicolon-separated offset fragments
def parse_annotations(file_path):
    """Read entity spans from a tab-separated standoff annotation file.

    Only lines starting with ``T`` are considered. The second tab-separated
    column must look like ``TYPE start end`` or, for an entity made of
    several fragments, ``TYPE start end;start end``. Every fragment is
    returned as its own span.

    Args:
        file_path: Path of the annotation file to read.

    Returns:
        A list of ``(start_offset, end_offset, entity_type)`` tuples in file
        order, with both offsets converted to ``int``.

    Raises:
        ValueError: If an offset fragment is not a pair of integers.
    """
    entities = []
    with open(file_path, "r") as f:
        for line in f:
            if not line.startswith("T"):  # Not an entity line
                continue
            columns = line.rstrip("\r\n").split("\t")
            # Only the second column is needed. Unpacking exactly three
            # columns used to raise ValueError when the covered-text column
            # was missing or itself contained a tab.
            if len(columns) < 2:
                continue
            entity_type, _, offsets = columns[1].strip().partition(" ")
            for offset_range in offsets.split(";"):  # One entry per fragment
                start_offset, end_offset = map(int, offset_range.split())
                entities.append((start_offset, end_offset, entity_type))
    return entities

# Function to create IOB labeling
def create_iob_labels(text, entities):
    """Build a per-character IOB label sequence for ``text``.

    Args:
        text: The document string; one label is produced per character.
        entities: Iterable of ``(start, end, entity_type)`` tuples, where
            ``end`` is exclusive.

    Returns:
        A list of ``len(text)`` labels. The character at ``start`` gets
        ``B-<type>``, the remaining characters up to ``end`` get
        ``I-<type>``, and everything else stays ``"O"``. Spans are applied
        in order, so a later span overwrites an earlier one where they
        overlap.
    """
    text_length = len(text)
    labels = ["O"] * text_length  # Initialize every character with "O"
    for start, end, entity_type in entities:
        # A start outside the text cannot be labelled. Previously a start
        # past the end raised IndexError and a negative start silently
        # wrapped around to a character near the end.
        if not 0 <= start < text_length:
            continue
        labels[start] = f"B-{entity_type}"
        # Truncate spans that run past the end instead of raising.
        for i in range(start + 1, min(end, text_length)):
            labels[i] = f"I-{entity_type}"
    return labels

# Function to tokenize text and align labels
def tokenize_and_label(text, labels):
    """Split ``text`` on whitespace and give each token one label.

    Args:
        text: The document string.
        labels: Per-character labels for ``text``, indexable by character
            offset.

    Returns:
        A list of ``(token, label)`` tuples in document order. A token that
        contains at least one alphanumeric character takes the label of its
        first character, or ``"O"`` when ``labels`` does not reach that
        offset. Tokens without any alphanumeric character always get ``"O"``.
    """
    token_labels = []
    cursor = 0
    for token in text.split():
        # Locate the token's real offset. split() collapses runs of
        # whitespace, so stepping by len(token) + 1 drifts after double
        # spaces, newlines or leading whitespace. Only whitespace can sit
        # between cursor and the token, so the first match is the right one.
        token_start = text.index(token, cursor)
        cursor = token_start + len(token)
        if any(char.isalnum() for char in token):
            token_label = labels[token_start:cursor]
            label = token_label[0] if token_label else "O"
        else:
            label = "O"  # Punctuation-only tokens are never entities
        token_labels.append((token, label))
    return token_labels

# Process files: one "token<TAB>label" row per token, with a single blank
# line after each sentence-final token and after each document.
output_lines = []
# os.listdir returns entries in arbitrary order; sort so train.txt is
# identical from run to run.
for text_file in sorted(os.listdir(text_path)):
    stem, extension = os.path.splitext(text_file)
    if extension != ".txt":
        continue  # Only text files have a matching annotation file
    text_file_path = os.path.join(text_path, text_file)
    # Swap the extension only; str.replace would also rewrite a ".txt"
    # occurring elsewhere in the file name.
    annotation_file_path = os.path.join(original_path, stem + ".ann")

    if not os.path.exists(annotation_file_path):
        continue

    # Read text
    with open(text_file_path, "r") as f:
        text = f.read()

    # Parse annotations and create labels
    entities = parse_annotations(annotation_file_path)
    labels = create_iob_labels(text, entities)
    token_labels = tokenize_and_label(text, labels)

    # Collect output rows
    for token, label in token_labels:
        output_lines.append(f"{token}\t{label}")
        if token.endswith("."):
            # An empty entry becomes exactly one blank line once the rows
            # are joined with "\n". Appending "\n" here produced two.
            output_lines.append("")

    # Close the document so its last sentence does not run into the first
    # sentence of the next file.
    if output_lines and output_lines[-1] != "":
        output_lines.append("")

# Write the output to train.txt
with open(output_path, "w") as f:
    f.write("\n".join(output_lines))

In [42]:
# List of special characters to remove from tokens
special_characters = [".", ","]

# Cleaning process
with open("cadecv2/train.txt", "r") as file:
    lines = file.readlines()

cleaned_lines = []
for line in lines:
    token, tab, label_part = line.partition("\t")
    if not tab:
        # Not a token/label row (e.g. a blank sentence separator): clean the
        # whole line as before and keep it.
        for char in special_characters:
            line = line.replace(char, "")
        cleaned_lines.append(line)
        continue

    # Clean the token column only, so a label can never be altered.
    for char in special_characters:
        token = token.replace(char, "")
    if not token:
        # The token consisted solely of removed characters ("." or ",").
        # Keeping the row would leave a label with no token.
        continue
    cleaned_lines.append(f"{token}\t{label_part}")

# Writing the cleaned data to a new file
with open("train2.txt", "w") as file:
    file.writelines(cleaned_lines)


In [43]:
# Processing the file: split words at their first apostrophe into two rows
with open("train2.txt", "r") as file:
    lines = file.readlines()

processed_lines = []
for line in lines:
    # Remove only the line terminator here. A full strip() would also eat
    # the leading tab of a row whose token is empty, turning it into a bare
    # label line.
    line = line.rstrip("\r\n")
    if not line.strip():  # Blank separator line
        processed_lines.append("\n")
        continue

    if "\t" in line:  # Process only lines with a tab (word-label pairs)
        word, label = line.rsplit("\t", 1)
        word, label = word.strip(), label.strip()
        if not word:
            continue  # A label without a token is not a valid row

        if "'" in word:  # Check if the word contains an apostrophe
            base, suffix = word.split("'", 1)  # Split at the first apostrophe
            suffix_label = label
            if base:
                processed_lines.append(f"{base}\t{label}\n")  # Add the base part
                # The suffix continues the same entity, so a "B-" label
                # becomes "I-" rather than opening a second entity.
                if label.startswith("B-"):
                    suffix_label = "I-" + label[2:]
            # With an empty base (word starts with the apostrophe) no base
            # row is written and the suffix keeps the original label.
            processed_lines.append(f"'{suffix}\t{suffix_label}\n")
        else:
            processed_lines.append(f"{word}\t{label}\n")  # Add the original row
    else:
        processed_lines.append(line.strip() + "\n")  # Add lines without tabs as is

# Writing the processed data to a new file
with open("train3.txt", "w") as file:
    file.writelines(processed_lines)