In [1]:
import json
import re
import regex
import pandas as pd

# Functions for NER Preprocessing

In [2]:
def regex_tokenize(text):
    """
    Split ``text`` into tokens and locate each token in the original string.

    Each punctuation character (Unicode category ``P``) becomes a token of its
    own, while runs of whitespace only separate tokens and are discarded.
    The split pattern is intended to align with the BERT tokenizer.

    Returns a list of (token, start_offset, end_offset) tuples, where
    end_offset is exclusive (start_offset + len(token)).
    """
    # The capturing group makes regex.split hand back the delimiters as well,
    # which is how punctuation survives as standalone tokens.
    pieces = regex.split(r'(\p{P}|\s+)', text)

    located = []
    cursor = 0
    for piece in pieces:
        word = piece.strip()
        if not word:
            # Whitespace delimiters and empty split artefacts carry no token.
            continue
        # Searching from the cursor maps repeated tokens to successive occurrences.
        begin = text.find(word, cursor)
        cursor = begin + len(word)
        located.append((word, begin, cursor))
    return located


def assign_entity_labels(tokens_with_offsets, entities, target_entity):
    """
    Build BIO labels for a single entity type over a tokenized text.

    This function does NOT address overlapping labels and it focuses on one
    target label at a time: if two target entities cover the same token, the
    one with the larger start offset wins because it is processed last.

    Args:
        tokens_with_offsets: list of (token, start_offset, end_offset) tuples,
            with end_offset exclusive.
        entities: list of dicts with 'start_offset', 'end_offset' and 'label'
            keys. 'end_offset' is treated as exclusive, like the token offsets.
        target_entity: the entity label to tag; all other entities are ignored.

    Returns:
        A list with one label per token, in the form
        ['O', 'B-target_entity', 'I-target_entity', 'O', 'O', etc.]
    """
    # Compile once instead of on every token of every entity.
    punctuation = re.compile(r'^[,.:;?_!"()\']$')

    # Keep only the target label, processed in order of start offset.
    target_entities = sorted(
        (e for e in entities if e['label'] == target_entity),
        key=lambda e: e['start_offset'],
    )

    # Initialize all tokens with 'O' label
    token_labels = ['O'] * len(tokens_with_offsets)

    for entity in target_entities:
        entity_start, entity_end = entity['start_offset'], entity['end_offset']
        for i, (token, token_start, token_end) in enumerate(tokens_with_offsets):
            # Punctuation tokens from this fixed set are never labeled.
            if punctuation.match(token):
                continue
            # Half-open interval overlap: a token that starts exactly at
            # entity_end lies outside the entity, hence the strict '<'.
            if token_start < entity_end and token_end > entity_start:
                # A token starting after the entity start continues the entity (I-);
                # one starting at, or straddling, the entity start opens it (B-).
                prefix = 'I' if token_start > entity_start else 'B'
                token_labels[i] = f'{prefix}-{target_entity}'

    return token_labels

# Preprocess Doccano Data

In [19]:
# Dataset to preprocess (file name without extension).
file_name = '1200_agreement_MC'  # alternative dataset: '4800_axiomatic_dataset'

file_path = f'/home/fantoni/patent-sentence-classification/data/{file_name}.jsonl'
label_to_int = {'FUN': 0, 'STR': 1, 'MIX': 2, 'OTH': 3}

data = []
# Read as UTF-8 explicitly so decoding does not depend on the platform's locale default.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        json_obj = json.loads(line)
        text = json_obj["text"]
        entities = json_obj["entities"]
        # Each text is expected to be "<sentence>\t<sent_id>"; any other number
        # of tabs makes the unpacking below raise ValueError.
        sent, sent_id = re.split(r'\t', text)

        # Extract Sentence Tag (the sentence-level labels are the keys of label_to_int)
        sent_tags = [entity['label'] for entity in entities if entity['label'] in label_to_int]
        # Ensure exactly one sentence tag per sentence (zero tags is an error too)
        assert len(sent_tags) == 1, (
            f"Expected exactly one sentence tag for sent_id: {sent_id}, found {len(sent_tags)}"
        )
        sent_tag = sent_tags[0]
        sent_class = label_to_int[sent_tag]  # create numeric class

        # Extract the tokens eliminating the last token for classification [:-1]
        # NOTE(review): this assumes the trailing sent_id tokenizes to a single token - confirm.
        tokens_with_offsets = regex_tokenize(text)
        tokens = [t.lower() for t, _, _ in tokens_with_offsets][:-1]

        # Assign labels to tokens, one BIO sequence per entity type,
        # eliminating the last token for classification [:-1]
        labels = {
            entity_type: assign_entity_labels(tokens_with_offsets, entities, target_entity=entity_type)[:-1]
            for entity_type in ('D', 'A', 'R', 'P', 'AX')
        }

        # Ensure all lists are of the same length
        assert all(len(tags) == len(tokens) for tags in labels.values()), (
            f"Mismatch in the number of tokens and labels for sent_id: {sent_id}"
        )

        # Store data (one flat record per sentence; sequences are serialized as strings)
        data.append({
            "sent_id": sent_id,
            "sent": sent,
            "sent_tag": sent_tag,
            "sent_class": sent_class,
            "words": '<w>'.join(tokens),
            **{f"{entity_type}_labels": ','.join(tags) for entity_type, tags in labels.items()},
        })

df = pd.DataFrame(data)

# Sanity Check

In [20]:
# Sanity checks on the assembled DataFrame: duplicated rows and missing values.

# Duplicates: keep=False flags every row of a repeated (sent_id, sent) pair,
# not only the later occurrences.
duplicates = df[df.duplicated(subset=['sent_id', 'sent'], keep=False)]
num_duplicates = len(duplicates)
if num_duplicates == 0:
    print('No duplicates found.')
else:
    print("Duplicate entries:")
    print(duplicates)

# Missing values: count the null cells of every column.
missing_values = df.isnull().sum()
if missing_values.sum() == 0:
    print("\nNo missing values found.")
else:
    print("\nMissing values in each column:")
    print(missing_values)
    print("\nRows with missing values:")
    print(df[df.isnull().any(axis=1)])

No duplicates found.

No missing values found.


In [None]:
# Class label distribution: absolute count and percentage share of each sentence tag
# ---> The Dataset is NOT BALANCED!
result = (
    df['sent_tag']
    .value_counts()
    .to_frame(name='count')
    # Percentages are aligned to the counts on the tag index.
    .assign(**{'%': (df['sent_tag'].value_counts(normalize=True) * 100).round(2)})
)
print(result)

# Save Data

In [21]:
# Write the preprocessed DataFrame to an Excel workbook named after the dataset.
output_file = f'/home/fantoni/patent-sentence-classification/data/{file_name}.xlsx'
df.to_excel(excel_writer=output_file, index=False)  # index=False: leave out the row index column
print(f"\nDataFrame saved to: {output_file}")


DataFrame saved to: /home/fantoni/patent-sentence-classification/data/1200_agreement_MC.xlsx


# Merge Agreement

In [24]:
# Column renames that suffix every annotation column with its source file
# (presumably MC and ML are the two annotators - confirm), so both sets can
# sit side by side after the merge.
MC_rename_dict = {
    'sent_tag': 'sent_tag_mc',
    'sent_class': 'sent_class_mc',
    'words': 'words_mc',
    'D_labels': 'D_labels_mc',
    'A_labels': 'A_labels_mc',
    'R_labels': 'R_labels_mc',
    'P_labels': 'P_labels_mc',
    'AX_labels': 'AX_labels_mc',
}
ML_rename_dict = {
    'sent_tag': 'sent_tag_ml',
    'sent_class': 'sent_class_ml',
    'words': 'words_ml',
    'D_labels': 'D_labels_ml',
    'A_labels': 'A_labels_ml',
    'R_labels': 'R_labels_ml',
    'P_labels': 'P_labels_ml',
    'AX_labels': 'AX_labels_ml',
}

# Import MC (keeps the 'sent' column as the single copy of the sentence text)
MC_df = pd.read_excel(
    '/home/fantoni/patent-sentence-classification/data/1200_agreement_MC.xlsx'
).rename(columns=MC_rename_dict)

# Import ML ('sent' is dropped so the merge does not produce a duplicated text column)
ML_df = (
    pd.read_excel('/home/fantoni/patent-sentence-classification/data/1200_agreement_ML.xlsx')
    .rename(columns=ML_rename_dict)
    .drop(columns='sent')
)

# Merge Agreement: the inner join keeps only sent_ids present in both files;
# 'agreement' is True where both sentence classes match.
merged_df = pd.merge(MC_df, ML_df, on="sent_id", how="inner")
merged_df['agreement'] = merged_df['sent_class_mc'] == merged_df['sent_class_ml']

# Save
merged_df.to_excel('/home/fantoni/patent-sentence-classification/data/1200_agreement_All.xlsx', index=False)