In [1]:
import json
import re
import regex
import pandas as pd
from transformers import BertTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


# Functions for NER Preprocessing

In [None]:
def split_into_words_with_regex(text, regex_pattern = r'(\p{P}|\s+)'):
    """
    Tokenize `text` into words and record where each word sits in `text`.

    The text is split with `regex.split(regex_pattern, text)`. Every piece is
    stripped of surrounding whitespace and pieces that end up empty are
    dropped. Each remaining piece is then located in `text` with `str.find`,
    searching from the end of the previously located piece.

    Args:
        text: The sentence to tokenize.
        regex_pattern: Split pattern passed to `regex.split`. The default
            splits on single punctuation characters and whitespace runs
            (chosen to align with the BERT tokenizer as much as possible).

    Returns:
        A list of `(word, start_offset, end_offset)` tuples in text order.
        `word` is lowercased; the offsets refer to the original `text`, and
        `end_offset` is `start_offset + len(word)`.
    """
    raw_pieces = regex.split(regex_pattern, text)

    # Strip every piece once and discard the ones that were only whitespace.
    tokens = [stripped for stripped in (piece.strip() for piece in raw_pieces) if stripped]

    located = []
    cursor = 0  # search start: end offset of the previously located token
    for token in tokens:
        begin = text.find(token, cursor)
        cursor = begin + len(token)
        located.append((token.lower(), begin, cursor))
    return located


def assign_entity_labels(words_with_offsets, entities, target_entity):
    """
    Build word-level BIO labels for ONE entity type from Doccano span annotations.

    This function does NOT resolve overlapping spans: if two spans of the
    target label cover the same word, the span processed last wins.

    Args:
        words_with_offsets: List of `(word, start_offset, end_offset)` tuples,
            expected in text order (as produced by `split_into_words_with_regex`).
        entities: List of dicts with the keys `'label'`, `'start_offset'` and
            `'end_offset'`. The end offset is treated as exclusive
            (assumption based on the Doccano export format - verify if the
            annotation source changes).
        target_entity: The only label to process; spans with any other label
            are ignored.

    Returns:
        A list with one label per word, e.g.
        `['O', 'B-target_entity', 'I-target_entity', 'O', ...]`.
    """
    # Punctuation marks are sometimes included in a span by annotators
    # (e.g. "piston." instead of "piston"). Single-character punctuation
    # tokens are therefore never labelled. Compiled once per call rather than
    # once per (entity, word) pair.
    punctuation_only = re.compile(r'^[,.:;?_!"()\']$')

    # Initialize all words with the 'O' label.
    word_labels = ['O'] * len(words_with_offsets)

    for entity in entities:
        if entity['label'] != target_entity:
            continue
        entity_start, entity_end = entity['start_offset'], entity['end_offset']

        # Tracks whether this span already produced its B- label, so that the
        # first labelled word is always B- even when the span starts on
        # whitespace or on a skipped punctuation token.
        span_opened = False
        for i, (word, word_start, word_end) in enumerate(words_with_offsets):
            if punctuation_only.match(word):
                continue
            # Half-open interval overlap: a word that starts exactly at
            # entity_end lies outside the span and must not be labelled.
            if word_start < entity_end and word_end > entity_start:
                prefix = 'I' if span_opened else 'B'
                word_labels[i] = f'{prefix}-{target_entity}'
                span_opened = True

    return word_labels

# Preprocess Doccano Data

In [None]:
# Dataset to process. `file_name` is also used to build the input path below.
file_name = '4800_axiomatic_dataset'
#file_name = '1200_agreement_MC'

file_path = f'/home/fantoni/patent-sentence-classification/data/{file_name}.jsonl'
label_to_int = {'FUN': 0, 'STR': 1, 'MIX': 2, 'OTH': 3}

# Word-level entity types; each one yields a "<type>_labels" column.
entity_types = ['D', 'A', 'R', 'P', 'AX']

# Use BertTokenizerFast.
# Only the patent-specific checkpoint is loaded: the earlier
# 'bert-large-uncased' load was immediately overwritten by this one.
# NOTE(review): the tokenizer is not used in this cell - confirm it is needed.
bert_tokenizer = BertTokenizerFast.from_pretrained('anferico/bert-for-patents')

data = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Tolerate blank lines (e.g. a trailing newline at the end of the export).
        if not line.strip():
            continue

        json_obj = json.loads(line)
        # The text field is "<sentence>\t<sent_id>". Split on the LAST tab so a
        # tab inside the sentence does not break the unpacking.
        sent, sent_id = json_obj["text"].rsplit('\t', 1)
        entities = sorted(json_obj["entities"], key=lambda x: x['start_offset'])

        # Extract the sentence-level tag; exactly one is required per sentence.
        sent_tags = [e['label'] for e in entities if e['label'] in label_to_int]
        if len(sent_tags) != 1:
            raise ValueError(
                f"Expected exactly one sentence tag for sent_id: {sent_id}, found {len(sent_tags)}"
            )
        sent_tag = sent_tags[0]
        sent_class = label_to_int[sent_tag]  # numeric class

        # 1. Extract the words (with character offsets) from the sentence
        words_with_offsets = split_into_words_with_regex(sent)
        words = [w for w, start, end in words_with_offsets]

        # 2. Assign labels to words, one BIO sequence per entity type
        labels_by_entity = {
            entity_type: assign_entity_labels(words_with_offsets, entities, target_entity=entity_type)
            for entity_type in entity_types
        }

        # Ensure every label sequence has one label per word
        assert all(len(labels) == len(words) for labels in labels_by_entity.values()), (
            f"Mismatch in the number of words and labels for sent_id: {sent_id}"
        )

        # Store data (words joined by '<w>', labels joined by ',')
        record = {
            "sent_id": sent_id,
            "sent": sent,
            "sent_tag": sent_tag,
            "sent_class": sent_class,
            "words": '<w>'.join(words),
        }
        for entity_type in entity_types:
            record[f"{entity_type}_labels"] = ','.join(labels_by_entity[entity_type])
        data.append(record)

df = pd.DataFrame(data)
df



Unnamed: 0,sent_id,sent,sent_tag,sent_class,words,D_labels,A_labels,R_labels,P_labels,AX_labels
0,173113,The drainage lumen 140 may be pressurized to a...,FUN,0,the<w>drainage<w>lumen<w>140<w>may<w>be<w>pres...,"O,O,O,O,O,O,O,O,O,B-D,I-D,O,O,O,O,O,O","O,O,O,O,O,O,B-A,O,O,O,O,O,O,O,O,O,O","O,B-R,I-R,O,O,O,O,O,O,O,O,O,O,O,O,O,O","O,O,O,O,O,O,O,O,O,B-P,I-P,O,O,O,O,O,O","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
1,1480549,"When the cover 34 vibrates, an internal pressu...",FUN,0,"when<w>the<w>cover<w>34<w>vibrates<w>,<w>an<w>...","O,O,B-D,O,O,O,O,B-D,I-D,I-D,I-D,I-D,I-D,O,O,O,...","O,O,O,O,B-A,O,O,O,O,O,O,O,O,O,B-A,O,O,O,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,O,B-P,I-P,I-P,I-P,I-P,I-P,O,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-AX,I-AX,O,O,O,..."
2,79463,Rotation of the proximal drive shaft segment 3...,STR,1,rotation<w>of<w>the<w>proximal<w>drive<w>shaft...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","B-P,I-P,I-P,I-P,I-P,I-P,I-P,O,O,O,B-P,I-P,I-P,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,1383668,Heavy oviposition into stems can cause death o...,OTH,3,heavy<w>oviposition<w>into<w>stems<w>can<w>cau...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
4,1567249,In one embodiment there is provided the looped...,STR,1,in<w>one<w>embodiment<w>there<w>is<w>provided<...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
...,...,...,...,...,...,...,...,...,...,...
4795,2627502,The truth table shown in figure 7 summarizes t...,FUN,0,the<w>truth<w>table<w>shown<w>in<w>figure<w>7<...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-A,O","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-R,O,O,O","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
4796,1207229,If the authentication server node 618 and the ...,FUN,0,if<w>the<w>authentication<w>server<w>node<w>61...,"O,O,B-D,I-D,I-D,O,O,O,B-D,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,B-A,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,O,O,B-R,I-R,I-R,O,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4797,1074032,"In the second aspect, the surfactant is prefer...",FUN,0,"in<w>the<w>second<w>aspect<w>,<w>the<w>surfact...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,O,O,O,B-A,O,O,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,B-R,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,O,B-P,I-P,I-P,I-P,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4798,2826149,figure 4 shows an exemplary source device comm...,FUN,0,figure<w>4<w>shows<w>an<w>exemplary<w>source<w...,"O,O,O,O,O,B-D,I-D,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,O,B-A,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [14]:
from tabulate import tabulate

# Compare the D labels of two preprocessing variants row by row and print the
# full token/label listing of every sentence where they disagree.
# NOTE(review): `df_regex` and `df_regex_no_punct` are not defined in the
# visible cells - presumably two runs of the preprocessing cell (with and
# without the punctuation skip); confirm before re-running on a fresh kernel.
headers = ["Sentence ID", "Word", "D_labels (Original)", "D_labels (No Punct)"]

# Rows of [sentence_id, token, original_label, no_punct_label]
table = []

for i in range(len(df_regex)):
    sentence_id = df_regex.loc[i, 'sent_id']
    tokens = df_regex.loc[i, 'words'].split('<w>')
    labels1 = df_regex.loc[i, 'D_labels'].split(',')
    labels2 = df_regex_no_punct.loc[i, 'D_labels'].split(',')

    # Skip sentences whose (pairwise-aligned) labels are all identical
    if all(l1 == l2 for l1, l2 in zip(labels1, labels2)):
        continue

    table.extend(
        [sentence_id, token, l1, l2]
        for token, l1, l2 in zip(tokens, labels1, labels2)
    )

# Print only sentences with differences
print(tabulate(table, headers=headers, tablefmt="pretty"))


+-------------+---------------------+---------------------+---------------------+
| Sentence ID |        Word         | D_labels (Original) | D_labels (No Punct) |
+-------------+---------------------+---------------------+---------------------+
|   1550861   |         in          |          O          |          O          |
|   1550861   |        some         |          O          |          O          |
|   1550861   |     embodiments     |          O          |          O          |
|   1550861   |          ,          |          O          |          O          |
|   1550861   |         the         |          O          |          O          |
|   1550861   |       release       |          O          |          O          |
|   1550861   |         of          |          O          |          O          |
|   1550861   |         the         |          O          |          O          |
|   1550861   |     formulation     |          O          |          O          |
|   1550861   | 

# Sanity Check

In [4]:
# Sanity check on the preprocessed DataFrame: duplicates and missing values.

# Duplicates: rows sharing both 'sent_id' and 'sent' (all occurrences are kept)
duplicate_mask = df.duplicated(subset=['sent_id', 'sent'], keep=False)
duplicates = df[duplicate_mask]
num_duplicates = duplicates.shape[0]
if num_duplicates == 0:
    print('No duplicates found.')
else:
    print("Duplicate entries:")
    print(duplicates)

# Missing values: per-column counts, plus the affected rows if there are any
null_mask = df.isnull()
missing_values = null_mask.sum()
if missing_values.sum() == 0:
    print("\nNo missing values found.")
else:
    print("\nMissing values in each column:")
    print(missing_values)
    print("\nRows with missing values:")
    print(df[null_mask.any(axis=1)])

No duplicates found.

No missing values found.


In [5]:
# Class label distribution (absolute count and percentage per sentence tag).
# ---> The Dataset is NOT BALANCED!
tag_counts = df['sent_tag'].value_counts()
tag_percentages = (df['sent_tag'].value_counts(normalize=True) * 100).round(2)
result = tag_counts.to_frame(name='count').assign(**{'%': tag_percentages})
print(result)

          count      %
sent_tag              
STR        2224  46.33
FUN        1810  37.71
MIX         432   9.00
OTH         334   6.96


# Save Data

In [6]:
# Persist the preprocessed DataFrame as an Excel file next to the input data.
# The row index is not written.
output_file = f'/home/fantoni/patent-sentence-classification/data/{file_name}_tokenizer.xlsx'
df.to_excel(excel_writer=output_file, index=False)
print(f"\nDataFrame saved to: {output_file}")


DataFrame saved to: /home/fantoni/patent-sentence-classification/data/4800_axiomatic_dataset_tokenizer.xlsx


# Merge Agreement

In [24]:
# Columns that exist in both annotators' files and therefore need an
# annotator-specific suffix before the two frames are merged.
shared_columns = ['sent_tag', 'sent_class', 'words', 'D_labels', 'A_labels', 'R_labels', 'P_labels', 'AX_labels']

# Import MC (keeps the 'sent' column)
MC_rename_dict = {column: f'{column}_mc' for column in shared_columns}
MC_df = pd.read_excel('/home/fantoni/patent-sentence-classification/data/1200_agreement_MC.xlsx')
MC_df = MC_df.rename(columns=MC_rename_dict)

# Import ML ('sent' is dropped so it is not duplicated after the merge)
ML_rename_dict = {column: f'{column}_ml' for column in shared_columns}
ML_df = pd.read_excel('/home/fantoni/patent-sentence-classification/data/1200_agreement_ML.xlsx')
ML_df = ML_df.rename(columns=ML_rename_dict).drop(columns='sent')

# Merge Agreement: keep only sentences present in both files and flag the rows
# where the two sentence classes are equal.
merged_df = MC_df.merge(ML_df, on="sent_id", how="inner")
merged_df['agreement'] = merged_df['sent_class_mc'] == merged_df['sent_class_ml']

# Save
merged_df.to_excel('/home/fantoni/patent-sentence-classification/data/1200_agreement_All.xlsx', index= False)