In [22]:
import json
import re
import pandas as pd

# Sanity Check

In [None]:
# Load the annotated JSONL export, flatten it to one row per sentence-level tag,
# run basic sanity checks (duplicates, missing values) and save the result to Excel.
file_name = '1200_agreement_ML'
file_path = f'/home/fantoni/patent-sentence-classification/data/{file_name}.jsonl'

# Sentence-level tags and their numeric class ids. This mapping is also the
# filter used below: entities carrying any other label are ignored.
label_to_int = {'FUN': 0, 'STR': 1, 'MIX': 2, 'OTH': 3}

data = []
# Pin the encoding so reading does not depend on the platform's locale default.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Tolerate blank lines (e.g. a trailing newline at the end of the export).
        if not line.strip():
            continue
        json_obj = json.loads(line)
        # 'text' holds "<sentence>\t<sentence id>". Split on the LAST tab only, so a
        # tab inside the sentence itself does not break the two-way unpacking.
        sent, sent_id = json_obj['text'].rsplit('\t', 1)
        # Get Sentence Tag: one row is emitted per matching entity. A record with
        # no sentence-level tag contributes no row at all.
        for entity in json_obj['entities']:
            if entity['label'] in label_to_int:
                sent_tag = entity['label']
                data.append({'sent_id': sent_id, 'sent': sent, 'sent_tag': sent_tag})

# Explicit columns keep the frame well-formed even if no record matched.
df = pd.DataFrame(data, columns=['sent_id', 'sent', 'sent_tag'])

# Create numeric class label
df['sent_class'] = df['sent_tag'].map(label_to_int)

# Check for duplicates based on `sent_id` and `sent`
# (keep=False marks every member of a duplicated group, not only the repeats).
duplicates = df[df.duplicated(subset=['sent_id', 'sent'], keep=False)]
num_duplicates = duplicates.shape[0]
# Optionally, display the duplicate rows
if num_duplicates > 0:
    print("Duplicate entries:")
    print(duplicates)
else:
    print('No duplicates found.')

# Check for missing values
missing_values = df.isnull().sum()
if missing_values.sum() > 0:
    print("\nMissing values in each column:")
    print(missing_values)
    print("\nRows with missing values:")
    print(df[df.isnull().any(axis=1)])
else:
    print("\nNo missing values found.")

# Save Dataframe
output_file = f'/home/fantoni/patent-sentence-classification/data/{file_name}.xlsx'
df.to_excel(output_file, index=False)
print(f"\nDataset Size: {len(df)}")
print(f"\nDataFrame saved to {output_file}")

No duplicates found.

No missing values found.

Dataset Size: 1200

DataFrame saved to /home/fantoni/patent-sentence-classification/data/1200_agreement_ML.xlsx


In [24]:
# Class label distribution: absolute count and percentage share per sentence tag.
# ---> The Dataset is NOT BALANCED!
tag_column = df['sent_tag']
result = tag_column.value_counts().to_frame(name='count').assign(
    # Percentages are aligned to the counts by tag and rounded to two decimals.
    **{'%': tag_column.value_counts(normalize=True).mul(100).round(2)}
)
print(result)

          count      %
sent_tag              
STR         559  46.58
FUN         411  34.25
MIX         119   9.92
OTH         111   9.25


# Merge Agreement

In [None]:
# Load the MC annotations; suffix the label columns so they stay distinguishable after the merge.
MC_df = (
    pd.read_excel('/home/fantoni/patent-sentence-classification/data/1200_agreement_MC.xlsx')
    .rename(columns={'sent_tag': 'sent_tag_mc', 'sent_class': 'sent_class_mc'})
)

# Load the ML annotations; drop 'sent' so the merged frame keeps only the MC copy of that column.
ML_df = (
    pd.read_excel('/home/fantoni/patent-sentence-classification/data/1200_agreement_ML.xlsx')
    .rename(columns={'sent_tag': 'sent_tag_ml', 'sent_class': 'sent_class_ml'})
    .drop(columns='sent')
)

# Inner join on the sentence id, then flag the rows where both numeric classes are equal.
merged_df = MC_df.merge(ML_df, on="sent_id", how="inner")
merged_df['Agreement'] = merged_df['sent_class_mc'].eq(merged_df['sent_class_ml'])

# Persist the combined table.
merged_df.to_excel('/home/fantoni/patent-sentence-classification/data/1200_agreement_All.xlsx', index=False)