In [1]:
import json
import re
import pandas as pd

In [2]:
# Read the JSONL export line by line (one JSON object per line) and build one
# row per sentence-level tag.
# Each record's 'text' field is split on a tab into the sentence and its id;
# the sentence class is taken from any entity whose label is a sentence tag.
file_path = '/home/fantoni/patent-sentence-classification/data/6000_axiomatic_dataset.jsonl'
sentence_tags = ('FUN', 'STR', 'MIX', 'OTH')  # entity labels that classify the whole sentence
data = []
# Explicit encoding: without it open() uses the locale's preferred encoding,
# so the same file could decode differently (or fail) on another machine.
with open(file_path, 'r', encoding='utf-8') as file:
    for line_no, line in enumerate(file, start=1):
        # Skip blank lines (e.g. a trailing newline at the end of the file);
        # json.loads would raise on them.
        if not line.strip():
            continue
        json_obj = json.loads(line)
        fields = json_obj['text'].split('\t')
        if len(fields) != 2:
            # Same exception type as the failed tuple-unpacking it replaces,
            # but pointing at the offending record.
            raise ValueError(
                f"{file_path}, line {line_no}: expected exactly one tab in 'text' "
                f"(sentence<TAB>sentence id), found {len(fields) - 1}"
            )
        sent, sent_id = fields
        # Get Sentence Tag.
        # NOTE: a record yields one row per matching entity, and no row at all
        # when none of its entities carries a sentence tag.
        for entity in json_obj['entities']:
            if entity['label'] in sentence_tags:
                sent_tag = entity['label']
                data.append({'sent_id': sent_id, 'sent': sent, 'sent_tag': sent_tag})

# Naming the columns keeps the expected schema even if no row was collected,
# so later column lookups do not fail with a KeyError on an empty frame.
df = pd.DataFrame(data, columns=['sent_id', 'sent', 'sent_tag'])

# Encode each sentence tag as an integer class id, numbered in the order listed here
label_to_int = {tag: idx for idx, tag in enumerate(('FUN', 'STR', 'MIX', 'OTH'))}
# Tags absent from the mapping would come out as NaN in the new column
df['sent_class'] = df['sent_tag'].map(label_to_int)

# Flag every row whose (`sent_id`, `sent`) pair occurs more than once;
# keep=False marks all members of a duplicate group, not just the repeats.
dup_mask = df.duplicated(subset=['sent_id', 'sent'], keep=False)
duplicates = df[dup_mask]
num_duplicates = len(duplicates)
# Report the outcome, listing the offending rows when there are any
if num_duplicates == 0:
    print('No duplicates found.')
else:
    print("Duplicate entries:")
    print(duplicates)

# Count missing cells per column, computing the null mask only once
null_mask = df.isna()
missing_values = null_mask.sum()
if not missing_values.any():
    print("\nNo missing values found.")
else:
    # Show the per-column counts first, then the rows that contain a gap
    print("\nMissing values in each column:")
    print(missing_values)
    print("\nRows with missing values:")
    print(df[null_mask.any(axis=1)])

# Persist the DataFrame as an Excel workbook, leaving out the row index
output_file = '/home/fantoni/patent-sentence-classification/data/6000_axiomatic_dataset.xlsx'
df.to_excel(excel_writer=output_file, index=False)
print(f"\nDataFrame saved to {output_file}")

No duplicates found.

No missing values found.

DataFrame saved to /home/fantoni/patent-sentence-classification/data/6000_axiomatic_dataset.xlsx


In [5]:
# The dataset is NOT balanced: tabulate the class label distribution,
# showing absolute counts next to the percentage share of each tag.
tag_counts = df['sent_tag'].value_counts()
tag_share = df['sent_tag'].value_counts(normalize=True)
result = tag_counts.to_frame(name='count')
result['%'] = (tag_share * 100).round(2)
print(result)

          count      %
sent_tag              
STR        2759  45.98
FUN        2222  37.03
MIX         608  10.13
OTH         411   6.85
