In [1]:
import json
import re
import pandas as pd
import random

In [3]:
# Load the annotated sentences from the JSONL export (one JSON object per line)
file_path = '/home/fantoni/patent-sentence-classification/data/6000_axiomatic_dataset.jsonl'
sentence_tags = ('FUN', 'STR', 'MIX', 'OTH')
data = []
# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of relying
# on the platform-dependent default of open()
with open(file_path, 'r', encoding='utf-8') as file:
    for line_no, line in enumerate(file, start=1):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # crashing inside json.loads
            continue
        json_obj = json.loads(line)
        # 'text' holds "<sentence>\t<sentence id>". Split on the last tab only,
        # so a tab inside the sentence does not break the two-value unpacking.
        try:
            sent, sent_id = json_obj['text'].rsplit('\t', 1)
        except ValueError:
            raise ValueError(
                f"{file_path}, line {line_no}: 'text' has no tab-separated sentence id"
            ) from None
        # Get Sentence Tag: one row is recorded per entity whose label is a sentence tag
        for entity in json_obj['entities']:
            if entity['label'] in sentence_tags:
                sent_tag = entity['label']
                data.append({'sent_id': sent_id, 'sent': sent, 'sent_tag': sent_tag})

# Explicit columns keep the frame (and the checks below) usable even if no row was collected
df = pd.DataFrame(data, columns=['sent_id', 'sent', 'sent_tag'])

# Create numeric class label
label_to_int = {'FUN': 0, 'STR': 1, 'MIX': 2, 'OTH': 3}
df['sent_class'] = df['sent_tag'].map(label_to_int)

# Check for duplicates based on `sent_id` and `sent`
# keep=False flags every member of a duplicate group, not only the repeats
duplicates = df[df.duplicated(subset=['sent_id', 'sent'], keep=False)]
num_duplicates = duplicates.shape[0]
# Optionally, display the duplicate rows
if num_duplicates > 0:
    print("Duplicate entries:")
    print(duplicates)
else:
    print('No duplicates found.')

# Check for missing values
missing_values = df.isnull().sum()
if missing_values.sum() > 0:
    print("\nMissing values in each column:")
    print(missing_values)
    print("\nRows with missing values:")
    print(df[df.isnull().any(axis=1)])
else:
    print("\nNo missing values found.")

# Save Dataframe
output_file = '/home/fantoni/patent-sentence-classification/data/6000_axiomatic_dataset.xlsx'
df.to_excel(output_file, index=False)
print(f"\nDataFrame saved to {output_file}")

No duplicates found.

No missing values found.

DataFrame saved to /home/fantoni/patent-sentence-classification/data/6000_axiomatic_dataset.xlsx


In [4]:
# Show how many sentences carry each sentence tag
tag_counts = df['sent_tag'].value_counts()
print(tag_counts)

sent_tag
STR    2759
FUN    2222
MIX     608
OTH     411
Name: count, dtype: int64


In [6]:
def random_split(df: pd.DataFrame, train_ratio: float, eval_ratio: float, seed: int):
    """Shuffle ``df`` reproducibly and split it into train / eval / test partitions.

    Args:
        df: Frame to split. It is not modified.
        train_ratio: Fraction of rows assigned to the train set.
        eval_ratio: Fraction of rows assigned to the eval set. Whatever is
            left after train and eval goes to the test set.
        seed: Seed used for the shuffle, so the same seed yields the same split.

    Returns:
        A ``(train_set, eval_set, test_set)`` tuple of DataFrames taken from the
        shuffled copy of ``df`` (index reset to 0..n-1 after shuffling).

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more than 1.
    """
    if train_ratio < 0 or eval_ratio < 0:
        raise ValueError("train_ratio and eval_ratio must be non-negative")
    # Small tolerance so ratios that sum to 1 up to float rounding are accepted
    if train_ratio + eval_ratio > 1 + 1e-9:
        raise ValueError("train_ratio + eval_ratio must not exceed 1")

    # Kept for backward compatibility: the original also seeded the stdlib RNG.
    # The shuffle below is driven by `random_state`, not by this call.
    random.seed(seed)
    # Shuffle the entire DataFrame
    shuffled_df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    # Calculate split indices (int() truncates, so rounding leftovers land in the test set)
    train_end = int(len(df) * train_ratio)
    validation_end = train_end + int(len(df) * eval_ratio)
    # Split the *shuffled* frame; slicing the unshuffled `df` here would make the
    # split follow the input order and ignore the seed entirely
    train_set = shuffled_df.iloc[:train_end]
    eval_set = shuffled_df.iloc[train_end:validation_end]
    test_set = shuffled_df.iloc[validation_end:]
    return train_set, eval_set, test_set

# Split parameters: 70% train, 20% eval, the remainder becomes the test set
train_ratio, eval_ratio, seed = 0.7, 0.2, 1999
train_set, eval_set, test_set = random_split(
    df,
    train_ratio=train_ratio,
    eval_ratio=eval_ratio,
    seed=seed,
)

# Write each partition to its own Excel workbook (train, then eval, then test)
for split_path, split_df in (
    ("/home/fantoni/patent-sentence-classification/data/train.xlsx", train_set),
    ("/home/fantoni/patent-sentence-classification/data/eval.xlsx", eval_set),
    ("/home/fantoni/patent-sentence-classification/data/test.xlsx", test_set),
):
    split_df.to_excel(split_path, index=False)
print("Data successfully split and saved.")

Data successfully split and saved.
