In [20]:
import pandas as pd
import random

df = pd.read_excel('/home/fantoni/patent-sentence-classification/data/6000_axiomatic_dataset.xlsx')

# Random Splitting

In [15]:
def random_split(df, train_ratio=0.7, eval_ratio=0.2, seed=1999):
    random.seed(seed)
    # Shuffle the entire DataFramme
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    
    # Calculate split indices
    train_end = int(len(df) * train_ratio)
    validation_end = train_end + int(len(df) * eval_ratio)
    
    # Split the DataFrame
    train_set = df[:train_end]
    eval_set = df[train_end:validation_end]
    test_set = df[validation_end:]
    
    return train_set, eval_set, test_set

train_set, eval_set, test_set = random_split(df)
train_set.to_excel(f"/home/fantoni/patent-sentence-classification/data/train.xlsx", index=False)
eval_set.to_excel(f"/home/fantoni/patent-sentence-classification/data/eval.xlsx", index=False)
test_set.to_excel(f"/home/fantoni/patent-sentence-classification/data/test.xlsx", index=False)
print("Data successfully split and saved with random sampling.")

Data successfully split and saved with random sampling.


In [18]:
# Compute proportions
train_dist = train_set['sent_class'].value_counts(normalize=True).mul(100).round(2).rename('Train %')
eval_dist = eval_set['sent_class'].value_counts(normalize=True).mul(100).round(2).rename('Eval %')
test_dist = test_set['sent_class'].value_counts(normalize=True).mul(100).round(2).rename('Test %')

# Compute counts
train_count = train_set['sent_class'].value_counts().rename('Train Count')
eval_count = eval_set['sent_class'].value_counts().rename('Eval Count')
test_count = test_set['sent_class'].value_counts().rename('Test Count')

# Visualize distribution of the classes for random splitting
class_distributions = pd.concat([train_dist, eval_dist, test_dist, train_count, eval_count, test_count], axis=1)
class_distributions

Unnamed: 0_level_0,Train %,Eval %,Test %,Train Count,Eval Count,Test Count
sent_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,46.55,46.0,42.0,1955,552,252
0,36.48,37.42,40.17,1532,449,241
2,10.07,10.25,10.33,423,123,62
3,6.9,6.33,7.5,290,76,45


# Stratified Splitting

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

def stratified_split(df, train_ratio=0.7, eval_ratio=0.2, seed=1999, class_column = "sent_class"):

    # Ensure the target column is present
    if class_column not in df.columns:
        raise ValueError(f"DataFrame must contain a '{class_column}' column for stratification.")

    # First, split into train and temp (eval + test)
    train_set, temp_set = train_test_split(
        df, test_size=(1 - train_ratio), stratify=df[class_column], random_state=seed
    )

    # Determine the proportion of the remaining data that should go to validation
    eval_ratio_adjusted = eval_ratio / (1 - train_ratio)  # Adjusted proportion

    # Split the temp set into validation and test sets
    eval_set, test_set = train_test_split(
        temp_set, test_size=(1 - eval_ratio_adjusted), stratify=temp_set[class_column], random_state=seed
    )

    return train_set, eval_set, test_set

train_set, eval_set, test_set = stratified_split(df)
#train_set.to_excel("/home/fantoni/patent-sentence-classification/data/train_rnd.xlsx", index=False)
#eval_set.to_excel("/home/fantoni/patent-sentence-classification/data/eval_rnd.xlsx", index=False)
#test_set.to_excel("/home/fantoni/patent-sentence-classification/data/test_rdn.xlsx", index=False)
print("Data successfully split and saved with stratified sampling.")


Data successfully split and saved with stratified sampling.


In [22]:
# Compute proportions
train_dist = train_set['sent_class'].value_counts(normalize=True).mul(100).round(2).rename('Train %')
eval_dist = eval_set['sent_class'].value_counts(normalize=True).mul(100).round(2).rename('Eval %')
test_dist = test_set['sent_class'].value_counts(normalize=True).mul(100).round(2).rename('Test %')

# Compute counts
train_count = train_set['sent_class'].value_counts().rename('Train Count')
eval_count = eval_set['sent_class'].value_counts().rename('Eval Count')
test_count = test_set['sent_class'].value_counts().rename('Test Count')

# Visualize distribution of the classes for stratified splitting
class_distributions = pd.concat([train_dist, eval_dist, test_dist, train_count, eval_count, test_count], axis=1)
class_distributions

Unnamed: 0_level_0,Train %,Eval %,Test %,Train Count,Eval Count,Test Count
sent_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,45.99,46.0,45.92,1931,552,276
0,37.03,37.0,37.1,1555,444,223
2,10.12,10.17,10.15,425,122,61
3,6.86,6.83,6.82,288,82,41
