In [1]:
import pandas as pd
import random

# Random Splitting

In [15]:
def random_split(df, train_ratio=0.7, eval_ratio=0.2, seed=1999):
    random.seed(seed)
    # Shuffle the entire DataFramme
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    
    # Calculate split indices
    train_end = int(len(df) * train_ratio)
    validation_end = train_end + int(len(df) * eval_ratio)
    
    # Split the DataFrame
    train_set = df[:train_end]
    eval_set = df[train_end:validation_end]
    test_set = df[validation_end:]
    
    return train_set, eval_set, test_set

df = pd.read_excel('/home/fantoni/patent-sentence-classification/data/6000_axiomatic_dataset.xlsx')

train_set, eval_set, test_set = random_split(df)
train_set.to_excel(f"/home/fantoni/patent-sentence-classification/data/train_rnd.xlsx", index=False)
eval_set.to_excel(f"/home/fantoni/patent-sentence-classification/data/eval_rnd.xlsx", index=False)
test_set.to_excel(f"/home/fantoni/patent-sentence-classification/data/test_rnd.xlsx", index=False)
print("Data successfully split and saved with random sampling.")

Data successfully split and saved with random sampling.


In [None]:
# Compute proportions
train_dist = train_set['sent_tag'].value_counts(normalize=True).mul(100).round(2).rename('Train %')
eval_dist = eval_set['sent_tag'].value_counts(normalize=True).mul(100).round(2).rename('Eval %')
test_dist = test_set['sent_tag'].value_counts(normalize=True).mul(100).round(2).rename('Test %')

# Compute counts
train_count = train_set['sent_tag'].value_counts().rename('Train Count')
eval_count = eval_set['sent_tag'].value_counts().rename('Eval Count')
test_count = test_set['sent_tag'].value_counts().rename('Test Count')

# Visualize distribution of the classes for random splitting
class_distributions = pd.concat([train_dist, eval_dist, test_dist, train_count, eval_count, test_count], axis=1)
class_distributions

Unnamed: 0_level_0,Train %,Eval %,Test %,Train Count,Eval Count,Test Count
sent_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,46.55,46.0,42.0,1955,552,252
0,36.48,37.42,40.17,1532,449,241
2,10.07,10.25,10.33,423,123,62
3,6.9,6.33,7.5,290,76,45


# Stratified Splitting

In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd

def stratified_split(df, train_ratio=0.7, eval_ratio=0.2, seed=1999, class_column = "sent_class"):

    # Ensure the target column is present
    if class_column not in df.columns:
        raise ValueError(f"DataFrame must contain a '{class_column}' column for stratification.")

    # First, split into train and temp (eval + test)
    train_set, temp_set = train_test_split(
        df, test_size=(1 - train_ratio), stratify=df[class_column], random_state=seed
    )

    # Determine the proportion of the remaining data that should go to validation
    eval_ratio_adjusted = eval_ratio / (1 - train_ratio)  # Adjusted proportion

    # Split the temp set into validation and test sets
    eval_set, test_set = train_test_split(
        temp_set, test_size=(1 - eval_ratio_adjusted), stratify=temp_set[class_column], random_state=seed
    )

    return train_set, eval_set, test_set

df = pd.read_excel('/home/fantoni/patent-sentence-classification/data/6000_axiomatic_dataset.xlsx')

train_set, eval_set, test_set = stratified_split(df)
train_set.to_excel("/home/fantoni/patent-sentence-classification/data/train_stratified.xlsx", index=False)
eval_set.to_excel("/home/fantoni/patent-sentence-classification/data/eval_stratified.xlsx", index=False)
test_set.to_excel("/home/fantoni/patent-sentence-classification/data/test_stratified.xlsx", index=False)
print("Data successfully split and saved with stratified sampling.")
print(len(train_set))

Data successfully split and saved with stratified sampling.
4199


In [None]:
# Compute proportions
train_dist = train_set['sent_tag'].value_counts(normalize=True).mul(100).round(2).rename('Train %')
eval_dist = eval_set['sent_tag'].value_counts(normalize=True).mul(100).round(2).rename('Eval %')
test_dist = test_set['sent_tag'].value_counts(normalize=True).mul(100).round(2).rename('Test %')

# Compute counts
train_count = train_set['sent_tag'].value_counts().rename('Train Count')
eval_count = eval_set['sent_tag'].value_counts().rename('Eval Count')
test_count = test_set['sent_tag'].value_counts().rename('Test Count')

# Visualize distribution of the classes for stratified splitting
class_distributions = pd.concat([train_dist, eval_dist, test_dist, train_count, eval_count, test_count], axis=1)
class_distributions

Unnamed: 0_level_0,Train %,Eval %,Test %,Train Count,Eval Count,Test Count
sent_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,45.99,46.0,45.92,1931,552,276
0,37.03,37.0,37.1,1555,444,223
2,10.12,10.17,10.15,425,122,61
3,6.86,6.83,6.82,288,82,41


# Agreement Splitting

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Import data
df_4800 = pd.read_excel('/home/fantoni/patent-sentence-classification/data/4800_axiomatic_dataset.xlsx')
test_set = pd.read_excel('/home/fantoni/patent-sentence-classification/data/1200_agreement_MC.xlsx')

# DEPRECATED: Get 600 for test set from 1200 of agreement
#temp_set, test_set = train_test_split(df_1200, test_size=600, stratify=df_1200['sent_class'], random_state=1999)
# Concatenate the other 600 to the 4,800 = 5,400
#train_eval_df = pd.concat([df_4800, temp_set])

# Split into train and valid set
#train_set, eval_set = train_test_split(df_4800, train_size=0.8, stratify=df_4800['sent_class'], random_state=1999) # stratified
train_set, eval_set = train_test_split(df_4800, train_size=0.8, random_state=1999)
#train_set.to_excel("/home/fantoni/patent-sentence-classification/data/train_agreement.xlsx", index=False)
#eval_set.to_excel("/home/fantoni/patent-sentence-classification/data/eval_agreement.xlsx", index=False)
#test_set.to_excel("/home/fantoni/patent-sentence-classification/data/test_agreement.xlsx", index=False)
print("Data successfully split and saved with agreement sampling.")

Data successfully split and saved with agreement sampling.


In [13]:
# Compute proportions
train_dist = train_set['sent_tag'].value_counts(normalize=True).mul(100).round(2).rename('Train %')
eval_dist = eval_set['sent_tag'].value_counts(normalize=True).mul(100).round(2).rename('Eval %')
test_dist = test_set['sent_tag'].value_counts(normalize=True).mul(100).round(2).rename('Test %')

# Compute counts
train_count = train_set['sent_tag'].value_counts().rename('Train Count')
eval_count = eval_set['sent_tag'].value_counts().rename('Eval Count')
test_count = test_set['sent_tag'].value_counts().rename('Test Count')

# Visualize distribution of the classes for stratified splitting
class_distributions = pd.concat([train_dist, eval_dist, test_dist, train_count, eval_count, test_count], axis=1)
class_distributions

Unnamed: 0_level_0,Train %,Eval %,Test %,Train Count,Eval Count,Test Count
sent_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
STR,46.15,47.08,44.58,1772,452,535
FUN,37.86,37.08,34.33,1454,356,412
MIX,9.04,8.85,14.67,347,85,176
OTH,6.95,6.98,6.42,267,67,77


In [14]:
# Get distribution of agreement over the test set
df_agreement = pd.read_excel('/home/fantoni/patent-sentence-classification/data/1200_agreement_All.xlsx', usecols=['sent_id', 'agreement'])
test_set = pd.merge(test_set, df_agreement, on="sent_id", how="left")
print(f'Test size: {len(test_set)}')
test_set['agreement'].value_counts()

Test size: 1200


agreement
True     893
False    307
Name: count, dtype: int64

# Incremental Splitting with K-fold

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import pandas as pd

# Import data
df_4800 = pd.read_excel('/home/fantoni/patent-sentence-classification/data/4800_axiomatic_dataset.xlsx') # train + valid
test_set = pd.read_excel('/home/fantoni/patent-sentence-classification/data/1200_agreement_MC.xlsx') # test

# Generate samples
start = 480  
end = len(df_4800)  
step = 480
samples = list(range(start, len(df_4800), step)) 

# Ensure the last number is exactly 'end'
if samples[-1] != end:  
    samples.append(end)

# Create dictionary 
samples_dict = {i + 1: sample for i, sample in enumerate(samples)}
print(samples_dict)

# Generate seed and splits
k = 10 # set the number of splits
seeds = list(range(1, k+1))
k_splits = list(range(1, k+1))
print(f'seeds: {seeds}')
print(f'k_splits: {k_splits}')

{1: 480, 2: 960, 3: 1440, 4: 1920, 5: 2400, 6: 2880, 7: 3360, 8: 3840, 9: 4320, 10: 4800}
seeds: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
k_splits: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [4]:
# Generate incremental training and validation sets
for idx, n in samples_dict.items():
    if n == len(df_4800):
        for k, seed in zip(k_splits, seeds):
            #train_set, eval_set = train_test_split(df_4800, train_size=0.8, stratify=df_4800["sent_class"], random_state=seed) # stratified
            train_set, eval_set = train_test_split(df_4800, train_size=0.8, random_state=seed)
            train_set.to_excel(f"/home/fantoni/patent-sentence-classification/data/incremental/train_{idx}_{k}.xlsx", index=False)
            eval_set.to_excel(f"/home/fantoni/patent-sentence-classification/data/incremental/eval_{idx}_{k}.xlsx", index=False)
    else:
        for k, seed in zip(k_splits, seeds):
            # Get a sample of 'n_sample' from 4800 stratified on 'sent_class' with variable 'random_state'
            #train_eval_set, _ = train_test_split(df_4800, train_size=n, stratify=df_4800["sent_class"], random_state=seed) # stratified
            train_eval_set, _ = train_test_split(df_4800, train_size=n, random_state=seed)

            # Split the sample into train and validation set using fixed 'train_size' = 0.7 and fixed 'random_state' = 1999
            #train_set, eval_set = train_test_split(train_eval_set, train_size=0.8, stratify=train_eval_set["sent_class"], random_state=1999) # stratified
            train_set, eval_set = train_test_split(train_eval_set, train_size=0.8, random_state=1999)

            # Save train and validation sets
            train_set.to_excel(f"/home/fantoni/patent-sentence-classification/data/incremental/train_{idx}_{k}.xlsx", index=False)
            eval_set.to_excel(f"/home/fantoni/patent-sentence-classification/data/incremental/eval_{idx}_{k}.xlsx", index=False)

# Generate fixed test set
test_set.to_excel("/home/fantoni/patent-sentence-classification/data/test_agreement.xlsx", index=False)

In [5]:
import pandas as pd

# Import data
train_set = pd.read_excel('/home/fantoni/patent-sentence-classification/data/incremental/train_10_1.xlsx')
eval_set = pd.read_excel('/home/fantoni/patent-sentence-classification/data/incremental/eval_10_1.xlsx')
test_set = pd.read_excel('/home/fantoni/patent-sentence-classification/data/1200_agreement_MC.xlsx') # test

# Compute proportions
train_dist = train_set['sent_tag'].value_counts(normalize=True).mul(100).round(2).rename('Train %')
eval_dist = eval_set['sent_tag'].value_counts(normalize=True).mul(100).round(2).rename('Eval %')
test_dist = test_set['sent_tag'].value_counts(normalize=True).mul(100).round(2).rename('Test %')

# Compute counts
train_count = train_set['sent_tag'].value_counts().rename('Train Count')
eval_count = eval_set['sent_tag'].value_counts().rename('Eval Count')
test_count = test_set['sent_tag'].value_counts().rename('Test Count')

# Visualize distribution of the classes for stratified splitting
class_distributions = pd.concat([train_dist, eval_dist, test_dist, train_count, eval_count, test_count], axis=1)

# Add a final row with the total sum for count columns
totals = pd.Series({
    'Train %': train_dist.sum(),
    'Eval %': eval_dist.sum(),
    'Test %': test_dist.sum(),
    'Train Count': int(train_count.sum()),
    'Eval Count': int(eval_count.sum()),
    'Test Count': int(test_count.sum())
}, name='Total')

# Append totals row
class_distributions = pd.concat([class_distributions, totals.to_frame().T])

class_distributions

Unnamed: 0,Train %,Eval %,Test %,Train Count,Eval Count,Test Count
STR,47.08,43.33,44.58,1808.0,416.0,535.0
FUN,37.16,39.9,34.33,1427.0,383.0,412.0
MIX,8.78,9.9,14.67,337.0,95.0,176.0
OTH,6.98,6.88,6.42,268.0,66.0,77.0
Total,100.0,100.01,100.0,3840.0,960.0,1200.0


# Losanno Splitting

using train:valid = 0.9:0.1

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Import data
df_4800 = pd.read_excel('/home/fantoni/patent-sentence-classification/data/4800_axiomatic_dataset.xlsx')
test_set = pd.read_excel('/home/fantoni/patent-sentence-classification/data/1200_agreement_MC.xlsx')

# Split into train and valid set
train_set, eval_set = train_test_split(df_4800, train_size=0.9, random_state=1999)
#train_set.to_excel("/home/fantoni/patent-sentence-classification/data/train_losanno.xlsx", index=False)
#eval_set.to_excel("/home/fantoni/patent-sentence-classification/data/eval_losanno.xlsx", index=False)
#test_set.to_excel("/home/fantoni/patent-sentence-classification/data/test_agreement.xlsx", index=False)
print("Data successfully split and saved with agreement sampling.")

Data successfully split and saved with agreement sampling.
