In [49]:
import os
import sys

In [50]:
def is_colab():
    """Tell whether the notebook is running on Google Colab.

    Returns:
        bool: True if the ``google.colab`` package can be imported,
        False if the import raises ImportError.
    """
    try:
        import google.colab  # imported only as an availability probe
    except ImportError:
        return False
    else:
        return True

In [51]:
# Make the project root the working directory so that relative paths such as
# 'data/train.parquet' resolve the same way on Colab and on a local checkout.
if is_colab():
    # Imported here because the module is only needed on the Colab branch.
    from google.colab import drive
    drive.mount('/content/drive')

    root_dir = '/content/drive/MyDrive/PatMatBaselineAlena'
    # os.chdir rather than the %cd magic: plain Python is valid inside an
    # if/else block for any tool that parses or exports this cell.
    os.chdir(root_dir)
else:
    current_dir = os.getcwd()
    if current_dir.endswith("notebooks"):
        # Started from the notebooks/ folder: step up to the project root.
        os.chdir('..')
    # In both local cases the working directory is now the project root.
    # root_dir is resolved against that *new* working directory, so it has to
    # be '.'; '..' would point one level above the project after the chdir.
    root_dir = '.'

In [52]:
# Capture the working directory in effect at this point and print it as a
# quick sanity check; current_dir is reused by the sys.path cell.
current_dir = os.getcwd()
print(current_dir)

c:\Users\alena\MyFolder\DIL\PatMatBaselineAlena


In [53]:
# Make modules located in the working directory importable. The membership
# test keeps sys.path free of duplicate entries when this cell is re-run.
if current_dir not in sys.path:
    sys.path.append(current_dir)

In [None]:
# Install the packages listed in requirements.txt with the %pip line magic.
# The path is relative, so it is resolved against the current working directory.
%pip install -r requirements.txt

In [55]:
import pandas as pd

In [56]:
# Locations of the input parquet files, relative to the working directory.
train_file_path = 'data/train.parquet'
test_file_path = 'data/test.parquet'

In [57]:
# Load both input files; these frames are only read from in the cells below,
# which derive new filtered/cleaned frames instead of modifying them.
df_train = pd.read_parquet(train_file_path)
df_test = pd.read_parquet(test_file_path)

In [58]:
# Row counts, shown in the order (test, train).
len(df_test), len(df_train)

(808, 3030)

In [59]:
# Fraction of all rows (train + test) that belong to the test file.
len(df_test) / (len(df_test) + len(df_train))

0.21052631578947367

In [60]:
# Training rows that have a missing value in at least one column;
# prints their count and their share of the training file.
rows_with_nulls_train = df_train.loc[df_train.isna().any(axis="columns")]
print(
    len(rows_with_nulls_train),
    len(rows_with_nulls_train) / len(df_train),
)

280 0.0924092409240924


In [61]:
# Test rows that have a missing value in at least one column;
# prints their count and their share of the test file.
rows_with_nulls_test = df_test.loc[df_test.isna().any(axis="columns")]
print(
    len(rows_with_nulls_test),
    len(rows_with_nulls_test) / len(df_test),
)

74 0.09158415841584158


In [62]:
# Same null check on the training file, restricted to the three columns that
# are kept for the exported splits: text, text_b and label.
rows_with_nulls_train_columns_needed = df_train.loc[
    df_train[['text', 'text_b', 'label']].isna().any(axis="columns")
]
print(
    len(rows_with_nulls_train_columns_needed),
    len(rows_with_nulls_train_columns_needed) / len(df_train),
)

91 0.030033003300330034


In [63]:
# Same null check on the test file, restricted to the three columns that
# are kept for the exported splits: text, text_b and label.
rows_with_nulls_test_columns_needed = df_test.loc[
    df_test[['text', 'text_b', 'label']].isna().any(axis="columns")
]
print(
    len(rows_with_nulls_test_columns_needed),
    len(rows_with_nulls_test_columns_needed) / len(df_test),
)

25 0.03094059405940594


Cleaning and filtering of the training dataset

In [64]:
# Keep only the training rows in which text, text_b and label are all present.
df_train_filtered = df_train.dropna(subset=['text', 'text_b', 'label'])

len(df_train_filtered)

2939

In [65]:
# keep=False marks every row of a repeated (text, text_b) pair, so all copies
# of a duplicated pair are removed rather than all but one.
duplicates_train = df_train_filtered.duplicated(subset=["text", "text_b"], keep=False)
num_duplicates_train = duplicates_train.sum()
cleaned_df_train = df_train_filtered.drop_duplicates(subset=["text", "text_b"], keep=False)
# Displayed: rows kept, rows examined, rows flagged as duplicates, and the
# flagged share measured against the unfiltered training file.
(
    len(cleaned_df_train),
    len(duplicates_train),
    num_duplicates_train,
    num_duplicates_train / len(df_train),
)

(2881, 2939, 58, 0.01914191419141914)

In [66]:
# Class balance of the cleaned training data: number of rows per label value.
value_counts = cleaned_df_train['label'].value_counts()
value_counts

label
1.0    1683
0.0    1198
Name: count, dtype: int64

Cleaning and filtering of the test dataset

In [68]:
# Keep only the test rows in which text, text_b and label are all present.
df_test_filtered = df_test.dropna(subset=['text', 'text_b', 'label'])

len(df_test_filtered)

783

In [69]:
# keep=False marks every row of a repeated (text, text_b) pair, so all copies
# of a duplicated pair are removed rather than all but one.
duplicates_test = df_test_filtered.duplicated(subset=["text", "text_b"], keep=False)
num_duplicates_test = duplicates_test.sum()
cleaned_df_test = df_test_filtered.drop_duplicates(subset=["text", "text_b"], keep=False)
# Displayed: rows kept, rows examined, rows flagged as duplicates, and the
# flagged share measured against the unfiltered test file.
(
    len(cleaned_df_test),
    len(duplicates_test),
    num_duplicates_test,
    num_duplicates_test / len(df_test),
)

(769, 783, 14, 0.017326732673267328)

In [70]:
# Class balance of the cleaned test data: number of rows per label value.
# NOTE: this rebinds value_counts, replacing the training counts computed earlier.
value_counts = cleaned_df_test['label'].value_counts()
value_counts

label
0.0    394
1.0    375
Name: count, dtype: int64

In [73]:
# Leakage check: inner-join the cleaned test and training sets on
# (text, text_b); every resulting row is a pair present in both sets.
# NOTE: this rebinds duplicates_test, which held a boolean mask before.
duplicates_test = pd.merge(
    cleaned_df_test,
    cleaned_df_train,
    how="inner",
    on=["text", "text_b"],
)
len(duplicates_test)

0

In [74]:
from sklearn.model_selection import train_test_split

In [75]:
# Shuffle the cleaned training rows with a fixed seed and renumber the index,
# then hold out 20% of them as the validation set (also seeded).
df_train_for_split = (
    cleaned_df_train
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
train_df, validation_df = train_test_split(
    df_train_for_split,
    test_size=0.2,
    random_state=42,
)

In [76]:
# Reduce every split to the three exported columns. The test split is taken
# from cleaned_df_test, which did not go through the train/validation split.
train_df, validation_df, test_df = (
    frame[['text', 'text_b', 'label']]
    for frame in (train_df, validation_df, cleaned_df_test)
)

In [79]:
# Report each split's row count and its share of all rows,
# in the order train, validation, test.
total_rows = sum([len(train_df), len(validation_df), len(test_df)])
for split in (train_df, validation_df, test_df):
    print(len(split), len(split) / total_rows)

2304 0.6312328767123287
577 0.15808219178082192
769 0.2106849315068493


In [96]:
# Preview the first rows of the test split to confirm the exported columns.
test_df.head()

Unnamed: 0,text,text_b,label
0,The medical device of claim 1 wherein said hou...,In one embodiment the device also includes at ...,1.0
1,Process according to any one of the preceding ...,According to another embodiment the white pigm...,0.0
2,The method of any of claims 11 to 12 further c...,In certain embodiments the method further comp...,0.0
3,A gas turbine engine 20 comprising an engine s...,Referring now to FIG.6 an axial section view o...,1.0
4,The system of claim 1 the operations comprisin...,Each of the entry and exit gates 202 204 compr...,0.0


In [66]:
def save_to_jsonl(df, filename):
    """Write ``df`` to ``filename`` in JSON Lines format.

    Args:
        df: DataFrame to serialise; each row is written as one JSON object
            on its own line (``orient='records'``, ``lines=True``).
        filename: Destination path handed to ``DataFrame.to_json``.
    """
    df.to_json(path_or_buf=filename, orient='records', lines=True)

# Export the three splits as JSON Lines into the project's data/ folder.
# The paths are relative to the working directory, i.e. the same base that
# 'data/train.parquet' is read from; a '../data/' prefix would place the
# files one level above that directory.
save_to_jsonl(train_df, 'data/train.jsonl')
save_to_jsonl(validation_df, 'data/validation.jsonl')
save_to_jsonl(test_df, 'data/test.jsonl')