In [1]:
import os
import pandas as pd
import sys
from sklearn.model_selection import train_test_split


In [2]:
# Local-machine setup: make the parent of the notebook's working directory
# importable, so that packages living there (e.g. `src`) can be imported.
current_dir = os.getcwd()

# Absolute path of the directory one level above the working directory.
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))

# Append only when the entry is missing: re-running this cell must not keep
# stacking duplicate copies of the same path onto sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)


In [3]:
def load_and_prepare_data(file_path, random_state=42):
    """Load a parquet dataset, cast its labels to int and shuffle the rows.

    Parameters
    ----------
    file_path : str or path-like
        Location of the parquet file. Only the 'text', 'text_b' and 'label'
        columns are read.
    random_state : int, numpy random generator or None, default 42
        Seed forwarded to ``DataFrame.sample``. A fixed seed makes the row
        order, and therefore any later seeded split of the result,
        reproducible across kernel restarts. Pass ``None`` for a different
        shuffle on every call.

    Returns
    -------
    pandas.DataFrame
        The three columns with 'label' cast to ``int``, rows shuffled, and a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Propagated from ``astype(int)`` when a label cannot be converted to an
        integer (for example a missing value).
    """
    # Load the data
    data = pd.read_parquet(file_path, columns=['text', 'text_b', 'label'])

    # Cast label column to integer (builds a new frame, leaves the loaded one untouched)
    data = data.assign(label=data['label'].astype(int))

    # Seeded shuffle of all rows; the old index is dropped so it carries no
    # trace of the on-disk order.
    shuffled = data.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)


In [4]:
# Imported here rather than in the first cell: presumably the `src` package
# lives in the parent directory and only becomes importable after that
# directory has been appended to sys.path in the setup cell.
from src.data_cleaning import clearing
# Run the dataset cleaning process. As the cell output shows, it reports the
# train and test row counts before and after removing nulls and duplicates.
# NOTE(review): presumably this also (re)writes the clean_*.parquet files read
# by the following cells; confirm in src/data_cleaning.py.
clearing()

Train set: Initial number of rows: 3030
Train set: Number of rows after removing nulls and duplicates: 2909
Test set: Initial number of rows: 808
Test set: Number of rows after removing nulls and duplicates: 776


In [5]:
# Absolute path of the directory one level above the notebook's working
# directory; the data files used below are addressed relative to it.
_one_level_up = os.path.join(os.getcwd(), os.pardir)
parent_dir = os.path.abspath(_one_level_up)

In [6]:
# Cleaned training set: build its path inside the parent directory's data
# folder, then load it through the helper (label cast to int, rows shuffled).
clean_train_path = os.path.join(parent_dir, 'data', 'clean_train.parquet')
train_data = load_and_prepare_data(file_path=clean_train_path)

In [7]:
# Hold out 20% of the rows as a validation set, with a fixed seed for the split.
# NOTE: train_data is rebound to the 80% training portion, so re-running this
# cell without re-running the load cell splits the already-reduced frame again.
train_data, val_data = train_test_split(
    train_data,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Persist the training and validation DataFrames as .parquet files in the
# data folder.
# NOTE: the training split is written back to clean_train.parquet, i.e. it
# replaces the file that the full cleaned training set was loaded from.
for split_frame, file_name in (
    (train_data, 'clean_train.parquet'),
    (val_data, 'clean_val.parquet'),
):
    split_frame.to_parquet(os.path.join(parent_dir, 'data', file_name))

In [9]:
# Cleaned test set: same path construction and loading helper as for the
# training data (label cast to int, rows shuffled).
clean_test_path = os.path.join(parent_dir, 'data', 'clean_test.parquet')
test_data = load_and_prepare_data(file_path=clean_test_path)

In [10]:
# Turn each split (train, validation, test) into a list of row dictionaries,
# one dict per row keyed by column name.
train_list, val_list, test_list = (
    frame.to_dict(orient='records')
    for frame in (train_data, val_data, test_data)
)

In [11]:

# Report the number of records in the train, validation and test lists,
# one count per line.
print(len(train_list), len(val_list), len(test_list), sep='\n')

2327
582
776
