In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the full dataset, relative to the notebook's working directory.
original_dataset_path = '../datasets/imdb_dataset.csv'

# Load the CSV. On failure, print a short human-readable hint and re-raise the
# original exception so execution of the cell stops right here with a traceback.
# `exit()` is deliberately not used: it is a helper intended for interactive
# sessions, and in a notebook it asks for the kernel to be shut down instead of
# simply failing the cell.
try:
    df = pd.read_csv(original_dataset_path)
    print(f"Successfully loaded dataset with {len(df)} samples.")
except FileNotFoundError:
    print(f"File not found: {original_dataset_path}")
    raise
except pd.errors.EmptyDataError:
    print(f"No data: {original_dataset_path} is empty.")
    raise
except pd.errors.ParserError:
    print(f"Parsing error: Check the format of {original_dataset_path}.")
    raise

print("First few samples of the dataset:")
print(df.head())

# The split below stratifies on 'sentiment', and both columns are written to
# the output files, so fail early if either one is absent.
required_columns = {'review', 'sentiment'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(
        f"Dataset must contain the following columns: {required_columns} "
        f"(missing: {sorted(missing_columns)})"
    )

# 80/20 split, stratified on the label column so both parts are requested with
# the same label proportions as the full dataset.
# The seed is fixed so that re-running this cell regenerates the same
# train/test CSV files instead of silently overwriting them with a new split.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment']
)

# Every input row must land in exactly one of the two splits.
assert len(train_df) + len(test_df) == len(df), "split sizes do not add up"

# Reset the index of the resulting DataFrames
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Define output file paths
train_output_path = '../datasets/imdb_dataset_train.csv'
test_output_path = '../datasets/imdb_dataset_test.csv'

# Save the splits to CSV files (index=False: the reset index is not written)
train_df.to_csv(train_output_path, index=False)
test_df.to_csv(test_output_path, index=False)

print(f"Training set saved to {train_output_path} with {len(train_df)} samples.")
print(f"Test set saved to {test_output_path} with {len(test_df)} samples.")

# Optional: Verify the distribution of labels in both splits
def verify_distribution(original, split, split_name, label_col='sentiment'):
    """Print the label proportions of a split followed by those of the original.

    Args:
        original: DataFrame holding the full dataset; must contain `label_col`.
        split: DataFrame holding one split of `original`; must contain `label_col`.
        split_name: Human-readable name of the split, used in the printed header.
        label_col: Name of the label column to summarise. Defaults to
            'sentiment', which preserves the behaviour of existing callers.

    Returns:
        None. The two distributions are only printed.

    Raises:
        KeyError: If `label_col` is missing from either DataFrame.
    """
    # normalize=True yields relative frequencies rather than raw counts,
    # which makes differently sized frames directly comparable.
    original_dist = original[label_col].value_counts(normalize=True)
    split_dist = split[label_col].value_counts(normalize=True)
    print(f"\nLabel distribution in {split_name}:")
    print(split_dist)
    print("Original label distribution:")
    print(original_dist)

# Report the label balance of each split against the full dataset,
# training set first, then test set.
for split_frame, split_label in ((train_df, "Training Set"), (test_df, "Test Set")):
    verify_distribution(df, split_frame, split_label)

Successfully loaded dataset with 50000 samples.
First few samples of the dataset:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Training set saved to ../datasets/imdb_dataset_train.csv with 40000 samples.
Test set saved to ../datasets/imdb_dataset_test.csv with 10000 samples.

Label distribution in Training Set:
sentiment
positive    0.5
negative    0.5
Name: proportion, dtype: float64
Original label distribution:
sentiment
positive    0.5
negative    0.5
Name: proportion, dtype: float64

Label distribution in Test Set:
sentiment
positive    0.5
negative    0.5
Name: proportion, dtype: float64
Original label distribution:
sentiment
positive    0.5
negative    