In [1]:
!pip install pandas
!pip install scikit-learn



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

In [4]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# paths used later in this notebook are all located under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
def three_way_split_and_save(merged_csv_path, output_folder, stratify_column=None):
    """
    Split a merged dataset into train, validation, and test sets
    (70%, 15%, 15%) and save them as separate CSV files.

    The split is done in two steps with a fixed random seed (42): first
    70% / 30%, then the 30% remainder is halved into validation and test.
    Optionally supports stratified splitting based on a specified column
    to preserve class distribution across splits.

    Args:
        merged_csv_path (str): Full path to the merged CSV dataset.
        output_folder (str): Directory where the split CSV files should be saved.
                             It is created if it does not exist yet.
        stratify_column (str, optional): Column name to stratify on, if desired.
                                         Defaults to None (no stratification).
                                         If the column is not present in the
                                         dataset, a warning is issued and the
                                         split falls back to no stratification.

    Returns:
        None. Saves 'train_dataset.csv', 'val_dataset.csv', and 'test_dataset.csv'
        in the output folder and prints the path and row count of each.
    """
    seed = 42  # fixed so repeated runs produce the same split

    # Load the merged dataset
    df = pd.read_csv(merged_csv_path)

    # Stratify only when a column was requested AND it exists in the data.
    use_stratify = bool(stratify_column) and stratify_column in df.columns
    if stratify_column and not use_stratify:
        # Keep the original best-effort fallback, but make it visible instead
        # of silently producing a non-stratified split.
        import warnings
        warnings.warn(
            f"stratify_column {stratify_column!r} not found in dataset columns; "
            "splitting without stratification.",
            stacklevel=2,
        )

    # First, split 70% for training and 30% as a temporary set
    train_df, temp_df = train_test_split(
        df,
        test_size=0.3,
        random_state=seed,
        stratify=df[stratify_column] if use_stratify else None,
    )

    # Then split the temporary set equally into validation and test sets (15% each)
    val_df, test_df = train_test_split(
        temp_df,
        test_size=0.5,
        random_state=seed,
        stratify=temp_df[stratify_column] if use_stratify else None,
    )

    # Make sure the destination exists; an empty folder string means the
    # current working directory, which needs no creation.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Save each split to CSV (no index column) and report where it went
    splits = (
        ("Training", 'train_dataset.csv', train_df),
        ("Validation", 'val_dataset.csv', val_df),
        ("Test", 'test_dataset.csv', test_df),
    )
    for label, filename, split_df in splits:
        split_path = os.path.join(output_folder, filename)
        split_df.to_csv(split_path, index=False)
        print(f"{label} set saved: {split_path} — {len(split_df)} rows")

# Locations on the mounted Drive: the folder that receives the three split
# files, and the merged CSV that is read as input.
output_folder = '/content/drive/MyDrive/GW Semesters/Sem 2/ML by Prof Shi Feng/ML Sem Project - Fake Review Detector/FakeReviewShield/Final Datasets'
merged_csv_path = '/content/drive/MyDrive/GW Semesters/Sem 2/ML by Prof Shi Feng/ML Sem Project - Fake Review Detector/FakeReviewShield/Final Datasets/merged_output.csv'

# Perform the split; stratify_column=None means a plain random split for now.
three_way_split_and_save(
    merged_csv_path=merged_csv_path,
    output_folder=output_folder,
    stratify_column=None,
)

Training set saved: /content/drive/MyDrive/GW Semesters/Sem 2/ML by Prof Shi Feng/ML Sem Project - Fake Review Detector/FakeReviewShield/Final Datasets/train_dataset.csv — 24500 rows
Validation set saved: /content/drive/MyDrive/GW Semesters/Sem 2/ML by Prof Shi Feng/ML Sem Project - Fake Review Detector/FakeReviewShield/Final Datasets/val_dataset.csv — 5250 rows
Test set saved: /content/drive/MyDrive/GW Semesters/Sem 2/ML by Prof Shi Feng/ML Sem Project - Fake Review Detector/FakeReviewShield/Final Datasets/test_dataset.csv — 5250 rows
