In [6]:
# Install necessary libraries
!pip install dvc dvc[gdrive] --quiet

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Define paths
DATA_DIR = "/content/drive/MyDrive/sms+spam+collection"
RAW_FILE = os.path.join(DATA_DIR, "SMSSpamCollection.csv")
RAW_DATA = os.path.join(DATA_DIR, "raw_data.csv")
TRAIN_DATA = os.path.join(DATA_DIR, "train.csv")
VAL_DATA = os.path.join(DATA_DIR, "validation.csv")
TEST_DATA = os.path.join(DATA_DIR, "test.csv")

# Ensure data directory exists
os.makedirs(DATA_DIR, exist_ok=True)

# Remove broken Git/DVC setup if any
!rm -rf "{DATA_DIR}/.git" "{DATA_DIR}/.dvc"

# Initialize Git and DVC
%cd "{DATA_DIR}"
!git init
!dvc init

# Configure Git identity (Replace with your actual details)
!git config --global user.email "suryadeva.eada@gmail.com"
!git config --global user.name "infinitysurya"

# Set up Google Drive as remote storage for DVC
!dvc remote add -d myremote "{DATA_DIR}"

# Load raw data
df = pd.read_csv(RAW_FILE, sep='\t', header=None, names=['label', 'text'])
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
df.to_csv(RAW_DATA, index=False)

# Track raw data with DVC
!dvc add raw_data.csv
!git add raw_data.csv.dvc .gitignore
!git commit -m "Added raw data"
!dvc push

# Function to split and track data
def split_data(random_seed):
    df = pd.read_csv(RAW_DATA)
    train, temp = train_test_split(df, test_size=0.3, random_state=random_seed, stratify=df['label'])
    val, test = train_test_split(temp, test_size=0.5, random_state=random_seed, stratify=temp['label'])

    train.to_csv(TRAIN_DATA, index=False)
    val.to_csv(VAL_DATA, index=False)
    test.to_csv(TEST_DATA, index=False)

    # Track split data with DVC
    !dvc add train.csv validation.csv test.csv
    !git add train.csv.dvc validation.csv.dvc test.csv.dvc
    !git commit -m f"Added train/val/test split with seed {random_seed}"
    !dvc push

    return train, val, test

# Helper: map each label to the number of rows carrying it in one split
def _label_counts(split):
    return split['label'].value_counts().to_dict()


# Version 1 of the split (seed 42) and its class balance
train_1, val_1, test_1 = split_data(random_seed=42)
counts_v1 = [_label_counts(part) for part in (train_1, val_1, test_1)]

print("First version label distribution:")
for heading, counts in zip(("Train:", "Validation:", "Test:"), counts_v1):
    print(heading, counts)

# Version 2 of the split: same procedure, different random seed (99)
train_2, val_2, test_2 = split_data(random_seed=99)
counts_v2 = [_label_counts(part) for part in (train_2, val_2, test_2)]

print("\nUpdated version label distribution:")
for heading, counts in zip(("Train:", "Validation:", "Test:"), counts_v2):
    print(heading, counts)

# Per-label change in row counts between the two versions (v2 minus v1)
print("\nComparison of label distribution changes:")
diff_headings = ("Train Difference:", "Validation Difference:", "Test Difference:")
for heading, before, after in zip(diff_headings, counts_v1, counts_v2):
    print(heading, {label: after[label] - before[label] for label in [0, 1]})


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/sms+spam+collection
hint: Using 'master' as the name for the initial branch. This default branch name
hint: is subject to change. To configure the initial branch name to use in all
hint: 
hint: 	git config --global init.defaultBranch <name>
hint: 
hint: Names commonly chosen instead of 'master' are 'main', 'trunk' and
hint: 'development'. The just-created branch can be renamed via this command:
hint: 
hint: 	git branch -m <name>
Initialized empty Git repository in /content/drive/MyDrive/sms+spam+collection/.git/
Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and h