# Data Preprocessing

In [1]:
from pathlib import Path
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:

# All locations hang off a single data root so the notebook can be
# repointed by changing one line.
data_dir = Path("../data")

# Inputs: the label manifest and the flat folder of raw training images.
csv_path = data_dir.joinpath("train.csv")
train_img_dir = data_dir.joinpath("train")

# Outputs: one root folder per split.
train_output, val_output = (
    data_dir.joinpath(split_name) for split_name in ("train_split", "val_split")
)

In [3]:
# Read the label manifest; the cells below rely on its `image_ID` and
# `label` columns.
df = pd.read_csv(filepath_or_buffer=csv_path)

# Bare last expression so the notebook renders the frame as a table.
df

Unnamed: 0,image_ID,label
0,7c225f7b61.jpg,Badminton
1,b31.jpg,Badminton
2,acb146546c.jpg,Badminton
3,0e62f4d8f2.jpg,Badminton
4,cfd2c28c74.jpg,Cricket
...,...,...
8222,903d2fffb3.jpg,Cricket
8223,b5f174c688.jpg,Badminton
8224,2433770a67.jpg,Tennis
8225,6403964166.jpg,Tennis


In [None]:
# Hold out 20% of the rows for validation. Stratifying on the label keeps
# the class proportions the same in both parts, and the fixed seed makes
# the split reproducible.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

# Create both split roots up front (no error if they already exist).
for output_root in (train_output, val_output):
    output_root.mkdir(parents=True, exist_ok=True)

# Move images into per-label folders using pathlib
def move_images(df, target_folder, source_folder=None):
    """Move every image listed in ``df`` into ``target_folder/<label>/``.

    The function is safe to re-run: an image whose source file is gone but
    whose destination file already exists is treated as already moved and
    skipped. All rows are checked before anything is moved, so a genuinely
    missing image aborts the call without leaving a half-moved dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``image_ID`` (file name) and ``label``
        (name of the destination sub-folder).
    target_folder : str or pathlib.Path
        Root folder that receives one sub-folder per label.
    source_folder : str or pathlib.Path, optional
        Folder currently holding the images. Defaults to the notebook-level
        ``train_img_dir``.

    Returns
    -------
    int
        Number of files moved by this call (0 when everything was already
        in place).

    Raises
    ------
    FileNotFoundError
        If an image exists neither in the source nor at its destination.
    """
    source_folder = Path(train_img_dir if source_folder is None else source_folder)
    target_folder = Path(target_folder)

    pending = []   # (src, dst) pairs that still need to be moved
    missing = []   # sources that exist nowhere
    seen = set()   # guards against the same file being listed twice

    # Column-wise iteration avoids building a Series per row as iterrows does.
    for image_id, label in zip(df["image_ID"], df["label"]):
        src = source_folder / image_id
        if src in seen:
            continue
        seen.add(src)

        dst = target_folder / label / image_id
        if src.exists():
            pending.append((src, dst))
        elif not dst.exists():
            missing.append(str(src))
        # else: moved by a previous run, nothing to do

    if missing:
        preview = ", ".join(missing[:5])
        raise FileNotFoundError(
            f"{len(missing)} image(s) found neither in the source folder nor "
            f"at their destination, e.g. {preview}"
        )

    created = set()
    for src, dst in pending:
        # Create each label folder once instead of once per image.
        if dst.parent not in created:
            dst.parent.mkdir(parents=True, exist_ok=True)
            created.add(dst.parent)
        shutil.move(str(src), str(dst))

    return len(pending)

# Relocate the image files: training split first, then validation.
for split_frame, split_root in ((train_df, train_output), (val_df, val_output)):
    move_images(split_frame, split_root)

# Write one manifest per split next to the image folders; the DataFrame
# index is left out of the files.
manifests = {
    "train_split.csv": train_df,
    "val_split.csv": val_df,
}
for manifest_name, split_frame in manifests.items():
    split_frame.to_csv(data_dir / manifest_name, index=False)

print("✅ Data has been split: 80% for training, 20% for validation!")


✅ Data has been split: 80% for training, 20% for validation!
