### **Step 1: Importing Libraries**

In [None]:
import os, shutil
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
RAW_DIR = "../data/raw"
PROC_DIR = "../data/processed"

# A class is a sub-directory of RAW_DIR. Stray files (e.g. .DS_Store, README)
# are skipped so they do not get bogus class folders under every split.
class_names = sorted(
    entry for entry in os.listdir(RAW_DIR)
    if os.path.isdir(os.path.join(RAW_DIR, entry))
)

# Make directories: PROC_DIR/<split>/<class> for every split and class.
for split in ['train', 'val', 'test']:
    for cls in class_names:
        os.makedirs(os.path.join(PROC_DIR, split, cls), exist_ok=True)

#### **Create File DataFrame**

In [None]:
# Collect (filepath, label) pairs, one per image, using the class folder name
# as the label.
# os.listdir returns entries in arbitrary order, so both listings are sorted:
# without this the seeded shuffle below would yield a different row order
# (and therefore different downstream splits) from one machine to the next.
filepaths = []
for cls in sorted(os.listdir(RAW_DIR)):
    cls_path = os.path.join(RAW_DIR, cls)
    if not os.path.isdir(cls_path):
        continue
    for fname in sorted(os.listdir(cls_path)):
        fpath = os.path.join(cls_path, fname)
        # Skip nested directories; only regular files can be copied later.
        if not os.path.isfile(fpath):
            continue
        filepaths.append((fpath, cls))

df = pd.DataFrame(filepaths, columns=["filepath", "label"])
# Shuffle all rows with a fixed seed and renumber the index from 0.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

#### **Class Distribution (Before Splitting)**

Visualize the number of images in each class before splitting.

In [None]:
# Bar chart of image counts per class; bars are ordered by value_counts(),
# i.e. from the most frequent class to the least frequent one.
label_order = df['label'].value_counts().index

plt.figure(figsize=(10, 5))
sns.countplot(
    data=df,
    x='label',
    order=label_order,
)
plt.title("📊 Original Class Distribution")
plt.xticks(rotation=45)
plt.show()


#### **Stratified Split**

In [None]:
# Stage 1: keep 70% for training and hold out 30%, stratified by label.
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df['label'],
    random_state=42,
)

# Stage 2: cut the hold-out in half to obtain validation and test sets,
# again stratified by label.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label'],
    random_state=42,
)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")


🔄 Stratified Data Splitting (3-way: Train, Val, Test)

You're taking your full dataset `df` and splitting it into three sets while **preserving the class distribution** (a technique called stratified sampling):

Step 1: Train + Temp (70% / 30%)

- **train_df** gets 70% of the total data.
- **temp_df** holds the remaining 30%.
- **stratify=df['label']** ensures every class is represented in roughly the same proportion across all splits.
- **random_state=42** makes the split reproducible.

Step 2: Val + Test (15% / 15%)

- Take that 30% temporary set and split it evenly (`test_size=0.5`), so each half is 15% of the full dataset:
  - 15% to **val_df**
  - 15% to **test_df**
- Again, it uses stratified splitting.

#### **Class Distribution Across Splits**

In [None]:
def plot_split_dist(df_list, labels=["Train", "Val", "Test"]):
    """Draw one class-count bar chart per split, side by side in a single row.

    Parameters
    ----------
    df_list : sequence of DataFrame
        One frame per split; each must have a 'label' column.
    labels : sequence of str
        Panel titles, matched to ``df_list`` by position. Must contain at
        least ``len(df_list)`` entries. The default list is never mutated.

    The subplot grid is sized from ``len(df_list)`` rather than being fixed
    at three columns, so any number of splits can be plotted. For three
    splits the figure size is (12, 4).
    """
    n_panels = len(df_list)
    # 4 inches of width per panel; max() keeps the figure valid for an empty list.
    plt.figure(figsize=(4 * max(n_panels, 1), 4))
    for i, split_df in enumerate(df_list):
        plt.subplot(1, n_panels, i+1)
        # Each panel orders its bars by that split's own class frequencies.
        sns.countplot(data=split_df, x='label', order=split_df['label'].value_counts().index)
        plt.title(f"{labels[i]} Distribution")
        plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

plot_split_dist([train_df, val_df, test_df])

#### **Copy Files to `data/processed/`**

In [None]:
def copy_to_split(df_subset, split):
    """Copy every file listed in ``df_subset`` into PROC_DIR/<split>/<label>/.

    Parameters
    ----------
    df_subset : DataFrame
        Must have 'filepath' (source path) and 'label' (class name) columns.
    split : str
        Name of the split sub-folder under PROC_DIR, e.g. "train".

    Destination folders are created on demand, so the function does not rely
    on another cell having prepared them. Files keep their original base name;
    shutil.copy2 overwrites an existing file with the same name.

    NOTE(review): nothing here removes files from an earlier run, so re-running
    with a different split can leave the same image in more than one split
    folder. Clear PROC_DIR first if the split assignment has changed.
    """
    # Create each class folder once up front instead of once per file.
    for label in df_subset["label"].unique():
        os.makedirs(os.path.join(PROC_DIR, split, label), exist_ok=True)

    for src, label in tqdm(df_subset[["filepath", "label"]].values):
        dst = os.path.join(PROC_DIR, split, label, os.path.basename(src))
        shutil.copy2(src, dst)

copy_to_split(train_df, "train")
copy_to_split(val_df, "val")
copy_to_split(test_df, "test")


In [None]:
def log_class_counts(*splits, split_names=["Train", "Val", "Test"]):
    """Tabulate how many rows each class has in each split.

    Parameters
    ----------
    *splits : DataFrame
        One frame per split; each must have a 'label' column.
    split_names : sequence of str
        Column names for the result, paired with ``splits`` by position
        (pairing stops at the shorter of the two).

    Returns
    -------
    DataFrame
        Rows are class labels, columns are split names, values are integer
        counts; a class absent from a split is reported as 0.

    A header line is printed before the table is built.
    """
    print("📋 Image Count per Class per Split\n")
    per_split = {
        name: frame['label'].value_counts().sort_index()
        for frame, name in zip(splits, split_names)
    }
    # Building from a dict of Series aligns on the union of labels; labels
    # missing from a split come out as NaN, hence fillna before the int cast.
    return pd.DataFrame(per_split).fillna(0).astype(int)

# Build the per-class count table for the train/val/test frames. The call is
# the last expression in the cell, so the notebook renders the returned table.
log_class_counts(train_df, val_df, test_df)

#### **Save to CSV in `reports/`**

In [None]:
# Persist the per-class counts table. The reports folder is created first so
# that to_csv does not fail when the directory does not exist yet.
os.makedirs("../reports", exist_ok=True)
summary_df = log_class_counts(train_df, val_df, test_df)
summary_df.to_csv("../reports/class_distribution_summary.csv")