# main3: portfolio-ready rebuild
Step-by-step refresh toward a cleaner, more reproducible pipeline.

In [1]:
import sys, platform
# Record which interpreter and which Python version produced this run.
for environment_detail in (sys.executable, platform.python_version()):
    print(environment_detail)

/usr/bin/python3
3.12.3


In [2]:
from pathlib import Path
def list_class_names(split_dir: Path) -> list[str]:
    """Return the sorted names of the class sub-folders inside ``split_dir``.

    Args:
        split_dir: Folder expected to contain one sub-folder per class.

    Returns:
        Alphabetically sorted sub-folder names. An empty list is returned when
        ``split_dir`` is missing or is not a directory, so this diagnostic
        cell reports the problem instead of aborting with an exception.
    """
    if not split_dir.is_dir():
        return []
    return sorted(p.name for p in split_dir.iterdir() if p.is_dir())


root = Path('.')
data_dir = root / 'Data'
data_original_dir = root / 'Data_original'
print('Data dir exists:', data_dir.exists())
print('Data_original dir exists:', data_original_dir.exists())

# Augmented dataset: list the class folders of each split.
if data_dir.exists():
    train_dir = data_dir / 'Train'
    val_dir = data_dir / 'Validation'
    print('Train classes:', list_class_names(train_dir))
    print('Validation classes:', list_class_names(val_dir))

# Original (non-augmented) dataset: same check.
if data_original_dir.exists():
    train_dir_o = data_original_dir / 'Train'
    val_dir_o = data_original_dir / 'Validation'
    print('Orig Train classes:', list_class_names(train_dir_o))
    print('Orig Validation classes:', list_class_names(val_dir_o))

Data dir exists: True
Data_original dir exists: True
Train classes: ['biryani', 'chapli_kebab', 'chocolate_cake', 'samosa', 'seekh_kebab']
Validation classes: ['biryani', 'chapli_kebab', 'chocolate_cake', 'samosa', 'seekh_kebab']
Orig Train classes: ['biryani', 'chapli_kebab', 'chocolate_cake', 'samosa', 'seekh_kebab']
Orig Validation classes: ['biryani', 'chapli_kebab', 'chocolate_cake', 'samosa', 'seekh_kebab']


In [3]:
from pathlib import Path
from collections import Counter

def count_images(root: Path, patterns: tuple[str, ...] = ('*.jpg',)) -> dict[str, int]:
    """Count the image paths in each class sub-folder of ``root``.

    Args:
        root: Folder that contains one sub-folder per class.
        patterns: Glob pattern(s) that identify an image. Defaults to
            ``('*.jpg',)``; a single pattern may also be given as a string.

    Returns:
        Mapping of class-folder name to the number of distinct paths that
        match any of ``patterns`` directly inside that folder. An empty dict
        is returned when ``root`` is missing or is not a directory.
    """
    counts = {}
    # is_dir() (rather than exists()) also rejects a plain file, on which
    # iterdir() would raise NotADirectoryError.
    if not root.is_dir():
        return counts
    if isinstance(patterns, str):
        patterns = (patterns,)
    for cls in sorted(p for p in root.iterdir() if p.is_dir()):
        # A set keeps a path from being counted twice when patterns overlap.
        matched = set()
        for pattern in patterns:
            matched.update(cls.glob(pattern))
        counts[cls.name] = len(matched)
    return counts

# Tally images per class for the augmented and the original datasets.
augmented_root, original_root = Path('Data'), Path('Data_original')
train_counts = count_images(augmented_root / 'Train')
val_counts = count_images(augmented_root / 'Validation')
orig_train_counts = count_images(original_root / 'Train')
orig_val_counts = count_images(original_root / 'Validation')

# Per-class breakdown first, then the overall training totals.
for label, tally in (
    ('Augmented train counts:', train_counts),
    ('Augmented val counts:', val_counts),
    ('Original train counts:', orig_train_counts),
    ('Original val counts:', orig_val_counts),
):
    print(label, tally)
for label, tally in (
    ('Total augmented train:', train_counts),
    ('Total original train:', orig_train_counts),
):
    print(label, sum(tally.values()))

Augmented train counts: {'biryani': 95, 'chapli_kebab': 95, 'chocolate_cake': 150, 'samosa': 110, 'seekh_kebab': 95}
Augmented val counts: {'biryani': 5, 'chapli_kebab': 4, 'chocolate_cake': 7, 'samosa': 5, 'seekh_kebab': 4}
Original train counts: {'biryani': 19, 'chapli_kebab': 19, 'chocolate_cake': 30, 'samosa': 22, 'seekh_kebab': 19}
Original val counts: {'biryani': 5, 'chapli_kebab': 4, 'chocolate_cake': 7, 'samosa': 5, 'seekh_kebab': 4}
Total augmented train: 545
Total original train: 109


In [4]:
import random, shutil
from pathlib import Path
from collections import defaultdict

seed = 42
random.seed(seed)  # seed the global RNG so the shuffles below are repeatable

source_dirs = [Path('Data_original/Train'), Path('Data_original/Validation')]
out_root = Path('Data_split')

# Class names come from the sub-folders of the first source directory.
classes = [p.name for p in sorted(source_dirs[0].iterdir()) if p.is_dir()]

# Pool the images of each class across every source directory, in sorted
# order so the seeded shuffle always starts from the same list.
per_class = {}
for cls in classes:
    imgs = []
    for src in source_dirs:
        imgs.extend(sorted((src / cls).glob('*.jpg')))
    per_class[cls] = imgs

# Refuse to wipe an existing split when there is nothing to replace it with.
if not any(per_class.values()):
    raise FileNotFoundError(
        f'No .jpg images found under {[str(src) for src in source_dirs]}'
    )

# Validation and test sizes are derived from their ratios; train receives
# whatever remains.
train_ratio, val_ratio, test_ratio = 0.6, 0.2, 0.2


def holdout_sizes(n, val_share, test_share):
    """Return ``(val_n, test_n)`` for a class that has ``n`` images.

    Each hold-out split gets ``round(n * share)`` images, but at least one.
    The sizes are then clamped so that one image is always left for training:
    test may take at most ``n - 1`` images and validation at most what
    remains after test and that one training image. For ``n == 0`` both
    sizes are 0.
    """
    test_n = max(1, int(round(n * test_share)))
    val_n = max(1, int(round(n * val_share)))
    test_n = min(test_n, max(0, n - 1))
    val_n = min(val_n, max(0, n - 1 - test_n))
    return val_n, test_n


def unique_dest_name(img, taken):
    """Return a file name for ``img`` that is not in ``taken`` and record it.

    Images are pooled from several source folders, so two different files can
    share a name. A clashing name is prefixed with the name of the source
    folder it came from (plus a counter if that is still not unique), so one
    copy never overwrites another.
    """
    name = img.name
    if name in taken:
        origin = img.parent.parent.name
        name = f'{origin}_{img.name}'
        attempt = 1
        while name in taken:
            name = f'{origin}_{img.stem}_{attempt}{img.suffix}'
            attempt += 1
    taken.add(name)
    return name


# Reset the output tree so files from a previous run cannot linger.
if out_root.exists():
    shutil.rmtree(out_root)
for split in ['train', 'val', 'test']:
    for cls in classes:
        (out_root / split / cls).mkdir(parents=True, exist_ok=True)

split_counts = defaultdict(lambda: defaultdict(int))

for cls, imgs in per_class.items():
    imgs = list(imgs)
    random.shuffle(imgs)
    val_n, test_n = holdout_sizes(len(imgs), val_ratio, test_ratio)
    # Test takes the head of the shuffled list, validation the next slice,
    # and training everything that is left.
    assigned = {
        'train': imgs[test_n + val_n:],
        'val': imgs[test_n:test_n + val_n],
        'test': imgs[:test_n],
    }
    taken_names = set()  # destination names already used for this class
    for split, split_imgs in assigned.items():
        for img in split_imgs:
            dest = out_root / split / cls / unique_dest_name(img, taken_names)
            shutil.copy2(img, dest)
            split_counts[split][cls] += 1

print('Split counts per class:')
for split in ['train', 'val', 'test']:
    print(split, {cls: split_counts[split][cls] for cls in classes})
print('Totals:', {split: sum(split_counts[split].values()) for split in ['train', 'val', 'test']})

Split counts per class:
train {'biryani': 14, 'chapli_kebab': 13, 'chocolate_cake': 23, 'samosa': 17, 'seekh_kebab': 13}
val {'biryani': 5, 'chapli_kebab': 5, 'chocolate_cake': 7, 'samosa': 5, 'seekh_kebab': 5}
test {'biryani': 5, 'chapli_kebab': 5, 'chocolate_cake': 7, 'samosa': 5, 'seekh_kebab': 5}
Totals: {'train': 80, 'val': 27, 'test': 27}


In [None]:
from pathlib import Path
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input

# NOTE(review): recent TensorFlow/Keras releases mark ImageDataGenerator as
# deprecated in favour of image_dataset_from_directory plus preprocessing
# layers; it is kept here to preserve the existing pipeline.
train_dir = Path('Data_split/train')
val_dir = Path('Data_split/val')
test_dir = Path('Data_split/test')

image_size = (224, 224)  # matches MobileNetV2's default input resolution
generator_seed = 42      # same value as the seed used to build Data_split

# Training images are augmented on the fly; preprocess_input is applied to
# every image so all three splits share the same pixel scaling.
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    rotation_range=30,
    horizontal_flip=True,
    zoom_range=0.2,
    width_shift_range=0.15,
    height_shift_range=0.15,
    brightness_range=[0.8, 1.2],
    shear_range=0.1,
    fill_mode='nearest',
)
# Validation and test images are only preprocessed, never augmented.
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Arguments shared by all three directory iterators.
common_flow_args = dict(target_size=image_size, class_mode='categorical')

train_gen = train_datagen.flow_from_directory(
    train_dir,
    batch_size=32,
    shuffle=True,
    seed=generator_seed,  # makes the shuffling and augmentation repeatable
    **common_flow_args,
)
# shuffle=False keeps the file order fixed for the evaluation splits.
val_gen = val_datagen.flow_from_directory(
    val_dir,
    batch_size=16,
    shuffle=False,
    **common_flow_args,
)
test_gen = test_datagen.flow_from_directory(
    test_dir,
    batch_size=16,
    shuffle=False,
    **common_flow_args,
)

# Each generator derives its label mapping from its own folder, so a class
# folder missing from one split would silently shift the label indices.
assert train_gen.class_indices == val_gen.class_indices == test_gen.class_indices, (
    'train/val/test generators disagree on the class-to-index mapping'
)

print('Train samples:', train_gen.samples)
print('Val samples:', val_gen.samples, 'shuffle:', val_gen.shuffle)
print('Test samples:', test_gen.samples, 'shuffle:', test_gen.shuffle)
print('Classes:', train_gen.class_indices)