# Step 1: Import Libraries and Load the Dataset

In [1]:
# Imports

#pathlib: modern, clean way to handle file and folder paths
from pathlib import Path

#shutil: used to copy files from raw folders to the train and test folders
import shutil

import random

# numpy: useful for numerical operations
import numpy as np

import pandas as pd

# matplotlib.pyplot: used for visualizations
import matplotlib.pyplot as plt


import seaborn as sns

from PIL import Image

#tensorflow and keras for deep learning
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator




# Step 2: Data setup

In [2]:
# Define where the original (full-size) dataset lives, then where the reduced
# copy goes: 5,000 training images and 2,000 test images, which is a suitable
# scale for this workshop.

# Original dataset locations
raw_train_dir = Path("data/raw/train/train")
raw_test_dir = Path("data/raw/test/test")

# Locations of the processed / reduced dataset
train_dir = Path("data/train")
test_dir = Path("data/test")


# Create one sub-folder per class ("cats", "dogs") inside each split.
# exist_ok=True makes this safe to re-run.
for split_dir in (train_dir, test_dir):
    for class_folder in ("cats", "dogs"):
        (split_dir / class_folder).mkdir(parents=True, exist_ok=True)


# Randomly take 2,500 cat and 2,500 dog images from the raw dataset to form the reduced training dataset

def copy_random_n_images_by_prefix(src_folder, prefix, dest_folder, class_name=None, n=None):
    """Copy a random sample of ``.jpg`` images whose file names start with ``prefix``.

    Parameters
    ----------
    src_folder : str or Path
        Folder searched (non-recursively) for files matching ``{prefix}*.jpg``.
    prefix : str
        File-name prefix that selects the images, e.g. ``"cat"``.
    dest_folder : str or Path
        Folder that receives the copies; it is created if it does not exist.
    class_name : str, optional
        Label used only in error messages. Defaults to ``prefix``.
    n : int, optional
        Number of images to copy. ``None`` copies every matching image.

    Raises
    ------
    ValueError
        If ``n`` is negative or larger than the number of matching images.
    """
    src_folder = Path(src_folder)
    dest_folder = Path(dest_folder)
    label = prefix if class_name is None else class_name

    # Sort the listing so the selection depends only on the RNG state,
    # not on the order in which the filesystem happens to return files.
    all_files = sorted(src_folder.glob(f"{prefix}*.jpg"))

    if n is None:
        n = len(all_files)
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    if n > len(all_files):
        # Same exception type random.sample would raise, but with enough
        # context to see which folder / prefix is short of images.
        raise ValueError(
            f"Requested {n} '{label}' images but only {len(all_files)} files "
            f"match '{prefix}*.jpg' in {src_folder}"
        )

    dest_folder.mkdir(parents=True, exist_ok=True)
    for f in random.sample(all_files, n):
        shutil.copy(f, dest_folder / f.name)

# Seed Python's global RNG so that re-running this cell picks the same images
# (provided the folder listing order is stable) instead of piling a different
# random subset on top of the previous one. This also affects later uses of
# the `random` module in the notebook.
SEED = 42
random.seed(SEED)

# Copy 2,500 random images per class for training and 1,000 per class for test.
# class_name is passed explicitly to match the helper's signature.
# NOTE(review): the ValueError recorded for this cell suggests that at least one
# source folder holds fewer matching files than requested - presumably the raw
# test folder has no files named cat*/dog*; confirm against the data on disk.
for raw_dir, out_dir, n_per_class in [(raw_train_dir, train_dir, 2500),
                                      (raw_test_dir, test_dir, 1000)]:
    for prefix, class_name in [("cat", "cats"), ("dog", "dogs")]:
        copy_random_n_images_by_prefix(
            src_folder=raw_dir,
            prefix=prefix,
            dest_folder=out_dir / class_name,
            class_name=class_name,
            n=n_per_class,
        )

# Report completion
print("REduced dataset created sucessfully")


ValueError: Sample larger than population or is negative

# Step 3 EDA

In [None]:
# Count how many images are present per class

def count_images(directory):
    """Return a dict mapping "cats" and "dogs" to the number of ``.jpg`` files
    found directly inside ``directory / <class>``."""
    return {
        label: sum(1 for _ in (directory / label).glob("*.jpg"))
        for label in ("cats", "dogs")
    }

# Tally the images in each split of the reduced dataset.
train_counts, test_counts = count_images(train_dir), count_images(test_dir)

print("Training set:", train_counts)
print("Test set:", test_counts)

# Bar chart of the cats-vs-dogs distribution in the training split.
class_labels = list(train_counts)
class_totals = [train_counts[label] for label in class_labels]

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=class_labels, y=class_totals, ax=ax)
ax.set_title("Class Distribution in Training Set")
ax.set_ylabel("Number of Images")
plt.show()

#Show Sample Images from each class

def show_samples(directory, class_name, n=5):
    """Display up to ``n`` randomly chosen ``.jpg`` images of one class in a row.

    Parameters
    ----------
    directory : Path
        Split folder that contains one sub-folder per class.
    class_name : str
        Name of the class sub-folder to sample from, e.g. ``"dogs"``.
    n : int, default 5
        Maximum number of images to show. Fewer are shown when the folder
        holds fewer than ``n`` images.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    files = sorted((directory / class_name).glob("*.jpg"))

    # random.sample raises ValueError when asked for more items than exist,
    # so never request more images than the folder actually holds.
    count = min(n, len(files))
    if count == 0:
        print(f"No images to show for '{class_name}' in {directory}")
        return
    samples = random.sample(files, count)

    # squeeze=False keeps `axes` two-dimensional even when count == 1.
    fig, axes = plt.subplots(1, count, figsize=(15, 3), squeeze=False)
    for ax, img_path in zip(axes[0], samples):
        # Draw inside the context manager so the file is closed afterwards.
        with Image.open(img_path) as img:
            ax.imshow(img)
        ax.axis("off")
    fig.suptitle(f"Sample {class_name.capitalize()} Images")
    plt.show()

# Preview a few random images: dogs from the training split, cats from the test split.
for split_dir, class_label in ((train_dir, "dogs"), (test_dir, "cats")):
    show_samples(split_dir, class_label)


#Analyze image sizes (width x height)