<a href="https://colab.research.google.com/github/LeibGit/-DI_Bootcamp/blob/main/mini_project_week6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import zipfile
import os

# Archive to unpack and the folder it is unpacked into.
zip_file_path = "cats_dogs.zip"
destination_path = "data/cats_dogs"

try:
  with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
    zip_ref.extractall(destination_path)
except (OSError, zipfile.BadZipFile) as e:
  # Missing/unreadable archive or a corrupt zip: report it and keep the
  # notebook running. Anything else is unexpected and is left to propagate.
  print(f"An error occurred extracting {zip_file_path!r}: {e}")
else:
  # Only reached when the whole archive was extracted without an exception.
  print("File successfully extracted.")

In [None]:
# Prefilled. Just copy and execute.
import os, math, re, random
from glob import glob
from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Seed NumPy's and TensorFlow's global random generators.
np.random.seed(42)
tf.random.set_seed(42)

# Paths - change if needed.
# Prefer the nested train/train (test/test) folder when it exists, otherwise
# fall back to the single-level folder.
DATA_ROOT = Path("data/cats_dogs")

train_dir = DATA_ROOT / "train" / "train"
if not train_dir.exists():
    train_dir = DATA_ROOT / "train"

test_dir = DATA_ROOT / "test" / "test"
if not test_dir.exists():
    test_dir = DATA_ROOT / "test"

# Image size the generators resize to, mini-batch size, and the seed passed to
# the train/validation split and the shuffling training flow.
IMG_HEIGHT = 180
IMG_WIDTH = 180
batch_size = 32
seed = 1337

# Build DataFrames from folders
def build_df_from_folder(folder: Path, labeled: bool=True):
    """Collect the image files under ``folder`` into a DataFrame.

    ``folder`` is searched recursively for ``.jpg``, ``.jpeg``, ``.png`` and
    ``.bmp`` files. The paths are sorted so the row order (and therefore any
    seeded split made from the result) does not depend on the order in which
    the filesystem happens to list the files.

    Args:
        folder: Root directory to search.
        labeled: When True, add a ``label`` column ("cat" or "dog"). The label
            comes from the parent folder name (cat/cats/dog/dogs) or, failing
            that, from a standalone "cat"/"dog" token in the lower-cased file
            name. Files whose label cannot be determined are skipped.

    Returns:
        DataFrame with a ``filepath`` column, plus ``label`` when ``labeled``
        is True. The columns exist even when no row qualifies.

    Raises:
        FileNotFoundError: If no image file is found under ``folder``.
    """
    exts = ('*.jpg','*.jpeg','*.png','*.bmp')
    files = []
    for ex in exts:
        files.extend(glob(str(folder / '**' / ex), recursive=True))
    if not files:
        raise FileNotFoundError(f"No images found under {folder}")
    # glob order is filesystem-dependent; sort for a reproducible row order.
    files.sort()

    if not labeled:
        return pd.DataFrame({"filepath": files}, columns=["filepath"])

    folder_labels = {"cat": "cat", "cats": "cat", "dog": "dog", "dogs": "dog"}
    rows = []
    for f in files:
        path = Path(f)
        # The parent folder name wins; the file name is only a fallback.
        label = folder_labels.get(path.parent.name.lower())
        if label is None:
            name = path.name.lower()
            # "cat"/"dog" must not be glued to other letters (e.g. "catalog").
            if re.search(r'(^|[^a-z])cat([^a-z]|$)', name): label = "cat"
            elif re.search(r'(^|[^a-z])dog([^a-z]|$)', name): label = "dog"
            else:
                continue
        rows.append({"filepath": f, "label": label})
    # Explicit columns keep the schema intact when every file was skipped.
    return pd.DataFrame(rows, columns=["filepath", "label"])

df_train_full = build_df_from_folder(train_dir, labeled=True)
df_test_full  = build_df_from_folder(test_dir,  labeled=False)

# Train validation split: hold out 20% of the labelled images, stratified on
# the label so both parts keep the same cat/dog ratio.
from sklearn.model_selection import train_test_split
df_tr, df_val = train_test_split(
    df_train_full, test_size=0.2, stratify=df_train_full["label"], random_state=seed
)

# Generators
# Training generator: scale pixels by 1/255 and apply light augmentation.
train_gen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=10,
    zoom_range=0.1,
    horizontal_flip=True,
)
# No augmentation generator. It still rescales: the validation and test
# generators below scale by 1/255, so a model trained from this generator must
# be fed the same pixel range it is evaluated on.
na_train_gen = ImageDataGenerator(rescale=1./255)

val_gen = ImageDataGenerator(rescale=1./255)
test_gen = ImageDataGenerator(rescale=1./255)

train_flow = train_gen.flow_from_dataframe(
    df_tr, x_col="filepath", y_col="label",
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode="binary", batch_size=batch_size,
    shuffle=True, seed=seed, validate_filenames=False
)
# shuffle=False keeps the batch order fixed, so predictions can be compared
# with val_flow.classes position by position.
val_flow = val_gen.flow_from_dataframe(
    df_val, x_col="filepath", y_col="label",
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode="binary", batch_size=batch_size,
    shuffle=False, validate_filenames=False
)
# Unlabeled test for inference only
test_flow = test_gen.flow_from_dataframe(
    df_test_full, x_col="filepath", y_col=None,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode=None, batch_size=batch_size,
    shuffle=False, validate_filenames=False
)

# Sanity check: sample counts per split and the label -> index mapping.
print({"train": train_flow.samples, "val": val_flow.samples, "test": test_flow.samples,
       "class_indices": train_flow.class_indices})

In [None]:
# Prefilled. Just copy and execute.
import os, math, re, random
from glob import glob
from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Seed NumPy's and TensorFlow's global random generators.
np.random.seed(42)
tf.random.set_seed(42)

# Paths - change if needed.
# Prefer the nested train/train (test/test) folder when it exists, otherwise
# fall back to the single-level folder.
DATA_ROOT = Path("data/cats_dogs")

train_dir = DATA_ROOT / "train" / "train"
if not train_dir.exists():
    train_dir = DATA_ROOT / "train"

test_dir = DATA_ROOT / "test" / "test"
if not test_dir.exists():
    test_dir = DATA_ROOT / "test"

# Image size the generators resize to, mini-batch size, and the seed passed to
# the train/validation split and the shuffling training flow.
IMG_HEIGHT = 180
IMG_WIDTH = 180
batch_size = 32
seed = 1337

# Build DataFrames from folders
def build_df_from_folder(folder: Path, labeled: bool=True):
    """Collect the image files under ``folder`` into a DataFrame.

    ``folder`` is searched recursively for ``.jpg``, ``.jpeg``, ``.png`` and
    ``.bmp`` files. The paths are sorted so the row order (and therefore any
    seeded split made from the result) does not depend on the order in which
    the filesystem happens to list the files.

    Args:
        folder: Root directory to search.
        labeled: When True, add a ``label`` column ("cat" or "dog"). The label
            comes from the parent folder name (cat/cats/dog/dogs) or, failing
            that, from a standalone "cat"/"dog" token in the lower-cased file
            name. Files whose label cannot be determined are skipped.

    Returns:
        DataFrame with a ``filepath`` column, plus ``label`` when ``labeled``
        is True. The columns exist even when no row qualifies.

    Raises:
        FileNotFoundError: If no image file is found under ``folder``.
    """
    exts = ('*.jpg','*.jpeg','*.png','*.bmp')
    files = []
    for ex in exts:
        files.extend(glob(str(folder / '**' / ex), recursive=True))
    if not files:
        raise FileNotFoundError(f"No images found under {folder}")
    # glob order is filesystem-dependent; sort for a reproducible row order.
    files.sort()

    if not labeled:
        return pd.DataFrame({"filepath": files}, columns=["filepath"])

    folder_labels = {"cat": "cat", "cats": "cat", "dog": "dog", "dogs": "dog"}
    rows = []
    for f in files:
        path = Path(f)
        # The parent folder name wins; the file name is only a fallback.
        label = folder_labels.get(path.parent.name.lower())
        if label is None:
            name = path.name.lower()
            # "cat"/"dog" must not be glued to other letters (e.g. "catalog").
            if re.search(r'(^|[^a-z])cat([^a-z]|$)', name): label = "cat"
            elif re.search(r'(^|[^a-z])dog([^a-z]|$)', name): label = "dog"
            else:
                continue
        rows.append({"filepath": f, "label": label})
    # Explicit columns keep the schema intact when every file was skipped.
    return pd.DataFrame(rows, columns=["filepath", "label"])

df_train_full = build_df_from_folder(train_dir, labeled=True)
df_test_full  = build_df_from_folder(test_dir,  labeled=False)

# Train validation split: hold out 20% of the labelled images, stratified on
# the label so both parts keep the same cat/dog ratio.
from sklearn.model_selection import train_test_split
df_tr, df_val = train_test_split(
    df_train_full, test_size=0.2, stratify=df_train_full["label"], random_state=seed
)


# No augmentation generator. It still rescales: the validation and test
# generators below scale by 1/255, so the model trained from this generator
# must be fed the same pixel range it is evaluated on.
na_train_gen = ImageDataGenerator(rescale=1./255)

val_gen = ImageDataGenerator(rescale=1./255)
test_gen = ImageDataGenerator(rescale=1./255)

na_train_flow = na_train_gen.flow_from_dataframe(
    df_tr, x_col="filepath", y_col="label",
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode="binary", batch_size=batch_size,
    shuffle=True, seed=seed, validate_filenames=False
)

# shuffle=False keeps the batch order fixed, so predictions can be compared
# with val_flow.classes position by position.
val_flow = val_gen.flow_from_dataframe(
    df_val, x_col="filepath", y_col="label",
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode="binary", batch_size=batch_size,
    shuffle=False, validate_filenames=False
)

# Unlabeled test for inference only
test_flow = test_gen.flow_from_dataframe(
    df_test_full, x_col="filepath", y_col=None,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode=None, batch_size=batch_size,
    shuffle=False, validate_filenames=False
)

# Report the flow built in this cell (na_train_flow) rather than a flow object
# left over from another cell.
print({"train": na_train_flow.samples, "val": val_flow.samples, "test": test_flow.samples, "class_indices": na_train_flow.class_indices})

In [None]:
# Mount Google Drive at /content/drive. This needs the Colab runtime, since
# `google.colab` is imported here.
# NOTE(review): no visible cell reads from /content/drive — confirm the mount
# is still needed.
from google.colab import drive
drive.mount('/content/drive')

To-Do: Describe your planned CNN in full sentences before coding:

Number of convolutional blocks and filter sizes.
1. I am going to use 3 Conv2D layers (blocks): the first with 32 filters and a (3, 3) kernel, and the other two with 64 filters and the same (3, 3) kernel.
Placement of MaxPooling to reduce spatial dimensions.
We will use 2 max-pooling layers, one after the first (32-filter) Conv2D layer and one after the second Conv2D layer. Both will use a (2, 2) pool size.
Use of Dropout and why it helps with regularization.
Dropout should be set at 20%; it is helpful for reducing overfitting and over-reliance on particular neurons.
Final Dense layers including the output layer with a single sigmoid unit for binary targets.
The best option would be to use one neuron with sigmoid activation, since this is a binary classification.

To-Do: Specify and justify

Optimizer choice, recommended Adam for fast convergence.
I am going to go with Adam for faster convergence.
Initial learning rate and why it is reasonable.
I will go with 0.001, since this is the usual default for the Adam optimizer.
Batch size relative to GPU or CPU memory.
I will be using batch sizes of 32 and 64, since these are standard choices.
EarlyStopping on validation loss and optional ReduceLROnPlateau to adapt learning rate.


To-Do: Train for a fixed number of epochs, log training and validation curves, and then repeat with early stopping enabled.

Explain how you detect overfitting from the curves and what change you make to mitigate it.

Why: Curves reveal the bias variance tradeoff. Divergence between train and validation indicates overfitting. Mitigations include stronger augmentation, more dropout, or fewer parameters.

In [None]:
from tensorflow.keras import models, layers
from tensorflow.keras.models import Sequential
import matplotlib.pyplot as plt
import tensorflow as tf
# Baseline CNN: three 3x3 convolutions (32, 64, 64 filters) with 2x2 max
# pooling after the first two, then a 64-unit dense layer and a single sigmoid
# unit for the binary cat/dog target.
# The input size is taken from IMG_HEIGHT/IMG_WIDTH, the same constants the
# generators resize to, so the model and the data cannot drift apart.
# NOTE(review): the written plan mentions 20% dropout; no Dropout layer is
# used here.
model = models.Sequential([
    layers.Input(shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
    layers.Conv2D(32, (3, 3), activation="relu"),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation="relu"),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation="relu"),
    layers.Flatten(),
    layers.Dense(64, activation="relu"),
    layers.Dense(units=1, activation='sigmoid'),
])

# The last layer already applies a sigmoid, hence from_logits=False.
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy']
)

# Train on the augmented flow; validation metrics are computed on val_flow
# after every epoch and stored in `history`.
history = model.fit(
    train_flow,
    epochs=10,
    validation_data=val_flow
)

In [None]:
# plot and evaluate
# Training vs. validation accuracy per epoch, taken from `history`.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='accuracy')
ax.plot(history.history['val_accuracy'], label='val_accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_title('Training vs. validation accuracy')
ax.legend(loc='lower right')
plt.show()

# val_flow is the validation split, so the metrics are reported as validation
# metrics. The test_loss/test_acc names are kept unchanged.
test_loss, test_acc = model.evaluate(val_flow)
print(f"Validation Loss: {test_loss}, Validation Accuracy: {test_acc}")

The change that should be made is possibly not the learning rate. Additionally, the image data generator is changing the images too much. The large variance in accuracy indicates overfitting.

In [None]:
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Sigmoid outputs for the validation images, thresholded at 0.5 into 0/1
# class indices.
val_probs = model.predict(val_flow)
val_preds = (val_probs.ravel() > 0.5).astype(int)
# Integer labels held by the validation flow (created with shuffle=False).
val_true = val_flow.classes

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=val_true, y_pred=val_preds)
disp = ConfusionMatrixDisplay(cm, display_labels=list(train_flow.class_indices))
disp.plot(cmap="Blues")

In [None]:
# No augmentation model
from tensorflow.keras import models, layers
from tensorflow.keras.models import Sequential
import matplotlib.pyplot as plt
import tensorflow as tf
# Same architecture as the augmented run: three 3x3 convolutions (32, 64, 64
# filters) with 2x2 max pooling after the first two, then a 64-unit dense
# layer and a single sigmoid unit.
# The input size is taken from IMG_HEIGHT/IMG_WIDTH, the same constants the
# generators resize to, so the model and the data cannot drift apart.
model_na = models.Sequential([
    layers.Input(shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
    layers.Conv2D(32, (3, 3), activation="relu"),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation="relu"),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation="relu"),
    layers.Flatten(),
    layers.Dense(64, activation="relu"),
    layers.Dense(units=1, activation='sigmoid'),
])

# The last layer already applies a sigmoid, hence from_logits=False.
model_na.compile(
    optimizer="adam",
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy']
)

# Train on the non-augmented flow. NOTE: this rebinds `history`, replacing the
# object returned by the earlier `model.fit` call.
history = model_na.fit(
    na_train_flow,
    epochs=10,
    validation_data=val_flow
)

In [None]:
# plot and evaluate
# Training vs. validation accuracy per epoch, taken from `history`.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='accuracy')
ax.plot(history.history['val_accuracy'], label='val_accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_title('Training vs. validation accuracy (no augmentation)')
ax.legend(loc='lower right')
plt.show()

# val_flow is the validation split, so the metrics are reported as validation
# metrics. The test_loss/test_acc names are kept unchanged.
test_loss, test_acc = model_na.evaluate(val_flow)
print(f"Validation Loss: {test_loss}, Validation Accuracy: {test_acc}")

In [None]:
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix for the no-augmentation model. Predictions must come from
# model_na; predicting with `model` would just redraw the augmented model's
# matrix.
val_probs = model_na.predict(val_flow)
# Threshold the sigmoid outputs at 0.5 to get 0/1 class indices.
val_preds = (val_probs > 0.5).astype(int).ravel()
# Integer labels held by the validation flow (created with shuffle=False).
val_true = val_flow.classes

cm = confusion_matrix(val_true, val_preds)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    # Take the names from the flow that supplied val_true.
    display_labels=list(val_flow.class_indices.keys())
)
disp.plot(cmap="Blues")

Without augmentation, the model overfit the data. This can be seen from how well the model performed on the training data compared with how poorly it performed on the validation data.

In [None]:
from sklearn.utils import class_weight
import numpy as np

# "balanced" class weights computed from the integer labels of the training flow.
class_weights = class_weight.compute_class_weight(
  class_weight='balanced',
  classes=np.unique(train_flow.classes),
  y=train_flow.classes,
)

# Map position in the weight array -> weight.
# NOTE(review): no visible cell passes this dict to fit() — confirm it is used.
class_weight_dict = {
  class_idx: weight for class_idx, weight in enumerate(class_weights)
}
print(class_weight_dict)

In [None]:
# saving model
# Serialise the architecture to a JSON string first, then write both artefacts.
model_config = model.to_json()

# Whole model, saved as a .keras file.
model.save("model.keras")

# Architecture-only JSON written next to it.
with open("model_config.json", "w") as json_file:
  json_file.write(model_config)

To-Do: Propose one of the following and justify the expected benefit

Batch Normalization after convolutions.
Transfer learning with a frozen lightweight backbone like MobileNetV2 and a small classifier head.
Mixed precision training to accelerate on modern GPUs.
Why: These techniques can yield better accuracy or faster training with limited additional code. I would choose mixed precision training on modern GPUs, since this will also help the model run faster.