<a href="https://colab.research.google.com/github/Hartiel/Transfer-Learning-Classifier/blob/main/transfer_learning_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import zipfile
import shutil
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from shutil import copyfile
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt

In [2]:
# --- CONFIG AND CONSTANTS ---
# Remote archive that holds the raw images.
DATASET_URL = "https://download.microsoft.com/download/3/e/1/3e1c3f21-ecdb-4869-8368-6deba77b919f/kagglecatsanddogs_5340.zip"
# Local path the archive is downloaded to.
LOCAL_ZIP = "/tmp/cats-and-dogs.zip"
# Root of the training/testing directory tree built by the later cells.
BASE_DIR = "/tmp/cats-v-dogs"

In [3]:
# --- DOWNLOAD AND EXTRACT DATA ---
print(">>> Start dataset downloading")
# TLS certificate verification is deliberately left enabled (no
# --no-check-certificate): the archive is fetched over HTTPS and a spoofed or
# intercepted download should fail instead of being silently accepted.
!wget "{DATASET_URL}" -O "{LOCAL_ZIP}"

print(">>> Extract files...")
# Unpack straight into /tmp; the split step reads the images from
# /tmp/PetImages/Cat/ and /tmp/PetImages/Dog/.
with zipfile.ZipFile(LOCAL_ZIP, 'r') as zip_ref:
    zip_ref.extractall('/tmp')

>>> Start dataset downloading
--2025-11-27 14:02:55--  https://download.microsoft.com/download/3/e/1/3e1c3f21-ecdb-4869-8368-6deba77b919f/kagglecatsanddogs_5340.zip
Resolving download.microsoft.com (download.microsoft.com)... 23.44.74.17, 2600:1407:7400:1184::317f, 2600:1407:7400:1187::317f
Connecting to download.microsoft.com (download.microsoft.com)|23.44.74.17|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 824887076 (787M) [application/octet-stream]
Saving to: ‘/tmp/cats-and-dogs.zip’


2025-11-27 14:03:08 (61.6 MB/s) - ‘/tmp/cats-and-dogs.zip’ saved [824887076/824887076]

>>> Extract files...


In [4]:
# --- DATA CLEANING AND SPLIT ---
# Wipe any tree left by a previous run so stale copies cannot linger.
if os.path.exists(BASE_DIR):
    shutil.rmtree(BASE_DIR)

# Build the <split>/<class>/ layout that the directory-based Keras loaders
# read further down.
for split_name in ('training', 'testing'):
    split_dir = os.path.join(BASE_DIR, split_name)
    for class_name in ('cats', 'dogs'):
        os.makedirs(os.path.join(split_dir, class_name))

# Process the raw images: drop empty (0-byte) files, then split the remainder
# at random to avoid ordering bias.
# NOTE: only the file size is checked; image headers are NOT validated here.
def sanity_check_and_split(source, training_dir, testing_dir, split_size):
    """Copy the non-empty files of ``source`` into a random train/test split.

    Args:
        source: Directory holding the raw files.
        training_dir: Existing directory that receives the training share.
        testing_dir: Existing directory that receives the testing share.
        split_size: Fraction of the valid files sent to training (e.g. 0.9).
            The training count is ``int(len(valid_files) * split_size)``;
            every remaining valid file goes to testing.

    Files of size 0 are reported on stdout and skipped. Entries that are not
    regular files (e.g. sub-directories) are skipped silently.
    """
    valid_files = []

    # Integrity validation. The listing is sorted because os.listdir returns
    # entries in arbitrary order; a stable order lets a seeded RNG reproduce
    # the same split.
    for filename in sorted(os.listdir(source)):
        file_path = os.path.join(source, filename)

        # copyfile cannot copy directories, so only regular files qualify.
        if not os.path.isfile(file_path):
            continue

        if os.path.getsize(file_path) > 0:
            valid_files.append(filename)
        else:
            print(f"Void file ignored: {filename}")

    # Shuffle & split
    training_length = int(len(valid_files) * split_size)
    shuffled_set = random.sample(valid_files, len(valid_files))
    training_set = shuffled_set[:training_length]
    # Slice from the training boundary rather than with a negative length:
    # `shuffled_set[-0:]` is the WHOLE list, which would copy every training
    # file into testing as well when the testing share is empty.
    testing_set = shuffled_set[training_length:]

    # Copy each file into its split directory
    for filename in training_set:
        copyfile(os.path.join(source, filename), os.path.join(training_dir, filename))
    for filename in testing_set:
        copyfile(os.path.join(source, filename), os.path.join(testing_dir, filename))

# Clean pipeline execution
SPLIT_SEED = 42       # fixed seed for the random train/test split
TRAIN_FRACTION = 0.9  # share of valid images copied to training; the rest go to testing

print(">>> Processing and cleaning images (may take time)...")
# sanity_check_and_split draws from the global `random` module, so seeding it
# here makes the split repeatable across runs (given the same directory
# listing order).
random.seed(SPLIT_SEED)
for raw_dir, label in (("/tmp/PetImages/Cat/", "cats"),
                       ("/tmp/PetImages/Dog/", "dogs")):
    sanity_check_and_split(raw_dir,
                           f"{BASE_DIR}/training/{label}/",
                           f"{BASE_DIR}/testing/{label}/",
                           TRAIN_FRACTION)

>>> Processing and cleaning images (may take time)...
Void file ignored: 666.jpg
Void file ignored: 11702.jpg


In [5]:
# --- DATA GENERATORS (Streaming Middleware) ---
IMAGE_SIZE = (160, 160)  # must match the input_shape the MobileNetV2 base is built with
BATCH_SIZE = 32

# Pixel scaling: the ImageNet weights of MobileNetV2 were trained on inputs
# scaled to [-1, 1], which is what mobilenet_v2.preprocess_input produces.
# A plain 1/255 rescale would feed [0, 1] and mismatch the frozen features.
mobilenet_preprocess = tf.keras.applications.mobilenet_v2.preprocess_input
train_datagen = ImageDataGenerator(preprocessing_function=mobilenet_preprocess)
test_datagen = ImageDataGenerator(preprocessing_function=mobilenet_preprocess)

# Batch loaders: stream images from the <split>/<class>/ directories, resizing
# them to IMAGE_SIZE and emitting a single 0/1 label per image.
train_generator = train_datagen.flow_from_directory(
    f'{BASE_DIR}/training/',
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary'
)

validation_generator = test_datagen.flow_from_directory(
    f'{BASE_DIR}/testing/',
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary'
)

Found 22498 images belonging to 2 classes.
Found 2500 images belonging to 2 classes.


In [6]:
# --- TRANSFER LEARNING ARCHITECTURE ---
# Pre-trained convolutional base: ImageNet weights, original classifier removed.
base_model = MobileNetV2(
    weights='imagenet',
    include_top=False,
    input_shape=(160, 160, 3),
)
# Frozen so training only updates the new head and keeps the learned features.
base_model.trainable = False

# Custom classification head stacked on top of the base.
classifier_head = [
    layers.GlobalAveragePooling2D(),        # collapse the spatial dimensions into one vector per image
    layers.Dropout(0.2),                    # regularization against overfitting
    layers.Dense(1, activation='sigmoid'),  # single-unit binary output
]
model = models.Sequential([base_model, *classifier_head])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_160_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# --- MODEL EXECUTION ---
print(">>> Starting model training...")
# fit() runs the training loop and hands back a History object whose
# .history dict holds one value per epoch for every tracked metric.
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=3,
)

# --- METRICS VISUALIZATION ---
# Pull the per-epoch series out of the history dict (analogous to parsing a
# JSON log).
epoch_logs = history.history
acc, val_acc = epoch_logs['accuracy'], epoch_logs['val_accuracy']
loss, val_loss = epoch_logs['loss'], epoch_logs['val_loss']

# Two side-by-side panels: accuracy on the left, loss on the right.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: accuracy (higher is better)
acc_ax.plot(acc, label='Train Accuracy')
acc_ax.plot(val_acc, label='Val Accuracy')
acc_ax.legend(loc='lower right')
acc_ax.set_title('Accuracy Evolution')

# Right panel: loss (lower is better)
loss_ax.plot(loss, label='Train Loss')
loss_ax.plot(val_loss, label='Val Loss')
loss_ax.legend(loc='upper right')
loss_ax.set_title('Loss Evolution')

plt.show()

>>> Starting model training...
Epoch 1/3
[1m428/704[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m3:15[0m 708ms/step - accuracy: 0.9808 - loss: 0.0534



[1m673/704[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m21s[0m 710ms/step - accuracy: 0.9803 - loss: 0.0544