Cell 1: Download the Dataset

In [7]:
# Download the cropped UTKFace archive from its Hugging Face dataset repository.
# wget's -O flag writes the downloaded content to the given local file name,
# here 'UTKFace.tar.gz'.
!wget https://huggingface.co/datasets/py97/UTKFace-Cropped/resolve/main/UTKFace.tar.gz -O UTKFace.tar.gz

--2025-11-02 06:42:12--  https://huggingface.co/datasets/py97/UTKFace-Cropped/resolve/main/UTKFace.tar.gz
Resolving huggingface.co (huggingface.co)... 18.239.50.16, 18.239.50.49, 18.239.50.103, ...
Connecting to huggingface.co (huggingface.co)|18.239.50.16|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cas-bridge.xethub.hf.co/xet-bridge-us/670e8fda9411f204a9dfd7d7/8eecbac21553bc27c16480142348a98bd9118cf412b786c211afaa5e886c52cf?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20251102%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251102T064212Z&X-Amz-Expires=3600&X-Amz-Signature=c2b3176305628a707ea50958b91586ada3cd533a1554826a4280abb85e4ce0e8&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27UTKFace.tar.gz%3B+filename%3D%22UTKFace.tar.gz%22%3B&response-content-type=application%2Fgzip&x-id=GetObject&Expires=1762069332&Policy=eyJTdGF0ZW1lbnQiOlt7Ik

Cell 2: Extract the Dataset

In [8]:
# Unpack the downloaded archive (-x: extract, -f: read from the named file).
# On Colab the extracted 'UTKFace/' folder appears in the file browser
# (left-hand side) once this finishes.
!tar -xf UTKFace.tar.gz

Cell 3: Import All Necessary Libraries

In [9]:
import tensorflow as tf
from tensorflow.keras.layers import Input, GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.applications import MobileNetV2
import numpy as np
import pandas as pd
import os
import cv2
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

Cell 4: Define Constants and Hyperparameters

In [10]:
# --- 1. Define Constants ---
# Directory that is scanned for the dataset's .jpg files.
DATASET_PATH = 'UTKFace'
# Side length in pixels: every image is resized to IMAGE_SIZE x IMAGE_SIZE and
# the MobileNetV2 base is built with the matching input shape.
IMAGE_SIZE = 224
# Default batch size used when building the tf.data pipelines.
BATCH_SIZE = 64
# Number of epochs passed to model.fit().
EPOCHS = 50

Cell 5: Load and Parse Image Paths

In [11]:
# --- 2. Load and Preprocess Image Paths ---
# The filenames are like: [age]_[gender]_[race]_[date].jpg
# gender: 0 = Male, 1 = Female
#
# Produces three parallel lists (image_paths, ages, genders) with one entry per
# accepted image, plus skipped_files with the names that were rejected.

image_paths = []
ages = []
genders = []
skipped_files = []  # .jpg files whose labels were unparseable or out of range

print("Loading dataset...")
# os.listdir() returns entries in arbitrary order. Sorting makes the list (and
# therefore the seeded train/validation split built from it) reproducible.
for filename in sorted(os.listdir(DATASET_PATH)):
    if not filename.endswith('.jpg'):
        continue

    parts = filename.split('_')
    try:
        age = int(parts[0])
        gender = int(parts[1])
    except (ValueError, IndexError):
        # Name does not follow the [age]_[gender]_... pattern.
        skipped_files.append(filename)
        continue

    # Keep only plausible ages and the two gender codes the binary
    # (sigmoid + binary cross-entropy) head can represent.
    if 0 < age < 117 and gender in (0, 1):
        image_paths.append(os.path.join(DATASET_PATH, filename))
        ages.append(age)
        genders.append(gender)
    else:
        skipped_files.append(filename)

if not image_paths:
    raise FileNotFoundError(f"No images found in {DATASET_PATH}. "
                           "Did you download and extract the dataset correctly?")

if skipped_files:
    # Report rather than silently drop, so label problems are visible.
    print(f"Skipped {len(skipped_files)} file(s) with unparseable or out-of-range labels")
print(f"Total images loaded: {len(image_paths)}")

Loading dataset...
Total images loaded: 23708


Cell 6: Create DataFrame and Split Data

In [12]:
# Gather the three parallel label lists into one table, one row per image.
records = {'image_path': image_paths, 'age': ages, 'gender': genders}
df = pd.DataFrame(records)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable for a given row order.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

for split_name, split_df in (("Training", train_df), ("Validation", val_df)):
    print(f"{split_name} samples: {len(split_df)}")

Training samples: 18966
Validation samples: 4742


Cell 7: Define tf.data Preprocessing Function

In [13]:
# --- 3. Create tf.data Pipelines ---

def load_and_preprocess(image_path, age, gender):
    """Turn one (path, age, gender) record into a model-ready training example.

    The file at `image_path` is read and decoded as a 3-channel JPEG, resized
    to IMAGE_SIZE x IMAGE_SIZE and passed through MobileNetV2's
    `preprocess_input`. Both labels are cast to float32.

    Returns:
        A tuple `(image, labels)` where `labels` is a dict keyed by
        'age_output' and 'gender_output', the names given to the model's two
        output layers.
    """
    raw_bytes = tf.io.read_file(image_path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)

    # Bring every image to the input size the MobileNetV2 base was built with.
    pixels = tf.image.resize(pixels, [IMAGE_SIZE, IMAGE_SIZE])
    pixels = tf.keras.applications.mobilenet_v2.preprocess_input(pixels)

    # One entry per model output, keyed by output layer name.
    targets = {
        'age_output': tf.cast(age, tf.float32),
        'gender_output': tf.cast(gender, tf.float32),
    }
    return pixels, targets

Cell 8: Create tf.data Datasets

In [14]:
def create_dataset(df, batch_size=BATCH_SIZE, shuffle=True):
    """Build a batched, prefetched tf.data pipeline from a labelled DataFrame.

    Args:
        df: DataFrame with 'image_path', 'age' and 'gender' columns.
        batch_size: Number of examples per batch.
        shuffle: When True, shuffle the records with a buffer as large as the
            whole frame before the images are loaded.

    Returns:
        A tf.data.Dataset yielding the batched output of load_and_preprocess.
    """
    columns = (df['image_path'].values, df['age'].values, df['gender'].values)
    pipeline = tf.data.Dataset.from_tensor_slices(columns)

    # Shuffling happens on the lightweight (path, label) records, before any
    # image is read from disk.
    if shuffle:
        pipeline = pipeline.shuffle(buffer_size=len(df))

    return (
        pipeline
        .map(load_and_preprocess, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(batch_size)
        .prefetch(buffer_size=tf.data.AUTOTUNE)
    )


# Training data is shuffled; validation data keeps the row order of val_df so
# that evaluation is consistent from run to run.
train_ds = create_dataset(train_df, shuffle=True)
val_ds = create_dataset(val_df, shuffle=False)

Cell 9: Build the Multi-Output Model

In [15]:
# --- 4. Build the "Multi-Output" Model ---
print("Building model...")


def _dense_head(features, output_activation, output_name):
    """Stack Dense(128, relu) -> Dense(64, relu) -> Dense(1) on `features`.

    The final layer uses `output_activation` and is named `output_name`, which
    is the key the losses, metrics and label dicts refer to.
    """
    hidden = Dense(128, activation='relu')(features)
    hidden = Dense(64, activation='relu')(hidden)
    return Dense(1, activation=output_activation, name=output_name)(hidden)


# ImageNet-pretrained MobileNetV2 without its 1000-class classification top.
base_model = MobileNetV2(
    weights='imagenet',
    include_top=False,
    input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
)

# The base is frozen: only the layers added below are trained.
base_model.trainable = False

inputs = Input(shape=(IMAGE_SIZE, IMAGE_SIZE, 3))

# Shared trunk: frozen base (called with training=False), global average
# pooling, then dropout for regularization.
features = base_model(inputs, training=False)
features = GlobalAveragePooling2D()(features)
features = Dropout(0.5)(features)

# Head 1: age regression (linear output).
age_output = _dense_head(features, 'linear', 'age_output')

# Head 2: gender binary classification (sigmoid output).
gender_output = _dense_head(features, 'sigmoid', 'gender_output')

# One model, two outputs, in the order [age, gender].
model = Model(inputs=inputs, outputs=[age_output, gender_output])

Building model...
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


Cell 10: Compile the Model

In [16]:
# --- 5. Compile the Model ---
# Each head gets its own loss and metric, keyed by the output layer's name.
head_losses = {
    'age_output': 'mean_absolute_error',    # regression head
    'gender_output': 'binary_crossentropy'  # binary classification head
}
head_metrics = {
    'age_output': 'mae',
    'gender_output': 'accuracy'
}

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=head_losses,
    metrics=head_metrics,
)

model.summary()

Cell 11: Train the Model

In [17]:
# --- 6. Train the Model ---
print("Starting training...")

# Validate on val_ds after every epoch; the returned History object holds the
# per-epoch metrics that the plotting cell reads.
fit_options = {'validation_data': val_ds, 'epochs': EPOCHS}
history = model.fit(train_ds, **fit_options)

Starting training...
Epoch 1/50
[1m297/297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 178ms/step - age_output_loss: 14.0927 - age_output_mae: 14.0928 - gender_output_accuracy: 0.7515 - gender_output_loss: 0.5016 - loss: 14.5944 - val_age_output_loss: 9.1114 - val_age_output_mae: 9.0070 - val_gender_output_accuracy: 0.8205 - val_gender_output_loss: 0.3894 - val_loss: 9.3961
Epoch 2/50
[1m297/297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 81ms/step - age_output_loss: 9.6013 - age_output_mae: 9.6013 - gender_output_accuracy: 0.8051 - gender_output_loss: 0.4118 - loss: 10.0131 - val_age_output_loss: 8.9845 - val_age_output_mae: 8.8923 - val_gender_output_accuracy: 0.8178 - val_gender_output_loss: 0.3890 - val_loss: 9.2826
Epoch 3/50
[1m297/297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 77ms/step - age_output_loss: 9.1940 - age_output_mae: 9.1941 - gender_output_accuracy: 0.8096 - gender_output_loss: 0.4032 - loss: 9.5973 - val_age_output_loss: 8.3438 

Cell 12: Plot Training History (Loss & Accuracy)

In [None]:
# --- 7. NEW: Plot Accuracy and Loss Curves ---
print("Plotting training history...")
history_dict = history.history

# One row per panel of the 2x2 grid, in drawing order:
# (title, y-axis label, train key, train legend, validation key, validation legend)
panels = [
    ('Age MAE (Regression Loss)', 'MAE',
     'age_output_mae', 'Train Age MAE',
     'val_age_output_mae', 'Val Age MAE'),
    ('Gender Accuracy (Classification)', 'Accuracy',
     'gender_output_accuracy', 'Train Gender Accuracy',
     'val_gender_output_accuracy', 'Val Gender Accuracy'),
    ('Gender Loss (Binary Crossentropy)', 'Loss',
     'gender_output_loss', 'Train Gender Loss',
     'val_gender_output_loss', 'Val Gender Loss'),
    ('Total Model Loss', 'Loss',
     'loss', 'Total Train Loss',
     'val_loss', 'Total Val Loss'),
]

plt.figure(figsize=(14, 10))

for position, (title, y_label, train_key, train_label, val_key, val_label) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.plot(history_dict[train_key], label=train_label)
    plt.plot(history_dict[val_key], label=val_label)
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()

plt.tight_layout()
plt.show()

Plotting training history...


Cell 13: Get True Labels and Predictions for Evaluation

In [None]:
# --- 8. NEW: Classification Report and Confusion Matrix (for Gender) ---
print("Generating classification report and confusion matrix for Gender...")

# val_ds is built from val_df without shuffling, so the i-th prediction
# corresponds to the i-th row of val_df. Reading the labels from the frame
# avoids a second full pass over val_ds, which would read, decode and resize
# every validation image only to recover labels we already have.
y_true_gender = val_df['gender'].to_numpy().astype(int)

# Get predictions from the model on the validation set
# model.predict() returns a list: [age_preds, gender_preds]
predictions = model.predict(val_ds)

# Flatten (N, 1) -> (N,). reshape(-1) keeps a 1-D array even when N == 1,
# where squeeze() would collapse to a 0-d scalar.
y_pred_gender_probs = np.asarray(predictions[1]).reshape(-1)

# Guard against labels and predictions drifting out of step (for example if
# val_ds was rebuilt from a different frame than val_df).
if len(y_pred_gender_probs) != len(y_true_gender):
    raise ValueError(
        f"Got {len(y_pred_gender_probs)} predictions for "
        f"{len(y_true_gender)} validation labels"
    )

# Convert probabilities to binary classes (0 or 1)
y_pred_gender = (y_pred_gender_probs > 0.5).astype(int)

Cell 14: Print Classification Report & Plot Confusion Matrix

In [None]:
# Display names for the two gender codes, in label order (0, then 1).
target_names = ['Male (0)', 'Female (1)']

# --- Classification Report ---
print("\nGender Classification Report:")
gender_report = classification_report(
    y_true_gender, y_pred_gender, target_names=target_names
)
print(gender_report)

# --- Confusion Matrix ---
print("\nGender Confusion Matrix:")
cm = confusion_matrix(y_true_gender, y_pred_gender)
print(cm)

# Draw the same matrix as an annotated heatmap: rows are the true labels,
# columns the predicted labels.
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_names,
    yticklabels=target_names,
    ax=ax,
)
ax.set_title('Gender Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

Cell 15: Save the Trained Model

In [None]:
# --- 9. Save Your Custom Model ---
# Persist the trained two-headed model to a single HDF5 file in the working
# directory.
MODEL_FILENAME = 'age_gender_model.h5'
model.save(MODEL_FILENAME)
print(f"Model saved as {MODEL_FILENAME}")