In [1]:
import os, math, random, glob
from pathlib import Path
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, mean_absolute_error, mean_squared_error
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
print('TensorFlow version:', tf.__version__)

TensorFlow version: 2.19.0


In [2]:
# ==============================
# 📌 STEP 1: Upload kaggle.json
# ==============================
from google.colab import files
import os

print("📂 Please upload your kaggle.json (Kaggle > Account > Create API Token)")
uploaded = files.upload()

os.makedirs("/root/.kaggle", exist_ok=True)
for fn in uploaded.keys():
    os.rename(fn, "/root/.kaggle/kaggle.json")

os.chmod("/root/.kaggle/kaggle.json", 0o600)


📂 Please upload your kaggle.json (Kaggle > Account > Create API Token)


Saving kaggle.json to kaggle.json


In [3]:
# ==============================
# 📌 STEP 2: Download UTKFace dataset from Kaggle
# ==============================
!pip install -q kaggle
print("⬇️ Downloading UTKFace dataset...")
!kaggle datasets download -d jangedoo/utkface-new -p /content


⬇️ Downloading UTKFace dataset...
Dataset URL: https://www.kaggle.com/datasets/jangedoo/utkface-new
License(s): copyright-authors
Downloading utkface-new.zip to /content
 97% 320M/331M [00:00<00:00, 475MB/s]
100% 331M/331M [00:00<00:00, 548MB/s]


In [16]:
# ==============================
# 📌 STEP 3: Unzip dataset
# ==============================
import zipfile, os

zip_path = "/content/utkface-new.zip"
extract_path = "/content/UTKFace"  # ✅ No double UTKFace folder

print("📦 Extracting dataset...")
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
print(f"✅ Done! Extracted to {extract_path}")


📦 Extracting dataset...
✅ Done! Extracted to /content/UTKFace


In [17]:
# ==============================
# 📌 STEP 4: Parse filenames
# ==============================
import glob
import pandas as pd

def parse_utk_filename(filepath):
    filename = os.path.basename(filepath)
    try:
        age, gender, *_ = filename.split('_')
        return int(age), int(gender)
    except:
        raise ValueError("Filename not in expected format")

DATA_DIR = extract_path
image_paths = sorted(
    glob.glob(os.path.join(DATA_DIR, '*.jpg')) +
    glob.glob(os.path.join(DATA_DIR, '*.png'))
)

rows = []
if not image_paths:
    print(f"❌ No image files found in {DATA_DIR}. Please check the extraction path and file types.")
else:
    for p in image_paths:
        try:
            age, gender = parse_utk_filename(p)
            rows.append({'path': p, 'age': age, 'gender': gender})
        except Exception:
            continue

df = pd.DataFrame(rows)

if len(df) == 0 and image_paths:
    print(f"❌ Found {len(image_paths)} image files, but none matched the expected filename format for parsing age and gender.")
elif len(df) > 0:
    print(f"✅ Found {len(df)} labeled images. Age range: {df['age'].min()} - {df['age'].max()}")
    display(df.sample(5))

✅ Found 23708 labeled images. Age range: 1 - 116


Unnamed: 0,path,age,gender
9190,/content/UTKFace/28_1_1_20170115235013113.jpg....,28,1
8730,/content/UTKFace/28_0_1_20170113161237182.jpg....,28,0
20781,/content/UTKFace/61_1_0_20170117175120965.jpg....,61,1
6505,/content/UTKFace/26_0_3_20170119180508068.jpg....,26,0
8799,/content/UTKFace/28_0_1_20170117020752716.jpg....,28,0


In [18]:
train_df, test_df = None, None
if 'df' in globals():
    train_df, test_df = train_test_split(df, test_size=0.15, random_state=42, stratify=df['gender'])
    train_df, val_df  = train_test_split(train_df, test_size=0.15, random_state=42, stratify=train_df['gender'])
    print('Train:', len(train_df), 'Val:', len(val_df), 'Test:', len(test_df))
else:
    print('df not found; make sure previous cell ran and DATA_DIR contains images')

Train: 17128 Val: 3023 Test: 3557


In [19]:
IMG_SIZE = (128,128)
BATCH_SIZE = 32
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array

def df_to_generator(df, augment=False):
    datagen_kwargs = dict(rescale=1./255)
    if augment:
        datagen = ImageDataGenerator(rotation_range=15, width_shift_range=0.1, height_shift_range=0.1, horizontal_flip=True, **datagen_kwargs)
    else:
        datagen = ImageDataGenerator(**datagen_kwargs)
    def generator():
        paths = df['path'].values.copy()
        ages = df['age'].values.astype('float32')
        genders = df['gender'].values.astype('int32')
        idxs = np.arange(len(paths))
        while True:
            np.random.shuffle(idxs)
            for start in range(0, len(paths), BATCH_SIZE):
                batch_idx = idxs[start:start+BATCH_SIZE]
                batch_imgs = []
                batch_ages = []
                batch_genders = []
                for i in batch_idx:
                    img = load_img(paths[i], target_size=IMG_SIZE)
                    arr = img_to_array(img)
                    arr = arr.astype('float32')/255.0
                    if augment:
                        arr = datagen.random_transform(arr)
                    batch_imgs.append(arr)
                    batch_ages.append(ages[i])
                    batch_genders.append(genders[i])
                X = np.stack(batch_imgs, axis=0)
                y_age = np.array(batch_ages).astype('float32')
                y_gender = np.array(batch_genders).astype('int32')
                yield X, {'age_out': y_age/100.0, 'gender_out': y_gender}
    return generator()

if 'train_df' in globals() and train_df is not None:
    train_gen = df_to_generator(train_df, augment=True)
    val_gen = df_to_generator(val_df, augment=False)
    steps_per_epoch = max(1, len(train_df)//BATCH_SIZE)
    validation_steps = max(1, len(val_df)//BATCH_SIZE)
    print('Steps per epoch:', steps_per_epoch, 'Validation steps:', validation_steps)
else:
    print('Train/Val data not ready')

Steps per epoch: 535 Validation steps: 94


In [20]:
def build_model(input_shape=(*IMG_SIZE,3)):
    inp = layers.Input(shape=input_shape)
    x = layers.Conv2D(32,3,activation='relu',padding='same')(inp)
    x = layers.MaxPool2D(2)(x)
    x = layers.Conv2D(64,3,activation='relu',padding='same')(x)
    x = layers.MaxPool2D(2)(x)
    x = layers.Conv2D(128,3,activation='relu',padding='same')(x)
    x = layers.MaxPool2D(2)(x)
    x = layers.Flatten()(x)
    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dropout(0.4)(x)

    age = layers.Dense(64, activation='relu')(x)
    age = layers.Dense(1, name='age_out')(age)

    gender = layers.Dense(64, activation='relu')(x)
    gender = layers.Dense(1, activation='sigmoid', name='gender_out')(gender)

    model = models.Model(inputs=inp, outputs=[age, gender])
    return model

model = build_model()
model.compile(optimizer=keras.optimizers.Adam(1e-4),
              loss={'age_out': 'mse', 'gender_out': 'binary_crossentropy'},
              loss_weights={'age_out': 1.0, 'gender_out': 1.0},
              metrics={'age_out': [keras.metrics.RootMeanSquaredError()], 'gender_out': ['accuracy']})
model.summary()

In [21]:
EPOCHS = 4
if 'train_gen' in globals():
    history = model.fit(train_gen, steps_per_epoch=steps_per_epoch, validation_data=val_gen, validation_steps=validation_steps, epochs=EPOCHS)
else:
    print('train_gen not available — run earlier cells and ensure DATA_DIR has images')

Epoch 1/4
[1m535/535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 179ms/step - age_out_loss: 0.0483 - age_out_root_mean_squared_error: 0.2192 - gender_out_accuracy: 0.6389 - gender_out_loss: 0.6328 - loss: 0.6811 - val_age_out_loss: 0.0290 - val_age_out_root_mean_squared_error: 0.1702 - val_gender_out_accuracy: 0.7879 - val_gender_out_loss: 0.4642 - val_loss: 0.4932
Epoch 2/4
[1m535/535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 175ms/step - age_out_loss: 0.0353 - age_out_root_mean_squared_error: 0.1880 - gender_out_accuracy: 0.7731 - gender_out_loss: 0.4826 - loss: 0.5195 - val_age_out_loss: 0.0255 - val_age_out_root_mean_squared_error: 0.1597 - val_gender_out_accuracy: 0.8148 - val_gender_out_loss: 0.4039 - val_loss: 0.4294
Epoch 3/4
[1m535/535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 177ms/step - age_out_loss: 0.0325 - age_out_root_mean_squared_error: 0.1799 - gender_out_accuracy: 0.7975 - gender_out_loss: 0.4373 - loss: 0.4691 - val_age_out_l

In [23]:
def df_to_arrays(df):
    Xs = []
    ages = []
    genders = []
    for p,a,g in zip(df['path'], df['age'], df['gender']):
        img = load_img(p, target_size=IMG_SIZE)
        arr = img_to_array(img).astype('float32')/255.0
        Xs.append(arr)
        ages.append(a)
        genders.append(g)
    X = np.stack(Xs, axis=0)
    ages = np.array(ages).astype('float32')
    genders = np.array(genders).astype('int32')
    return X, ages, genders

if 'test_df' in globals() and test_df is not None:
    X_test, ages_test, genders_test = df_to_arrays(test_df)
    pred_age_scaled, pred_gender_prob = model.predict(X_test, batch_size=32)
    pred_age = (pred_age_scaled.flatten()*100.0).clip(0,100)
    pred_gender = (pred_gender_prob.flatten() >= 0.5).astype(int)

    mae_age = mean_absolute_error(ages_test, pred_age)
    mse_age = mean_squared_error(ages_test, pred_age) # Calculate MSE
    rmse_age = np.sqrt(mse_age) # Calculate RMSE by taking the square root of MSE
    acc_gender = accuracy_score(genders_test, pred_gender)
    prec_gender = precision_score(genders_test, pred_gender, zero_division=0)
    rec_gender = recall_score(genders_test, pred_gender, zero_division=0)
    f1_gender = f1_score(genders_test, pred_gender, zero_division=0)
    try:
        auc_gender = roc_auc_score(genders_test, pred_gender_prob.flatten())
    except Exception:
        auc_gender = None

    print(f"Age MAE: {mae_age:.3f} years   RMSE: {rmse_age:.3f} years")
    print(f"Gender Acc: {acc_gender:.3f}  Prec: {prec_gender:.3f}  Rec: {rec_gender:.3f}  F1: {f1_gender:.3f}  AUC: {auc_gender}")
    cm = confusion_matrix(genders_test, pred_gender)
    print('Confusion matrix (rows=true, cols=pred):\n', cm)
else:
    print('test_df not ready — run earlier cells')

[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
Age MAE: 11.239 years   RMSE: 14.815 years
Gender Acc: 0.835  Prec: 0.772  Rec: 0.929  F1: 0.843  AUC: 0.9337151703963338
Confusion matrix (rows=true, cols=pred):
 [[1392  467]
 [ 121 1577]]


In [24]:
try:
    from ipywidgets import FileUpload, VBox, HBox, Button, Output, Image as WImage
    from IPython.display import display, clear_output
    widget_available = True
except Exception:
    widget_available = False

def preprocess_image_path(path):
    img = load_img(path, target_size=IMG_SIZE)
    arr = img_to_array(img).astype('float32')/255.0
    return np.expand_dims(arr, axis=0)

def predict_on_image(path):
    x = preprocess_image_path(path)
    pa_scaled, pg_prob = model.predict(x)
    age_pred = (pa_scaled.flatten()[0]*100.0)
    gender_pred = int(pg_prob.flatten()[0] >= 0.5)
    print(f"Image: {path}\n Predicted age: {age_pred:.1f}  Predicted gender: {gender_pred} (0=male,1=female)\n")
    return age_pred, gender_pred, pg_prob.flatten()[0]

if widget_available:
    upload = FileUpload(accept='image/*', multiple=True)
    out = Output()
    btn = Button(description='Predict uploaded images')
    def on_click(b):
        with out:
            clear_output()
            for name, fileinfo in upload.value.items():
                path = '/tmp/'+name
                with open(path, 'wb') as f:
                    f.write(fileinfo['content'])
                display(WImage(value=fileinfo['content'], format='png'))
                predict_on_image(path)
    btn.on_click(on_click)
    display(VBox([upload, btn, out]))
else:
    print('Upload widget not available. Call `predict_on_image(path)` for inference.')

VBox(children=(FileUpload(value={}, accept='image/*', description='Upload', multiple=True), Button(description…

In [28]:
# Instead of relying on model.evaluate() for accuracy/MAE:
pred_age_scaled, pred_gender_prob = model.predict(X_val, verbose=0)

# Gender predictions
pred_gender_labels = (pred_gender_prob.flatten() >= 0.5).astype(int)
gender_acc = (pred_gender_labels == y_gender_val).mean() * 100

# Age predictions (unscaled)
pred_age = (pred_age_scaled.flatten() * 100).clip(0,100)
mae_age = mean_absolute_error(y_age_val, pred_age)

print(f"Gender Accuracy: {gender_acc:.2f}%")
print(f"Age MAE: {mae_age:.2f} years")


Gender Accuracy: 83.33%
Age MAE: 11.34 years
