# Age Classification Model

## 1. Investigating dataset

In [None]:
from datasets import load_dataset
import matplotlib.pyplot as plt
import random

In [None]:
# Load the age-classification dataset; the returned object is indexed by split name (e.g. "train") below.
ds = load_dataset("prithivMLmods/Age-Classification-Set")

In [None]:
# Age-group names in class-id order, read from the "label" feature of the train split.
labels = ds["train"].features["label"].names
# Lookup table: integer class id -> age-group name.
label_mapping = dict(enumerate(labels))
label_mapping

In [None]:
# From here on `ds` refers to the "train" split only.
ds = ds["train"]
# Show the split summary, then its number of rows, one per line.
print(ds, len(ds), sep="\n")

In [None]:
# Peek at the first example: its class id (a key of label_mapping) and its image.
ds[0]["label"], ds[0]["image"]

In [None]:
def print_samples(nrows=2, ncols=3):
    """Show a grid of randomly chosen dataset images, each titled with its age group.

    Reads the module-level `ds` (the dataset split) and `label_mapping`
    (class id -> age-group name).

    Args:
        nrows: Number of rows in the figure grid.
        ncols: Number of columns in the figure grid.
    """
    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, so flatten() always works.
    _, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 7), squeeze=False)
    axes = axes.flatten()
    # Draw all indices at once, without replacement: no image appears twice and
    # no index is drawn only to be thrown away after the last panel.
    chosen = random.sample(range(len(ds)), k=min(len(axes), len(ds)))
    for ax, ind in zip(axes, chosen):
        sample = ds[ind]  # fetch the row once instead of once per field
        ax.imshow(sample['image'])
        ax.set_title(label_mapping[sample['label']])
    # Hide any panels left empty when the dataset is smaller than the grid.
    for ax in axes[len(chosen):]:
        ax.axis("off")
    plt.tight_layout()  # Adjust the layout to prevent titles and labels from overlapping
    plt.show()
print_samples()

### Structure dataset folder for YOLO

Split data into "train", "val", "test"

First, we need to find the dataset indices belonging to each age group.

In [None]:
from tqdm import tqdm

# Maps each age-group name to the list of dataset row indices that carry that label.
indices_by_class = {}

# Read only the "label" column: iterating over whole rows would load every image
# just to look at its label.
for ind, label_id in tqdm(enumerate(ds["label"]), total=len(ds), desc="Detecting indices of each age group"):
    indices_by_class.setdefault(label_mapping[label_id], []).append(ind)

In [None]:
# Report how many samples were collected for each age group.
for cls in indices_by_class:
    print(f"{cls}:   {len(indices_by_class[cls])} samples")

Because the `21-44` group is heavily over-represented, we randomly down-sample it to 4000 images (comparable to the size of the second-largest class in the dataset).

In [None]:
import random
# Fixed seed so the down-sampling (and the later shuffles that use the same RNG) are reproducible.
random.seed(42)

# Maximum number of samples to keep for the over-represented "21-44" group.
num_sample_remain = 4000

# min() prevents random.sample from raising ValueError when the group holds fewer
# than `num_sample_remain` indices (for example on a smaller dataset).
majority_indices = indices_by_class["21-44"]
indices_by_class["21-44"] = random.sample(
    majority_indices, k=min(num_sample_remain, len(majority_indices))
)

In [None]:
# Sanity check: number of indices currently kept for the "21-44" group.
len(indices_by_class["21-44"])

In [None]:
# Fractions of each age group assigned to train and validation; the test split
# receives whatever remains.
TRAIN_RATIO = 0.8
VALIDATION_RATIO = 0.1

# Row indices assigned to each split.
ds_indices = {split_name: [] for split_name in ("train", "val", "test")}

for age, indices in indices_by_class.items():
    print(f"Splitting dataset for {age} group...")

    # Cut points inside this group's index list (sizes are truncated to integers).
    train_end = int(TRAIN_RATIO * len(indices))
    val_end = train_end + int(VALIDATION_RATIO * len(indices))

    # Shuffles in place, so the list stored in indices_by_class is reordered too.
    random.shuffle(indices)
    ds_indices["train"].extend(indices[:train_end])
    ds_indices["val"].extend(indices[train_end:val_end])
    ds_indices["test"].extend(indices[val_end:])

# Mix the age groups within each split (order: train, val, test).
for split_indices in ds_indices.values():
    random.shuffle(split_indices)

In [None]:
import os
from tqdm import tqdm
# Raw string: the backslashes of a Windows path must not be parsed as escape sequences
# ("\D", "\P" and "\A" are invalid escapes and trigger a warning on recent Python versions).
ROOT = r"D:\Documents\Personal Projects\Age_Predictor"
DATA_ROOT = os.path.join(ROOT, "dataset", "age")
os.makedirs(DATA_ROOT, exist_ok=True)

# Export every split as <DATA_ROOT>/<split>/<age group>/<index>_<age group>.png
for split in ['train', 'val', 'test']:
    split_dir = os.path.join(DATA_ROOT, split)
    os.makedirs(split_dir, exist_ok=True)

    for idx in tqdm(ds_indices[split], total=len(ds_indices[split]), desc=f"Processing {split} split..."):
        example = ds[idx]
        pil_image = example['image']
        label = label_mapping[example['label']]

        # Create a directory for this class if it doesn't exist
        class_dir = os.path.join(split_dir, label)
        os.makedirs(class_dir, exist_ok=True)

        # Save this image to the class directory; the dataset index keeps file names unique
        image_filename = f"{idx}_{label}.png"
        image_path = os.path.join(class_dir, image_filename)
        pil_image.save(image_path)

## 2. Set Up Model and Training Configuration

In [None]:
from ultralytics import YOLO

# Load the starting checkpoint by file name.
# NOTE(review): "yolo11n-cls.pt" is presumably the nano classification variant of YOLO11 —
# confirm against the Ultralytics model list.
model = YOLO("yolo11n-cls.pt")

In [None]:
import os
# Raw string: the backslashes of a Windows path must not be parsed as escape sequences
# ("\D", "\P" and "\A" are invalid escapes and trigger a warning on recent Python versions).
ROOT = r"D:\Documents\Personal Projects\Age_Predictor"
# Dataset root handed to the trainer and used to locate the "test" folder later on.
DATA_ROOT = os.path.join(ROOT, "dataset", "age")

In [None]:
# Train the model
# NOTE(review): the run name says "epochs_10" while epochs=50 — confirm which is intended.
# The name is left as is because the weights path used for evaluation contains it.
results = model.train(
    data=DATA_ROOT,
    epochs=50,
    imgsz=64,
    device=0,
    save=True,
    save_period=1,                  # Save a checkpoint every epoch (the interval is given in epochs)
    project="Age_Detection",     # Name of the project directory where training outputs are saved.
    name="v1_epochs_10_imgsz_64",   # Name of the training run.
    dropout=0.1,
    plots=True                      # Generates and saves plots of training, validation metrics, and prediction examples.
)


### Test Performance

In [None]:
# Absolute path to the best.pt checkpoint under project "Age_Detection", run "v1_epochs_10_imgsz_64".
model_path = r"D:\Documents\Personal Projects\Age_Predictor\notebooks\Age_Detection\v1_epochs_10_imgsz_64\weights\best.pt"
model = YOLO(model_path)  # load a custom model

In [None]:
test_dir = os.path.join(DATA_ROOT, "test")
# One sub-folder per age group; the folder name is later compared with the predicted label.
ages = os.listdir(test_dir)

# Run the model on each age-group folder and keep the predictions keyed by that group.
results = {age: model(os.path.join(test_dir, age)) for age in ages}

In [None]:
# Class-index -> class-name mapping carried by the first prediction for the "0-12" folder.
results["0-12"][0].names

In [None]:
from tqdm import tqdm

# Per-group counters: correct top-1 predictions and number of files in the test folder.
corrects = {age: 0 for age in ages}
total = {age: len(os.listdir(os.path.join(test_dir, age))) for age in ages}

# Class-index -> class-name table taken from the model itself, so the lookup no longer
# depends on a hard-coded "0-12" folder existing and being non-empty.
mapping = model.names

for age in ages:
    for result in tqdm(results[age], total=total[age], desc=f"Calculating accuracy for {age} group..."):
        # probs.top1 is the index of the top-ranked class for this image.
        if mapping[result.probs.top1] == age:
            corrects[age] += 1
    if total[age] == 0:
        # An empty class folder would otherwise raise ZeroDivisionError below.
        print(f"{age}: 0/0 - n/a")
        continue
    print(f"{age}: {corrects[age]}/{total[age]} - {corrects[age]/total[age] * 100:.2f}%")
