Motivation: Why Demographic Analysis Matters

Explanation:
This notebook analyzes the demographic composition of the CLIPScore-Light dataset to identify potential representation biases related to gender, age, and skin tone. Such biases can directly affect the fairness and robustness of vision–language models trained on web-scale data.

Environment Setup and Configuration

Explanation:
Load required libraries and define paths for the CLIPScore-Light dataset.

In [None]:
import os
import json
import time
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm


Config paths

In [None]:
# Input table read with pd.read_csv; a later cell asserts it has an `image_path` column.
IN_CSV = "data/laion_big_light_tau1_0.2989.csv"
# Output table: the input rows plus face_detected / gender / age / race.
# It is also the file that is re-read when RESUME is enabled.
OUT_CSV = "data/laion_big_light_with_demographics.csv"
# Destination of the small JSON report written by the last cell.
SUMMARY_JSON = "data/week5_demographics_summary.json"

# Number of processed rows between two intermediate saves of OUT_CSV.
# NOTE(review): the inference cell assigns CHECKPOINT_EVERY again, so that
# later assignment is the one in effect while the loop runs.
CHECKPOINT_EVERY = 200
# When True and OUT_CSV already exists, continue from it instead of IN_CSV.
RESUME = True

# Attributes requested from DeepFace.analyze.
ACTIONS = ["gender", "age", "race"]
ENFORCE_DETECTION = False  # important to not crash when no face


Load CSV + init columns + resume

In [None]:
# Resolve both CSV locations and make sure the output folder exists
# before anything is read or written.
out_path = Path(OUT_CSV)
in_path = Path(IN_CSV)
out_path.parent.mkdir(parents=True, exist_ok=True)

# Continue from a previous run when allowed and a partial output is present;
# otherwise start from the raw input table.
resuming = RESUME and out_path.exists()
if resuming:
    print("üîÅ Resuming from:", out_path)
df = pd.read_csv(out_path if resuming else in_path)

# Add the result columns that are not there yet, filled with missing values.
absent_cols = [
    name
    for name in ("face_detected", "gender", "age", "race")
    if name not in df.columns
]
for name in absent_cols:
    df[name] = pd.NA

print("Rows:", len(df))
print("Columns:", df.columns.tolist())


Verify image paths

In [None]:
# Fail fast: every later cell reads df["image_path"].
assert "image_path" in df.columns, "CSV must contain image_path column"

# Build the "file is on disk" mask once. Each os.path.exists call hits the
# filesystem, so the same mask is reused for the rate and for the preview.
exists_mask = (
    df["image_path"]
    .apply(lambda p: isinstance(p, str) and os.path.exists(p))
    .astype(bool)
)
exists_rate = exists_mask.mean() * 100
print(f"Exists rate: {exists_rate:.2f}%")

# show one image
from PIL import Image

sample_path = None
sample_image = None
if exists_mask.any():
    sample_path = df.loc[exists_mask, "image_path"].iloc[0]
    # convert() returns a new in-memory image, so the file handle can be
    # released as soon as the with-block ends.
    with Image.open(sample_path) as opened:
        sample_image = opened.convert("RGB")
else:
    # Without this guard .iloc[0] raises IndexError when no file is present.
    print("No image listed in image_path exists on disk; nothing to preview.")

# Last expression of the cell: rendered by the notebook (None renders nothing).
sample_image


Install/Import DeepFace fix (tf-keras)

Run this once if you had the error “requires tf-keras”.

In [5]:
import sys
# Install/upgrade tf-keras through pip, invoked with sys.executable (the
# interpreter running this kernel) so the package lands in the environment
# the notebook actually uses.
!{sys.executable} -m pip install -U tf-keras




Import DeepFace (after tf-keras)

In [6]:
from deepface import DeepFace





Safe analyze wrapper (anti-crash)

In [7]:
def safe_analyze(img_path: str):
    """Run DeepFace demographic analysis on one image without ever raising.

    Parameters
    ----------
    img_path : str
        Path of the image handed to ``DeepFace.analyze``.

    Returns
    -------
    dict
        Keys ``face_detected`` (bool), ``gender``, ``age`` and ``race``.
        The three predictions are ``None`` when the analysis fails or
        returns nothing.

    Notes
    -----
    Only the first entry is used when DeepFace returns a list of results.

    When face detection is not enforced, DeepFace still returns predictions
    for an image in which it found no face, so "a prediction exists" is not
    evidence of a face. If the result carries a ``face_confidence`` value,
    ``face_detected`` is True only when that confidence is strictly positive.
    NOTE(review): DeepFace is understood to report a confidence of 0 for the
    whole-image fallback - confirm against the installed version.
    Results without ``face_confidence`` keep the previous heuristic.

    The predictions themselves are returned unchanged; callers that only
    want real faces should filter on ``face_detected``.
    """
    try:
        res = DeepFace.analyze(
            img_path=img_path,
            actions=ACTIONS,
            enforce_detection=ENFORCE_DETECTION,
        )

        # A list (one dict per face) is reduced to its first entry.
        if isinstance(res, list):
            res = res[0] if len(res) > 0 else {}

        gender = res.get("dominant_gender")
        age = res.get("age")
        race = res.get("dominant_race")

        has_prediction = (gender is not None) or (age is not None) or (race is not None)

        confidence = res.get("face_confidence")
        if confidence is None:
            # No detector confidence available: fall back to the old rule.
            face_detected = has_prediction
        else:
            face_detected = has_prediction and float(confidence) > 0

        return {
            "face_detected": bool(face_detected),
            "gender": gender,
            "age": age,
            "race": race,
        }

    except Exception:
        # Deliberate best effort: one unreadable image must not stop a run
        # over the whole dataset, so any failure is recorded as "no face".
        return {
            "face_detected": False,
            "gender": None,
            "age": None,
            "race": None,
        }


Determine remaining rows to process

In [8]:
# A row still needs analysis while gender, age and race are all missing.
need_mask = df[["gender", "age", "race"]].isna().all(axis=1)
to_process = df.index[need_mask].tolist()
print("Remaining rows:", len(to_process))


Remaining rows: 104251


In [None]:
# Debug cap on the number of rows kept (try 2000 / 5000 / 10000 for test runs).
# NOTE(review): the chunked inference cell iterates over its own `remaining`
# list and never reads `to_process`, so this cap appears to have no effect
# on that loop - confirm whether that is intended.
N_DEBUG = 10_000

# Slicing already clamps to the list length, so no explicit min() is needed.
to_process = to_process[:N_DEBUG]
print("Now processing only:", len(to_process), "rows")


In [None]:
CHUNK_SIZE = 2000           # process 2000 images, then force a save
# CHECKPOINT_EVERY (rows between two intra-chunk saves) comes from the config
# cell; it is no longer redefined here so a single value controls the run.

processed_total = 0
t0 = time.time()

# A row is pending when it has never been visited, i.e. its face_detected
# flag is still missing. Every visited row gets True/False below, including
# rows whose image is absent or whose analysis failed. Testing gender/age/race
# instead would leave those failed rows "remaining" forever and the loop
# would never terminate.
# The list is computed once (again on every kernel restart, since this cell
# re-runs), so each pending row is handled exactly once per run.
need_mask = df["face_detected"].isna()
remaining = df.index[need_mask].tolist()
print("Remaining rows:", len(remaining))

chunk_id = 0

for chunk_start in range(0, len(remaining), CHUNK_SIZE):
    chunk_id += 1
    chunk_idx = remaining[chunk_start:chunk_start + CHUNK_SIZE]
    print(f"\nüöÄ Chunk {chunk_id} ‚Äî processing {len(chunk_idx)} rows")

    processed_chunk = 0

    for idx in tqdm(chunk_idx, desc=f"Chunk {chunk_id}"):
        img_path = df.at[idx, "image_path"]

        if not isinstance(img_path, str) or not os.path.exists(img_path):
            # Missing or invalid path: mark the row as visited with no result.
            df.at[idx, "face_detected"] = False
            df.at[idx, "gender"] = pd.NA
            df.at[idx, "age"] = pd.NA
            df.at[idx, "race"] = pd.NA
        else:
            out = safe_analyze(img_path)
            df.at[idx, "face_detected"] = out["face_detected"]
            df.at[idx, "gender"] = out["gender"]
            df.at[idx, "age"] = out["age"]
            df.at[idx, "race"] = out["race"]

        processed_chunk += 1
        processed_total += 1

        # intra-chunk checkpoint
        if processed_chunk % CHECKPOINT_EVERY == 0:
            df.to_csv(out_path, index=False)
            elapsed = (time.time() - t0) / 60
            print(f"üíæ Checkpoint saved ‚Äî total processed: {processed_total} ‚Äî {elapsed:.1f} min")

    # end of chunk => always save
    df.to_csv(out_path, index=False)
    elapsed = (time.time() - t0) / 60
    print(f"‚úÖ Chunk {chunk_id} saved ‚Äî total processed: {processed_total} ‚Äî {elapsed:.1f} min")

print("\nüéâ Done! Final saved:", out_path)


Run inference with checkpoint + resume

Basic stats

In [14]:
# Count rows per face_detected value; dropna=False keeps a separate bucket
# for rows whose flag is still missing.
df["face_detected"].value_counts(dropna=False)


Plots (Gender / Age / Race)

In [None]:
# Each section drops missing predictions first and plots only when something
# is left; otherwise it prints a short notice instead of an empty chart.

# Gender: one bar per predicted label
gender_values = df["gender"].dropna()
if gender_values.empty:
    print("No gender predictions available")
else:
    gender_values.value_counts().plot.bar(title="Gender distribution")
    plt.show()

# Age: 30-bin histogram of predicted ages
age_values = df["age"].dropna()
if age_values.empty:
    print("No age predictions available")
else:
    age_values.plot.hist(bins=30, title="Age distribution")
    plt.xlabel("Age")
    plt.show()

# Race: one bar per predicted label
race_values = df["race"].dropna()
if race_values.empty:
    print("No race predictions available")
else:
    race_values.value_counts().plot.bar(title="Race distribution")
    plt.show()


JSON summary (for report)

In [None]:
# Intermediate series, named so the report dictionary below stays readable.
image_on_disk = df["image_path"].apply(lambda p: isinstance(p, str) and os.path.exists(p))
face_flags = df["face_detected"].fillna(False).astype(bool)


def _non_null(column):
    """Number of rows of `df` holding a value in `column`."""
    return int(df[column].notna().sum())


# Everything is cast to built-in int/float so json.dumps can serialize it.
summary = {
    "input_csv": IN_CSV,
    "rows": int(len(df)),
    "exists_rate": float(image_on_disk.mean()),
    "face_detected_rate": float(face_flags.mean()),
    "gender_non_null": _non_null("gender"),
    "age_non_null": _non_null("age"),
    "race_non_null": _non_null("race"),
}

# Write the report, creating its folder if needed, then display it.
summary_file = Path(SUMMARY_JSON)
summary_file.parent.mkdir(parents=True, exist_ok=True)
summary_file.write_text(json.dumps(summary, indent=2), encoding="utf-8")
summary
