Imports + config

In [2]:
pip install ipywidgets

Collecting ipywidgetsNote: you may need to restart the kernel to use updated packages.

  Downloading ipywidgets-8.1.8-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.15-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.16-py3-none-any.whl.metadata (20 kB)
Downloading ipywidgets-8.1.8-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.16-py3-none-any.whl (914 kB)
   ---------------------------------------- 0.0/914.9 kB ? eta -:--:--
   ---------------------------------------- 914.9/914.9 kB 8.4 MB/s  0:00:00
Downloading widgetsnbextension-4.0.15-py3-none-any.whl (2.2 MB)
   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--
   ---------------------------------------- 2.2/2.2 MB 11.2 MB/s  0:00:00
Installing collected packages: widgetsnbextension, jupyterlab_widgets, ipywidgets

   ------------- -----------

ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'C:\\Debiasing-and-Improving-the-Robustness-of-CLIP-like-Models\\.venv\\share\\jupyter\\labextensions\\@jupyter-widgets\\jupyterlab-manager\\static\\vendors-node_modules_d3-color_src_color_js-node_modules_d3-format_src_defaultLocale_js-node_m-09b215.2643c43f22ad111f4f82.js'



In [3]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from PIL import Image

import torch
from facenet_pytorch import MTCNN

import sys
sys.path.append("code/fairface")  # pour importer le modèle fairface si besoin


Paths

In [4]:
CSV_IN = "data/laion_big_light_tau1_0.2989.csv"
CSV_OUT = "data/laion_big_light_tau1_0.2989_fairface.csv"

FAIRFACE_MODEL_PATH = "code/fairface/models/fairface_alldata_4race_20191111.pt"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)


Device: cpu


Load data + sanity check images exist

In [5]:
df = pd.read_csv(CSV_IN)
print("Rows:", len(df))
print(df.columns.tolist())

# verify paths exist
exists = df["image_path"].apply(os.path.exists).mean() * 100
print(f"Exists %: {exists:.2f}%")
df.head(2)


Rows: 107251
['image_path', 'caption', 'width', 'height', 'similarity', 'punsafe', 'pwatermark', 'aesthetic_score', 'caption_len_words', 'exists', 'keep_quality', 'clipscore']
Exists %: 100.00%


Unnamed: 0,image_path,caption,width,height,similarity,punsafe,pwatermark,aesthetic_score,caption_len_words,exists,keep_quality,clipscore
0,src/data/data/laion_aesthetic_subset_big\00000...,Photo pour Japanese pagoda and old house in Ky...,450.0,297.0,0.345947,0.000541,0.034099,6.526204,16,True,True,0.335841
1,src/data/data/laion_aesthetic_subset_big\00000...,San Pedro: One Of Mother Nature's Most Powerfu...,467.0,369.0,0.308873,0.000612,0.102705,6.749783,14,True,True,0.300125


Face detector (MTCNN)

In [6]:
mtcnn = MTCNN(keep_all=True, device=DEVICE)


FairFace model loader (PyTorch)

In [7]:
import torchvision.transforms as T
import torch.nn.functional as F
from torchvision import models
from torchvision.models import resnet34, ResNet34_Weights

# FairFace uses a ResNet34 classifier
weights = ResNet34_Weights.DEFAULT
fairface_model = resnet34(weights=weights)
fairface_model.fc = torch.nn.Linear(fairface_model.fc.in_features, 18)  # 18 classes output (gender+age+race encoded)
state = torch.load(FAIRFACE_MODEL_PATH, map_location=DEVICE)
fairface_model.load_state_dict(state, strict=False)
fairface_model = fairface_model.to(DEVICE).eval()

transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]),
])

print("FairFace model loaded ✔️")


FairFace model loaded ✔️


Helpers: decode outputs + map race→skin tone

In [8]:
GENDER = ["Male", "Female"]
AGE = ["0-2", "3-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70+"]
RACE = ["White", "Black", "Latino_Hispanic", "East Asian", "Southeast Asian", "Indian", "Middle Eastern"]

def race_to_skin_tone(r):
    # approximation "skin tone" en 3 classes (coarse)
    # (c'est une proxy, à expliquer dans le rapport)
    if r in ["White", "East Asian", "Southeast Asian"]:
        return "Light"
    if r in ["Latino_Hispanic", "Middle Eastern", "Indian"]:
        return "Medium"
    if r in ["Black"]:
        return "Dark"
    return None

def decode_fairface(logits):
    # logits shape [18]
    # structure standard fairface:
    # 2 gender + 9 age + 7 race = 18
    g = torch.argmax(logits[0:2]).item()
    a = torch.argmax(logits[2:11]).item()
    r = torch.argmax(logits[11:18]).item()
    return GENDER[g], AGE[a], RACE[r]


Main inference loop (sample d’abord)

In [9]:
sample_df = df.sample(n=300, random_state=0).reset_index(drop=True)

out_rows = []
for i, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
    path = row["image_path"]
    cap = row.get("caption", "")

    face_detected = False
    gender = None
    age_group = None
    skin_tone = None
    race = None

    try:
        img = Image.open(path).convert("RGB")

        boxes, _ = mtcnn.detect(img)
        if boxes is not None and len(boxes) > 0:
            face_detected = True

            # take largest face
            areas = [(b[2]-b[0])*(b[3]-b[1]) for b in boxes]
            j = int(np.argmax(areas))
            x1, y1, x2, y2 = boxes[j].astype(int)

            face = img.crop((x1, y1, x2, y2))
            x = transform(face).unsqueeze(0).to(DEVICE)

            with torch.no_grad():
                logits = fairface_model(x).squeeze(0)
                gender, age_group, race = decode_fairface(logits)
                skin_tone = race_to_skin_tone(race)

    except Exception:
        pass

    out_rows.append({
        **row.to_dict(),
        "face_detected": face_detected,
        "gender": gender,
        "age_group": age_group,
        "race": race,
        "skin_tone": skin_tone
    })

sample_out = pd.DataFrame(out_rows)
print("Face detected %:", sample_out["face_detected"].mean()*100)
sample_out[["face_detected","gender","age_group","race","skin_tone"]].head(10)


100%|███████████████████████████████████████████████████████████████████████████| 300/300 [01:20<00:00,  3.71it/s]

Face detected %: 36.333333333333336





Unnamed: 0,face_detected,gender,age_group,race,skin_tone
0,False,,,,
1,True,Male,0-2,East Asian,Light
2,False,,,,
3,False,,,,
4,True,Male,40-49,Black,Dark
5,True,Male,0-2,East Asian,Light
6,False,,,,
7,False,,,,
8,False,,,,
9,False,,,,


Run full dataset (avec checkpoint)

In [1]:
CHECKPOINT_CSV = "data/laion_big_light_tau1_0.2989_fairface_checkpoint.csv"

# Resume if checkpoint exists
if os.path.exists(CHECKPOINT_CSV):
    done = pd.read_csv(CHECKPOINT_CSV)
    done_paths = set(done["image_path"].astypon e(str).tolist())
    print("Resume: already processed:", len(done_paths))
else:
    done = pd.DataFrame()
    done_paths = set()

out_rows = []
save_every = 10000

for i, row in tqdm(df.iterrows(), total=len(df)):
    path = str(row["image_path"])
    if path in done_paths:
        continue

    face_detected = False
    gender = None
    age_group = None
    skin_tone = None
    race = None

    try:
        img = Image.open(path).convert("RGB")
        boxes, _ = mtcnn.detect(img)

        if boxes is not None and len(boxes) > 0:
            face_detected = True
            areas = [(b[2]-b[0])*(b[3]-b[1]) for b in boxes]
            j = int(np.argmax(areas))
            x1, y1, x2, y2 = boxes[j].astype(int)

            face = img.crop((x1, y1, x2, y2))
            x = transform(face).unsqueeze(0).to(DEVICE)

            with torch.no_grad():
                logits = fairface_model(x).squeeze(0)
                gender, age_group, race = decode_fairface(logits)
                skin_tone = race_to_skin_tone(race)

    except Exception:
        pass

    out_rows.append({
        **row.to_dict(),
        "face_detected": face_detected,
        "gender": gender,
        "age_group": age_group,
        "race": race,
        "skin_tone": skin_tone
    })

    if len(out_rows) >= save_every:
        chunk = pd.DataFrame(out_rows)
        out_rows = []

        if len(done) == 0:
            done = chunk
        else:
            done = pd.concat([done, chunk], ignore_index=True)

        done.to_csv(CHECKPOINT_CSV, index=False)
        done_paths = set(done["image_path"].astype(str).tolist())
        print("Checkpoint saved:", len(done_paths))

# final save
if out_rows:
    chunk = pd.DataFrame(out_rows)
    done = pd.concat([done, chunk], ignore_index=True)

done.to_csv(CSV_OUT, index=False)
print("Saved final:", CSV_OUT, "| rows:", len(done))


NameError: name 'os' is not defined

Plots for all demographic attributes (face / gender / age_group / race / skin_tone)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

CSV_OUT = "data/laion_big_light_tau1_0.2989_fairface.csv"
df_ff = pd.read_csv(CSV_OUT)

print("Rows:", len(df_ff))
print("Face detected %:", df_ff["face_detected"].mean() * 100)

def barplot(col, title=None, top_n=None):
    if col not in df_ff.columns:
        print(f"❌ Missing column: {col}")
        return

    s = df_ff[col].dropna()
    if s.empty:
        print(f"⚠️ No values available for: {col} (all NaN)")
        return

    counts = s.value_counts()
    if top_n is not None:
        counts = counts.head(top_n)

    ax = counts.plot(kind="bar", title=title or f"{col} distribution")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    plt.xticks(rotation=30, ha="right")
    plt.tight_layout()
    plt.show()

# 1) Face detected (including False/True)
if "face_detected" in df_ff.columns:
    face_counts = df_ff["face_detected"].fillna(False).value_counts()
    ax = face_counts.plot(kind="bar", title="Face Detection (True/False)")
    ax.set_xlabel("face_detected")
    ax.set_ylabel("Count")
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()

# 2) Gender
barplot("gender", "Gender Distribution")

# 3) Age group
barplot("age_group", "Age Group Distribution")

# 4) Race
barplot("race", "Race Distribution")

# 5) Skin tone (your proxy mapping)
barplot("skin_tone", "Skin Tone (Proxy) Distribution")

# Optional: how many rows have each attribute filled
cols = ["face_detected", "gender", "age_group", "race", "skin_tone"]
print("\nFilled ratios:")
for c in cols:
    if c in df_ff.columns:
        print(f"- {c}: {df_ff[c].notna().mean()*100:.2f}%")


JSON summary (counts + percentages)

In [None]:
import json
import numpy as np
from datetime import datetime

CSV_OUT = "data/laion_big_light_tau1_0.2989_fairface.csv"
JSON_OUT = "data/week5_fairface_light_summary.json"

df_ff = pd.read_csv(CSV_OUT)

def vc(col):
    if col not in df_ff.columns:
        return None
    s = df_ff[col].dropna()
    if s.empty:
        return {}
    return s.value_counts().to_dict()

summary = {
    "timestamp": datetime.now().isoformat(timespec="seconds"),
    "input_csv": CSV_OUT,
    "n_rows": int(len(df_ff)),

    "face_detected": {
        "true_count": int((df_ff["face_detected"] == True).sum()) if "face_detected" in df_ff.columns else None,
        "false_count": int((df_ff["face_detected"] == False).sum()) if "face_detected" in df_ff.columns else None,
        "true_pct": float((df_ff["face_detected"] == True).mean()*100) if "face_detected" in df_ff.columns else None,
    },

    "filled_ratio_pct": {
        c: float(df_ff[c].notna().mean()*100) if c in df_ff.columns else None
        for c in ["gender", "age_group", "race", "skin_tone"]
    },

    "distributions": {
        "gender": vc("gender"),
        "age_group": vc("age_group"),
        "race": vc("race"),
        "skin_tone": vc("skin_tone"),
    },
}

with open(JSON_OUT, "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2)

print("✅ Saved JSON summary:", JSON_OUT)
print(json.dumps(summary, indent=2)[:1200], "...\n")  # preview
