# 1. Tabular

## A. UCI Adult Income dataset

In [18]:
# Adult Income Dataset (UCI Census Income dataset)
# - Task: predict whether an individual's income exceeds $50K/year based on census attributes.
# - Importance: widely used in bias/fairness studies; known gender and race disparities.
# - Source: UCI ML Repository, mirrored on OpenML.
# - Behavior: load local data/adult.csv if present; otherwise fetch from OpenML.
#   In both cases, normalize the target column to 'income' and strip whitespace.

from pathlib import Path
import pandas as pd
from sklearn.datasets import fetch_openml

project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
data_dir.mkdir(parents=True, exist_ok=True)
adult_csv = data_dir / "adult.csv"

# Load or fetch
if adult_csv.exists():
    df = pd.read_csv(adult_csv)
    print(f"Loaded existing: {adult_csv}")
else:
    adult = fetch_openml(name="adult", version=2, as_frame=True)
    df = adult.frame.copy()
    print("Fetched Adult Income from OpenML.")

# Normalize target column name and values
if "income" not in df.columns and "class" in df.columns:
    df = df.rename(columns={"class": "income"})
if "income" in df.columns:
    df["income"] = df["income"].astype(str).str.strip()

# Save normalized copy
df.to_csv(adult_csv, index=False)
print(f"Saved normalized dataset: {adult_csv}")

# Quick overview
print("Shape:", df.shape)
print("Columns:", list(df.columns)[:15], "...")
if "income" in df.columns:
    print(df["income"].value_counts())


Loaded existing: C:\Users\hana1\Documents\iva-bias-project\data\adult.csv
Saved normalized dataset: C:\Users\hana1\Documents\iva-bias-project\data\adult.csv
Shape: (48842, 15)
Columns: ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'] ...
income
<=50K    37155
>50K     11687
Name: count, dtype: int64


In [19]:
# Prepare a model-ready Adult Income CSV and minimal EDA summaries.
# - Input:  data/adult.csv  (raw, normalized to have 'income')
# - Output: data/adult_model.csv  (selected features + binary target 'label')
# - EDA:    results/eda/* balance tables for target and sensitive attributes

from pathlib import Path
import pandas as pd


project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
data_dir.mkdir(parents=True, exist_ok=True)

results_eda = project_root / "results" / "eda"
results_eda.mkdir(parents=True, exist_ok=True)

src = data_dir / "adult.csv"
dst = data_dir / "adult_model.csv"

# Load raw (normalized) Adult dataset
df = pd.read_csv(src)

# Standardize target column
if "income" not in df.columns and "class" in df.columns:
    df = df.rename(columns={"class": "income"})
df["income"] = df["income"].astype(str).str.strip()

# Binary target: 1 iff income > 50K, else 0
df["label"] = (df["income"] == ">50K").astype(int)

# Feature selection (exclude technical fields like 'fnlwgt')
candidate_features = [
    "age", "workclass", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
]
features = [c for c in candidate_features if c in df.columns]
cols = features + ["label"]

clean = df[cols].dropna().reset_index(drop=True)
clean.to_csv(dst, index=False)

# Minimal EDA summaries (counts and fractions)
def counts_and_frac(s: pd.Series) -> pd.DataFrame:
    vc = s.value_counts(dropna=False)
    frac = (vc / vc.sum()).round(4)
    return pd.DataFrame({"count": vc, "fraction": frac})

counts_and_frac(clean["label"]).to_csv(results_eda / "adult_target_balance.csv")
if "sex" in clean.columns:
    counts_and_frac(clean["sex"]).to_csv(results_eda / "adult_sensitive_sex_balance.csv")
if "race" in clean.columns:
    counts_and_frac(clean["race"]).to_csv(results_eda / "adult_sensitive_race_balance.csv")

print(f"Saved model-ready file: {dst}  | rows={len(clean)}  cols={len(clean.columns)}")
print("EDA summaries written to:", results_eda)


Saved model-ready file: C:\Users\hana1\Documents\iva-bias-project\data\adult_model.csv  | rows=45222  cols=14
EDA summaries written to: C:\Users\hana1\Documents\iva-bias-project\results\eda


## B. COMPAS Dataset

In [20]:
# COMPAS Recidivism dataset (ProPublica)
# - Task: predict whether a defendant re-offends within two years.
# - Importance: benchmark dataset for fairness studies due to racial disparities.
# - Source: ProPublica GitHub (https://github.com/propublica/compas-analysis)
# - Behavior: download CSV if not already saved locally, normalize columns, and save to data/compas.csv.

from pathlib import Path
import pandas as pd

project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
data_dir.mkdir(parents=True, exist_ok=True)
compas_csv = data_dir / "compas.csv"

if compas_csv.exists():
    df = pd.read_csv(compas_csv)
    print(f"Loaded existing: {compas_csv}")
else:
    url = "https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv"
    df = pd.read_csv(url)
    df.to_csv(compas_csv, index=False)
    print(f"Downloaded & saved: {compas_csv}")

# Standardize key columns
if "two_year_recid" not in df.columns and "is_recid" in df.columns:
    df = df.rename(columns={"is_recid": "two_year_recid"})
df["two_year_recid"] = pd.to_numeric(df["two_year_recid"], errors="coerce").fillna(0).astype(int)

for col in ["race", "sex"]:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip()

df.to_csv(compas_csv, index=False)
print(f"Saved normalized dataset: {compas_csv}")

print("Shape:", df.shape)
print("Columns:", list(df.columns)[:15], "...")
print("\nTarget distribution (two_year_recid):")
print(df["two_year_recid"].value_counts().sort_index().to_frame("count").assign(
    fraction=lambda x: (x["count"] / x["count"].sum()).round(4)
))
if "sex" in df.columns:
    print("\nSex distribution:")
    print(df["sex"].value_counts().to_frame("count").assign(
        fraction=lambda x: (x["count"] / x["count"].sum()).round(4)
    ))
if "race" in df.columns:
    print("\nRace distribution:")
    print(df["race"].value_counts().to_frame("count").assign(
        fraction=lambda x: (x["count"] / x["count"].sum()).round(4)
    ))


Loaded existing: C:\Users\hana1\Documents\iva-bias-project\data\compas.csv
Saved normalized dataset: C:\Users\hana1\Documents\iva-bias-project\data\compas.csv
Shape: (7214, 53)
Columns: ['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob', 'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count', 'priors_count'] ...

Target distribution (two_year_recid):
                count  fraction
two_year_recid                 
0                3963    0.5493
1                3251    0.4507

Sex distribution:
        count  fraction
sex                    
Male     5819    0.8066
Female   1395    0.1934

Race distribution:
                  count  fraction
race                             
African-American   3696    0.5123
Caucasian          2454    0.3402
Hispanic            637    0.0883
Other               377    0.0523
Asian                32    0.0044
Native American      18    0.0025


In [21]:
# Prepare a model-ready COMPAS CSV and minimal EDA summaries.
# - Input:  data/compas.csv  (raw, normalized to have 'two_year_recid')
# - Output: data/compas_model.csv  (selected features + binary target 'label')
# - EDA:    results/eda/* balance tables for target and sensitive attributes

from pathlib import Path
import pandas as pd

project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
results_eda = project_root / "results" / "eda"
results_eda.mkdir(parents=True, exist_ok=True)

src = data_dir / "compas.csv"
dst = data_dir / "compas_model.csv"

df = pd.read_csv(src)

# Ensure target column exists and standardize name
if "two_year_recid" not in df.columns and "is_recid" in df.columns:
    df = df.rename(columns={"is_recid": "two_year_recid"})
df["two_year_recid"] = pd.to_numeric(df["two_year_recid"], errors="coerce").fillna(0).astype(int)

# Binary target: 1 if re-offended within two years, 0 otherwise
df["label"] = df["two_year_recid"]

# Feature selection (exclude identifiers like name, id, dob, etc.)
candidate_features = [
    "sex", "race", "age", "age_cat", "priors_count",
    "juv_fel_count", "juv_misd_count", "juv_other_count", "decile_score",
]
features = [c for c in candidate_features if c in df.columns]
cols = features + ["label"]

clean = df[cols].dropna().reset_index(drop=True)
clean.to_csv(dst, index=False)

# Minimal EDA summaries: counts and fractions for target/sensitive attributes
def counts_and_frac(s: pd.Series) -> pd.DataFrame:
    vc = s.value_counts(dropna=False)
    frac = (vc / vc.sum()).round(4)
    return pd.DataFrame({"count": vc, "fraction": frac})

counts_and_frac(clean["label"]).to_csv(results_eda / "compas_target_balance.csv")
if "sex" in clean.columns:
    counts_and_frac(clean["sex"]).to_csv(results_eda / "compas_sensitive_sex_balance.csv")
if "race" in clean.columns:
    counts_and_frac(clean["race"]).to_csv(results_eda / "compas_sensitive_race_balance.csv")

print(f"Saved model-ready file: {dst}  | rows={len(clean)}  cols={len(clean.columns)}")
print("EDA summaries written to:", results_eda)


Saved model-ready file: C:\Users\hana1\Documents\iva-bias-project\data\compas_model.csv  | rows=7214  cols=10
EDA summaries written to: C:\Users\hana1\Documents\iva-bias-project\results\eda


## C. German Credit 

In [22]:
# German Credit Risk dataset
# - Task: predict whether a loan applicant is a good (1) or bad (2) credit risk.
# - Importance: widely used in fairness studies (bias across gender, age, etc.).
# - Source: UCI / OpenML.
# - Behavior: load local data/german.csv if present; otherwise fetch from OpenML.
#   Standardizes target column to 'credit_risk'.

from pathlib import Path
import pandas as pd
from sklearn.datasets import fetch_openml

project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
data_dir.mkdir(parents=True, exist_ok=True)
german_csv = data_dir / "german.csv"

if german_csv.exists():
    df = pd.read_csv(german_csv)
    print(f"Loaded existing: {german_csv}")
else:
    german = fetch_openml(name="credit-g", version=1, as_frame=True)
    df = german.frame.copy()
    df.to_csv(german_csv, index=False)
    print(f"Fetched German Credit from OpenML and saved to: {german_csv}")

# Normalize target
if "credit_risk" not in df.columns and "class" in df.columns:
    df = df.rename(columns={"class": "credit_risk"})

df["credit_risk"] = df["credit_risk"].astype(str).str.strip()

df.to_csv(german_csv, index=False)
print(f"Saved normalized dataset: {german_csv}")

# Quick overview
print("Shape:", df.shape)
print("Columns:", list(df.columns)[:15], "...")
print("\nTarget distribution (credit_risk):")
print(df["credit_risk"].value_counts().to_frame("count").assign(
    fraction=lambda x: (x["count"] / x["count"].sum()).round(4)
))


Loaded existing: C:\Users\hana1\Documents\iva-bias-project\data\german.csv
Saved normalized dataset: C:\Users\hana1\Documents\iva-bias-project\data\german.csv
Shape: (1000, 21)
Columns: ['checking_status', 'duration', 'credit_history', 'purpose', 'credit_amount', 'savings_status', 'employment', 'installment_commitment', 'personal_status', 'other_parties', 'residence_since', 'property_magnitude', 'age', 'other_payment_plans', 'housing'] ...

Target distribution (credit_risk):
             count  fraction
credit_risk                 
good           700       0.7
bad            300       0.3


In [23]:
# Prepare a model-ready German Credit CSV and minimal EDA summaries.
# - Input:  data/german.csv  (raw, normalized to have 'credit_risk')
# - Output: data/german_model.csv  (selected features + binary target 'label')
# - EDA:    results/eda/* balance tables for target and sensitive attributes

from pathlib import Path
import pandas as pd

project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
results_eda = project_root / "results" / "eda"
results_eda.mkdir(parents=True, exist_ok=True)

src = data_dir / "german.csv"
dst = data_dir / "german_model.csv"

df = pd.read_csv(src)

# Standardize target to binary label (1=good, 0=bad)
if "credit_risk" not in df.columns and "class" in df.columns:
    df = df.rename(columns={"class": "credit_risk"})
df["credit_risk"] = df["credit_risk"].astype(str).str.strip().str.lower()
df["label"] = (df["credit_risk"] == "good").astype(int)

# Derive sensitive attributes
# - sex: parsed from 'personal_status' values like 'male div/sep' or 'female div/dep/mar'
if "personal_status" in df.columns and "sex" not in df.columns:
    def _sex_from_personal_status(x: str) -> str:
        x = str(x).lower()
        if x.startswith("male"):
            return "Male"
        if x.startswith("female"):
            return "Female"
        return "Unknown"
    df["sex"] = df["personal_status"].map(_sex_from_personal_status)

# - age_group: coarse bins
if "age" in df.columns and "age_group" not in df.columns:
    bins = [-1, 24, 34, 44, 54, 120]
    labels = ["<25", "25-34", "35-44", "45-54", "55+"]
    df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels)

# Feature selection (tabular baseline)
candidate_features = [
    "checking_status", "duration", "credit_history", "purpose", "credit_amount",
    "savings_status", "employment", "installment_commitment", "personal_status",
    "other_parties", "residence_since", "property_magnitude", "age",
    "other_payment_plans", "housing", "existing_credits", "job",
    "num_dependents", "own_telephone", "foreign_worker",
    # derived
    "sex", "age_group",
]
features = [c for c in candidate_features if c in df.columns]
cols = features + ["label"]

clean = df[cols].dropna().reset_index(drop=True)
clean.to_csv(dst, index=False)

# Minimal EDA summaries
def counts_and_frac(s: pd.Series) -> pd.DataFrame:
    vc = s.value_counts(dropna=False)
    frac = (vc / vc.sum()).round(4)
    return pd.DataFrame({"count": vc, "fraction": frac})

counts_and_frac(clean["label"]).to_csv(results_eda / "german_target_balance.csv")
if "sex" in clean.columns:
    counts_and_frac(clean["sex"]).to_csv(results_eda / "german_sensitive_sex_balance.csv")
if "age_group" in clean.columns:
    counts_and_frac(clean["age_group"]).to_csv(results_eda / "german_sensitive_age_balance.csv")

print(f"Saved model-ready file: {dst}  | rows={len(clean)}  cols={len(clean.columns)}")
print("EDA summaries written to:", results_eda)


Saved model-ready file: C:\Users\hana1\Documents\iva-bias-project\data\german_model.csv  | rows=1000  cols=23
EDA summaries written to: C:\Users\hana1\Documents\iva-bias-project\results\eda


# 2. NLP

## A. BOLD

In [24]:
# BOLD (Bias in Open-Ended Language Generation)
# - Task: probe social biases in text generation across demographic axes.
# - Source: Hugging Face Datasets (AmazonScience/bold).
# - Output: data/bold.csv (consolidated text + metadata)

from pathlib import Path
import pandas as pd
from datasets import load_dataset

project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
data_dir.mkdir(parents=True, exist_ok=True)
bold_csv = data_dir / "bold.csv"

# Load and concatenate available split(s)
frames = []
ds = load_dataset("AmazonScience/bold")
for split in ds.keys():  # typically 'train'
    df_split = pd.DataFrame(ds[split])
    df_split["split"] = split
    frames.append(df_split)
df = pd.concat(frames, axis=0, ignore_index=True)

# Normalize text column name
if "prompt" in df.columns and "text" not in df.columns:
    df = df.rename(columns={"prompt": "text"})
if "prompts" in df.columns and "text" not in df.columns:
    df = df.rename(columns={"prompts": "text"})

# Keep informative columns
candidate_cols = [
    "text", "category", "subcategory", "topic", "template", "target",
    "demographic", "geo", "source", "split",
]
cols = [c for c in candidate_cols if c in df.columns]
if "text" in df.columns and "text" not in cols:
    cols = ["text"] + cols
df = df[cols].dropna(subset=[cols[0]]).reset_index(drop=True)

df.to_csv(bold_csv, index=False)
print(f"Saved: {bold_csv}")
print("Shape:", df.shape)
print("Columns:", list(df.columns))
if "category" in df.columns:
    print("\nCategory balance (top 20):")
    print(df["category"].value_counts().to_frame("count").assign(
        fraction=lambda x: (x["count"]/x["count"].sum()).round(4)
    ).head(20))


Saved: C:\Users\hana1\Documents\iva-bias-project\data\bold.csv
Shape: (7201, 3)
Columns: ['text', 'category', 'split']

Category balance (top 20):
                               count  fraction
category                                      
European_Americans              2029    0.2818
American_actors                 1587    0.2204
American_actresses               776    0.1078
African_Americans                721    0.1001
Asian_Americans                  408    0.0567
engineering_branches             248    0.0344
nationalism                      115    0.0160
dance_occupations                106    0.0147
sewing_occupations               104    0.0144
nursing_specialties               92    0.0128
theatre_personnel                 86    0.0119
democracy                         70    0.0097
scientific_occupations            69    0.0096
anarchism                         62    0.0086
socialism                         58    0.0081
healthcare_occupations            49    0.0068
corpora

In [25]:
# Prepare a model-ready BOLD CSV and minimal EDA summaries.
# - Input:  data/bold.csv  (text + metadata)
# - Output: data/bold_model.csv  (text + group column for analysis)
# - EDA:    results/eda/bold_group_balance.csv

from pathlib import Path
import pandas as pd

project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
results_eda = project_root / "results" / "eda"
results_eda.mkdir(parents=True, exist_ok=True)

src = data_dir / "bold.csv"
dst = data_dir / "bold_model.csv"

df = pd.read_csv(src)

# Standardize columns: use 'text' and map 'category' -> 'group' for downstream analysis
df = df.rename(columns={"category": "group"})
keep_cols = [c for c in ["text", "group"] if c in df.columns]
clean = df[keep_cols].dropna(subset=["text"]).reset_index(drop=True)

# Persist model-ready file
clean.to_csv(dst, index=False)

# Minimal EDA: group balance
def counts_and_frac(s: pd.Series) -> pd.DataFrame:
    vc = s.value_counts(dropna=False)
    frac = (vc / vc.sum()).round(4)
    return pd.DataFrame({"count": vc, "fraction": frac})

if "group" in clean.columns:
    counts_and_frac(clean["group"]).to_csv(results_eda / "bold_group_balance.csv")

print(f"Saved model-ready file: {dst}  | rows={len(clean)}  cols={len(clean.columns)}")
print("EDA summaries written to:", results_eda)


Saved model-ready file: C:\Users\hana1\Documents\iva-bias-project\data\bold_model.csv  | rows=7201  cols=2
EDA summaries written to: C:\Users\hana1\Documents\iva-bias-project\results\eda


## C. WinoBias

In [26]:
# WinoBias (Gender bias in coreference-style sentences)
# - Task: evaluate gender/occupation bias via pro/anti-stereotype minimal pairs.
# - Source: Hugging Face Datasets (uclanlp/wino_bias, configs: type1_pro, type1_anti, type2_pro, type2_anti).
# - Output: data/winobias.csv (text + group + split).

from pathlib import Path
import pandas as pd
from datasets import load_dataset

project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
data_dir.mkdir(parents=True, exist_ok=True)
wb_csv = data_dir / "winobias.csv"

frames = []
for cfg in ["type1_pro", "type1_anti", "type2_pro", "type2_anti"]:
    ds = load_dataset("uclanlp/wino_bias", cfg)
    for split in ds.keys():  # usually 'train' and 'test'
        df_split = pd.DataFrame(ds[split])
        df_split["config"] = cfg
        df_split["split"] = split
        frames.append(df_split)

df = pd.concat(frames, axis=0, ignore_index=True)

# Create text column from tokens
if "tokens" in df.columns and "text" not in df.columns:
    df["text"] = df["tokens"].apply(lambda xs: " ".join(xs))

# Standardize group column (pro/anti + type1/type2 from config)
df["group"] = df["config"]

# Keep only relevant cols
cols = ["text", "group", "split"]
df = df[cols].dropna(subset=["text"]).reset_index(drop=True)

df.to_csv(wb_csv, index=False)
print(f"Saved: {wb_csv}")
print("Shape:", df.shape)
print("Columns:", list(df.columns))
print("\nGroup balance:")
print(df["group"].value_counts())


Saved: C:\Users\hana1\Documents\iva-bias-project\data\winobias.csv
Shape: (3168, 3)
Columns: ['text', 'group', 'split']

Group balance:
group
type1_pro     792
type1_anti    792
type2_pro     792
type2_anti    792
Name: count, dtype: int64


In [27]:
# Prepare a model-ready WinoBias CSV and minimal EDA summaries.
# - Input:  data/winobias.csv  (text + group + split)
# - Output: data/winobias_model.csv  (text + group + split)
# - EDA:    results/eda/winobias_group_balance.csv
#           results/eda/winobias_split_balance.csv

from pathlib import Path
import pandas as pd

project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
results_eda = project_root / "results" / "eda"
results_eda.mkdir(parents=True, exist_ok=True)

src = data_dir / "winobias.csv"
dst = data_dir / "winobias_model.csv"

df = pd.read_csv(src)

# Standardize to the columns used downstream
keep_cols = [c for c in ["text", "group", "split"] if c in df.columns]
clean = df[keep_cols].dropna(subset=["text"]).reset_index(drop=True)
clean.to_csv(dst, index=False)

# Minimal EDA summaries
def counts_and_frac(s: pd.Series) -> pd.DataFrame:
    vc = s.value_counts(dropna=False)
    frac = (vc / vc.sum()).round(4)
    return pd.DataFrame({"count": vc, "fraction": frac})

counts_and_frac(clean["group"]).to_csv(results_eda / "winobias_group_balance.csv")
if "split" in clean.columns:
    counts_and_frac(clean["split"]).to_csv(results_eda / "winobias_split_balance.csv")

print(f"Saved model-ready file: {dst}  | rows={len(clean)}  cols={len(clean.columns)}")
print("EDA summaries written to:", results_eda)


Saved model-ready file: C:\Users\hana1\Documents\iva-bias-project\data\winobias_model.csv  | rows=3168  cols=3
EDA summaries written to: C:\Users\hana1\Documents\iva-bias-project\results\eda


## C. Equity Evaluation Corpus

In [29]:
# Equity Evaluation Corpus (EEC)
# - Task: Detect and analyze implicit bias in English sentences containing emotional expressions.
# - Content: 8,640 sentences that pair emotion-related words with demographic names 
#            (e.g., “African-American | Alonzo” or “European | Adam”), labeled by gender and race.
# - Importance: Used in fairness research to measure differences in sentiment across demographic groups.
# - Source: Hugging Face dataset repo (peixian/equity_evaluation_corpus).
# - Behavior: Fetch direct CSV version (robust to Hugging Face script changes) 
#             and standardize to columns: text, person, gender, race, emotion, emotion_word, template, split.

from pathlib import Path
import pandas as pd

project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
data_dir.mkdir(parents=True, exist_ok=True)
eec_csv = data_dir / "eec.csv"

# Direct CSV (same content as the dataset viewer's 'first_domain/train')
csv_url = "https://huggingface.co/datasets/peixian/equity_evaluation_corpus/resolve/main/Equity-Evaluation-Corpus.csv"

# Load and standardize headers (case-insensitive)
df_raw = pd.read_csv(csv_url)
df = df_raw.copy()
df.columns = [c.strip().lower() for c in df.columns]

# Map canonical column names
rename_map = {
    "sentence": "text",
    "emotion word": "emotion_word",
}
df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})

# Keep informative columns if present
candidate_cols = ["text", "person", "gender", "race", "emotion", "emotion_word", "template"]
cols = [c for c in candidate_cols if c in df.columns]
df = df[cols].copy()

# Light cleanup
for c in df.columns:
    df[c] = df[c].astype(str).str.strip()

# Add a split column to match our convention (EEC is a single split)
df["split"] = "train"

# Persist
df.to_csv(eec_csv, index=False)
print(f"Saved: {eec_csv}")
print("Shape:", df.shape)
print("Columns:", list(df.columns))
if {"race","person"}.issubset(df.columns):
    grp = (df["race"] + " | " + df["person"]).value_counts().to_frame("count")
    grp["fraction"] = (grp["count"]/grp["count"].sum()).round(4)
    print("\nRace|Person balance (top 12):")
    print(grp.head(12))


Saved: C:\Users\hana1\Documents\iva-bias-project\data\eec.csv
Shape: (8640, 8)
Columns: ['text', 'person', 'gender', 'race', 'emotion', 'emotion_word', 'template', 'split']

Race|Person balance (top 12):
                             count  fraction
African-American | Alonzo      144    0.0167
African-American | Jamel       144    0.0167
African-American | Alphonse    144    0.0167
African-American | Jerome      144    0.0167
African-American | Leroy       144    0.0167
African-American | Torrance    144    0.0167
African-American | Darnell     144    0.0167
African-American | Lamar       144    0.0167
African-American | Malik       144    0.0167
African-American | Terrence    144    0.0167
European | Adam                144    0.0167
European | Harry               144    0.0167


In [30]:
# Equity Evaluation Corpus (EEC) — model-ready export + EDA
# - Input:  data/eec.csv
# - Output: data/eec_model.csv  (text, race, gender, emotion, split)
# - EDA:    results/eda/eec_race_balance.csv
#           results/eda/eec_gender_balance.csv
#           results/eda/eec_emotion_balance.csv
#           results/eda/eec_race_gender_balance.csv

from pathlib import Path
import pandas as pd

project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
results_eda = project_root / "results" / "eda"
results_eda.mkdir(parents=True, exist_ok=True)

src = data_dir / "eec.csv"
dst = data_dir / "eec_model.csv"

df = pd.read_csv(src)

# Standard columns for downstream use
keep_cols = [c for c in ["text", "race", "gender", "emotion", "split"] if c in df.columns]
clean = df[keep_cols].dropna(subset=["text"]).reset_index(drop=True)
clean.to_csv(dst, index=False)

# EDA helpers
def counts_and_frac(s: pd.Series) -> pd.DataFrame:
    vc = s.value_counts(dropna=False)
    frac = (vc / vc.sum()).round(4)
    return pd.DataFrame({"count": vc, "fraction": frac})

# EDA outputs
if "race" in clean.columns:
    counts_and_frac(clean["race"]).to_csv(results_eda / "eec_race_balance.csv")
if "gender" in clean.columns:
    counts_and_frac(clean["gender"]).to_csv(results_eda / "eec_gender_balance.csv")
if "emotion" in clean.columns:
    counts_and_frac(clean["emotion"]).to_csv(results_eda / "eec_emotion_balance.csv")
if {"race", "gender"}.issubset(clean.columns):
    rg = (clean["race"] + " | " + clean["gender"]).rename("race|gender")
    counts_and_frac(rg).to_csv(results_eda / "eec_race_gender_balance.csv")

print(f"Saved model-ready file: {dst}  | rows={len(clean)}  cols={len(clean.columns)}")
print("EDA summaries written to:", results_eda)


Saved model-ready file: C:\Users\hana1\Documents\iva-bias-project\data\eec_model.csv  | rows=8640  cols=5
EDA summaries written to: C:\Users\hana1\Documents\iva-bias-project\results\eda


# 3 Vision Datasets

## A. Fairface

In [5]:
# FairFace Dataset
# - Task: Evaluate fairness in facial attribute classification (race, gender, age).
# - Content: metadata (CSV labels) for ~108k images with 7 race groups.
# - Importance: benchmark dataset for bias analysis in computer vision.
# - Source: Official label CSVs linked from the FairFace GitHub (Google Drive).
# - Behavior: download labels via Google Drive file IDs, normalize, add split, and save one CSV.
# - Output: data/fairface.csv with columns [image_file, age, gender, race, service_test, split]

from pathlib import Path
import pandas as pd
import sys, subprocess

# Ensure gdown is available for Google Drive downloads
try:
    import gdown  # type: ignore
except Exception:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "gdown"])
    import gdown  # type: ignore

# Paths
project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
data_dir.mkdir(parents=True, exist_ok=True)
fairface_csv = data_dir / "fairface.csv"

# Google Drive file IDs from the FairFace GitHub page:
#   Train labels: fairface_label_train.csv
#   Val labels:   fairface_label_val.csv
TRAIN_ID = "1i1L3Yqwaio7YSOCj7ftgk8ZZchPG7dmH"
VAL_ID   = "1wOdja-ezstMEp81tX1a-EYkFebev4h7D"

train_path = data_dir / "fairface_label_train.csv"
val_path   = data_dir / "fairface_label_val.csv"

# Download if not present locally
if not train_path.exists():
    gdown.download(id=TRAIN_ID, output=str(train_path), quiet=False)
if not val_path.exists():
    gdown.download(id=VAL_ID, output=str(val_path), quiet=False)

# Load and clean
def load_and_tag(p: Path, split_name: str) -> pd.DataFrame:
    df = pd.read_csv(p)
    # Normalize headers: lowercase, underscores
    df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]
    # Standardize expected column names
    # Common FairFace label columns include: file, gender, race, age, service_test
    rename_map = {"file": "image_file"}
    df = df.rename(columns=rename_map)
    df["split"] = split_name
    return df

df_train = load_and_tag(train_path, "train")
df_val   = load_and_tag(val_path, "val")

# Concatenate and keep a consistent column order
df = pd.concat([df_train, df_val], ignore_index=True)
candidate_cols = ["image_file", "age", "gender", "race", "service_test", "split"]
cols = [c for c in candidate_cols if c in df.columns]
df = df[cols].copy()

# Save consolidated metadata
df.to_csv(fairface_csv, index=False)
print(f"Saved: {fairface_csv}")
print("Shape:", df.shape)
print("Columns:", list(df.columns))

# Simple balance check (no images required)
if {"race","gender"}.issubset(df.columns):
    grp = (df["race"].astype(str) + " | " + df["gender"].astype(str)).value_counts()
    frac = (grp / grp.sum()).round(4)
    out = pd.DataFrame({"count": grp, "fraction": frac})
    print("\nRace | Gender balance (top 12):")
    print(out.head(12))


Downloading...
From: https://drive.google.com/uc?id=1i1L3Yqwaio7YSOCj7ftgk8ZZchPG7dmH
To: C:\Users\hana1\Documents\iva-bias-project\data\fairface_label_train.csv
100%|█████████████████████████████████████████████████████████████████████████████| 3.79M/3.79M [00:00<00:00, 3.81MB/s]
Downloading...
From: https://drive.google.com/uc?id=1wOdja-ezstMEp81tX1a-EYkFebev4h7D
To: C:\Users\hana1\Documents\iva-bias-project\data\fairface_label_val.csv
100%|███████████████████████████████████████████████████████████████████████████████| 448k/448k [00:00<00:00, 2.10MB/s]


Saved: C:\Users\hana1\Documents\iva-bias-project\data\fairface.csv
Shape: (97698, 6)
Columns: ['image_file', 'age', 'gender', 'race', 'service_test', 'split']

Race | Gender balance (top 12):
                          count  fraction
White | Male               9823    0.1005
White | Female             8789    0.0900
Latino_Hispanic | Female   7545    0.0772
Latino_Hispanic | Male     7445    0.0762
Middle Eastern | Male      7182    0.0735
Indian | Male              7163    0.0733
East Asian | Male          6923    0.0709
East Asian | Female        6914    0.0708
Black | Male               6895    0.0706
Black | Female             6894    0.0706
Indian | Female            6672    0.0683
Southeast Asian | Male     6347    0.0650


In [6]:
# FairFace — model-ready export and EDA
# - Input:  data/fairface.csv
# - Output: data/fairface_model.csv  (image_file, age, gender, race, split)
# - EDA:    results/eda/fairface_race_balance.csv
#           results/eda/fairface_gender_balance.csv
#           results/eda/fairface_age_balance.csv
#           results/eda/fairface_race_gender_balance.csv

from pathlib import Path
import pandas as pd

project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
results_eda = project_root / "results" / "eda"
results_eda.mkdir(parents=True, exist_ok=True)

src = data_dir / "fairface.csv"
dst = data_dir / "fairface_model.csv"

df = pd.read_csv(src)

# Keep key demographic fields (images are not downloaded; this is metadata-only)
keep_cols = [c for c in ["image_file", "age", "gender", "race", "split"] if c in df.columns]
clean = df[keep_cols].dropna().reset_index(drop=True)
clean.to_csv(dst, index=False)

def counts_and_frac(s: pd.Series) -> pd.DataFrame:
    vc = s.value_counts(dropna=False)
    frac = (vc / vc.sum()).round(4)
    return pd.DataFrame({"count": vc, "fraction": frac})

# EDA exports
if "race" in clean.columns:
    counts_and_frac(clean["race"]).to_csv(results_eda / "fairface_race_balance.csv")
if "gender" in clean.columns:
    counts_and_frac(clean["gender"]).to_csv(results_eda / "fairface_gender_balance.csv")
if "age" in clean.columns:
    counts_and_frac(clean["age"]).to_csv(results_eda / "fairface_age_balance.csv")
if {"race", "gender"}.issubset(clean.columns):
    rg = (clean["race"].astype(str) + " | " + clean["gender"].astype(str)).rename("race|gender")
    counts_and_frac(rg).to_csv(results_eda / "fairface_race_gender_balance.csv")

print(f"Saved model-ready file: {dst}  | rows={len(clean)}  cols={len(clean.columns)}")
print("EDA summaries written to:", results_eda)


Saved model-ready file: C:\Users\hana1\Documents\iva-bias-project\data\fairface_model.csv  | rows=97698  cols=5
EDA summaries written to: C:\Users\hana1\Documents\iva-bias-project\results\eda
