# 1. Adult Income dataset

In [2]:
# Adult Income Dataset (UCI Census Income dataset)
# - Task: predict whether an individual's income exceeds $50K/year based on census attributes.
# - Importance: widely used in bias/fairness studies; known gender and race disparities.
# - Source: UCI ML Repository, mirrored on OpenML.
# - Behavior: load local data/adult.csv if present; otherwise fetch from OpenML.
#   In both cases, normalize the target column to 'income' and strip whitespace.

from pathlib import Path
import pandas as pd
from sklearn.datasets import fetch_openml

# Resolve the project root: two levels above this file when run as a script
# (``__file__`` defined), otherwise the parent of the current working directory.
project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
data_dir.mkdir(parents=True, exist_ok=True)
adult_csv = data_dir / "adult.csv"

# Load the local copy when there is one; otherwise fetch from OpenML.
if adult_csv.exists():
    df = pd.read_csv(adult_csv)
    print(f"Loaded existing: {adult_csv}")
else:
    adult = fetch_openml(name="adult", version=2, as_frame=True)
    df = adult.frame.copy()
    print("Fetched Adult Income from OpenML.")

# Normalize target column name and values
if "income" not in df.columns and "class" in df.columns:
    df = df.rename(columns={"class": "income"})
if "income" in df.columns:
    # Remember which rows really have a target: astype(str) would otherwise turn
    # a missing value into the literal string "nan", i.e. a bogus third class.
    income_present = df["income"].notna()
    df["income"] = (
        df["income"]
        .astype(str)
        .str.strip()
        # Copies built from the UCI files carry test-split labels with a trailing
        # period (">50K."); fold them into the canonical ">50K" / "<=50K".
        .str.rstrip(".")
        .where(income_present)
    )

# Save normalized copy (also rewrites an already-existing local file in place)
df.to_csv(adult_csv, index=False)
print(f"Saved normalized dataset: {adult_csv}")

# Quick overview
print("Shape:", df.shape)
print("Columns:", list(df.columns)[:15], "...")
if "income" in df.columns:
    print(df["income"].value_counts())


Loaded existing: C:\Users\hana1\Documents\iva-bias-project\data\adult.csv
Saved normalized dataset: C:\Users\hana1\Documents\iva-bias-project\data\adult.csv
Shape: (48842, 15)
Columns: ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'] ...
income
<=50K    37155
>50K     11687
Name: count, dtype: int64


In [3]:
# Prepare a model-ready Adult Income CSV and minimal EDA summaries.
# - Input:  data/adult.csv  (raw, normalized to have 'income')
# - Output: data/adult_model.csv  (selected features + binary target 'label')
# - EDA:    results/eda/* balance tables for target and sensitive attributes

from pathlib import Path
import pandas as pd


# Resolve the project root: two levels above this file when run as a script
# (``__file__`` defined), otherwise the parent of the current working directory.
project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
data_dir.mkdir(parents=True, exist_ok=True)

results_eda = project_root / "results" / "eda"
results_eda.mkdir(parents=True, exist_ok=True)

src = data_dir / "adult.csv"
dst = data_dir / "adult_model.csv"

# Load raw (normalized) Adult dataset.
# UCI-derived copies encode missing values as "?" (usually preceded by a space);
# read them as NaN so that the dropna() below really removes incomplete rows.
df = pd.read_csv(src, na_values=["?"], skipinitialspace=True)

# Standardize target column
if "income" not in df.columns and "class" in df.columns:
    df = df.rename(columns={"class": "income"})
if "income" not in df.columns:
    raise KeyError(f"Expected an 'income' (or 'class') column in {src}; got {list(df.columns)}")

# A row without a target cannot be labelled. Drop it instead of letting
# astype(str) turn the missing value into "nan", which would become label 0.
df = df.dropna(subset=["income"]).copy()
# Trailing periods come from the UCI test split (">50K."); strip them so the
# equality test below recognises those rows.
df["income"] = df["income"].astype(str).str.strip().str.rstrip(".")

# Surface anything that is not one of the two known classes: such rows would
# otherwise be counted as label 0 without any indication.
unexpected_income = sorted(set(df["income"]) - {"<=50K", ">50K"})
if unexpected_income:
    print(f"Warning: unexpected income values will be labelled 0: {unexpected_income}")

# Binary target: 1 iff income > 50K, else 0
df["label"] = (df["income"] == ">50K").astype(int)

# Feature selection (exclude technical fields like 'fnlwgt')
candidate_features = [
    "age", "workclass", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
]
# Keep only the candidates that are actually present in this copy of the data.
features = [c for c in candidate_features if c in df.columns]
cols = features + ["label"]

# Complete cases only, re-indexed from 0, written as the model-ready file.
clean = df[cols].dropna().reset_index(drop=True)
clean.to_csv(dst, index=False)

# Minimal EDA summaries (counts and fractions)
def counts_and_frac(s: pd.Series) -> pd.DataFrame:
    """Tabulate how often each distinct value of ``s`` occurs.

    Missing values are counted as a category of their own. The result is
    indexed by value and has two columns: ``count`` (number of occurrences)
    and ``fraction`` (share of all rows, rounded to 4 decimals).
    """
    tally = s.value_counts(dropna=False)
    total = tally.sum()
    share = tally.div(total).round(4)
    return pd.DataFrame({"count": tally, "fraction": share})

# Write the balance tables: the target first, then every sensitive attribute
# that is present in the model-ready frame.
counts_and_frac(clean["label"]).to_csv(results_eda / "adult_target_balance.csv")
sensitive_balance_files = {
    "sex": "adult_sensitive_sex_balance.csv",
    "race": "adult_sensitive_race_balance.csv",
}
for sensitive_attr, balance_file in sensitive_balance_files.items():
    if sensitive_attr in clean.columns:
        counts_and_frac(clean[sensitive_attr]).to_csv(results_eda / balance_file)

# Report what was produced and where.
print(f"Saved model-ready file: {dst}  | rows={len(clean)}  cols={len(clean.columns)}")
print("EDA summaries written to:", results_eda)


Saved model-ready file: C:\Users\hana1\Documents\iva-bias-project\data\adult_model.csv  | rows=45222  cols=14
EDA summaries written to: C:\Users\hana1\Documents\iva-bias-project\results\eda


# 2. COMPAS dataset


In [4]:
# COMPAS Recidivism dataset (ProPublica)
# - Task: predict whether a defendant re-offends within two years.
# - Importance: benchmark dataset for fairness studies due to racial disparities.
# - Source: ProPublica GitHub (https://github.com/propublica/compas-analysis)
# - Behavior: download CSV if not already saved locally, normalize columns, and save to data/compas.csv.

from pathlib import Path
import pandas as pd

# Resolve the project root: two levels above this file when run as a script
# (``__file__`` defined), otherwise the parent of the current working directory.
project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
data_dir.mkdir(parents=True, exist_ok=True)
compas_csv = data_dir / "compas.csv"

if compas_csv.exists():
    df = pd.read_csv(compas_csv)
    print(f"Loaded existing: {compas_csv}")
else:
    url = "https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv"
    df = pd.read_csv(url)
    # Cache the raw download right away, before any normalization can fail.
    df.to_csv(compas_csv, index=False)
    print(f"Downloaded & saved: {compas_csv}")

# Standardize key columns
if "two_year_recid" not in df.columns and "is_recid" in df.columns:
    # Fallback only: 'is_recid' stands in when the two-year column is absent.
    df = df.rename(columns={"is_recid": "two_year_recid"})
if "two_year_recid" not in df.columns:
    raise KeyError(f"Expected a 'two_year_recid' (or 'is_recid') column; got {list(df.columns)}")

# Keep only rows with a real binary outcome. Filling missing or unparseable
# targets with 0 would invent "did not re-offend" labels, and non-binary codes
# (e.g. a -1 "unknown" marker) must not be carried into a 0/1 target.
recid_numeric = pd.to_numeric(df["two_year_recid"], errors="coerce")
recid_valid = recid_numeric.isin([0, 1])
n_invalid_target = int((~recid_valid).sum())
if n_invalid_target:
    print(f"Dropped {n_invalid_target} rows with missing/invalid two_year_recid.")
df = df.loc[recid_valid].copy()
df["two_year_recid"] = recid_numeric.loc[recid_valid].astype(int)

for col in ["race", "sex"]:
    if col in df.columns:
        # Preserve missing values: astype(str) alone would turn them into "nan".
        col_present = df[col].notna()
        df[col] = df[col].astype(str).str.strip().where(col_present)

df.to_csv(compas_csv, index=False)
print(f"Saved normalized dataset: {compas_csv}")

# Quick overview: shape, leading columns, then count/fraction tables.
print("Shape:", df.shape)
print("Columns:", list(df.columns)[:15], "...")
print("\nTarget distribution (two_year_recid):")
print(df["two_year_recid"].value_counts().sort_index().to_frame("count").assign(
    fraction=lambda x: (x["count"] / x["count"].sum()).round(4)
))
for col, title in (("sex", "Sex"), ("race", "Race")):
    if col in df.columns:
        print(f"\n{title} distribution:")
        print(df[col].value_counts().to_frame("count").assign(
            fraction=lambda x: (x["count"] / x["count"].sum()).round(4)
        ))


Loaded existing: C:\Users\hana1\Documents\iva-bias-project\data\compas.csv
Saved normalized dataset: C:\Users\hana1\Documents\iva-bias-project\data\compas.csv
Shape: (7214, 53)
Columns: ['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob', 'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count', 'priors_count'] ...

Target distribution (two_year_recid):
                count  fraction
two_year_recid                 
0                3963    0.5493
1                3251    0.4507

Sex distribution:
        count  fraction
sex                    
Male     5819    0.8066
Female   1395    0.1934

Race distribution:
                  count  fraction
race                             
African-American   3696    0.5123
Caucasian          2454    0.3402
Hispanic            637    0.0883
Other               377    0.0523
Asian                32    0.0044
Native American      18    0.0025


In [5]:
# Prepare a model-ready COMPAS CSV and minimal EDA summaries.
# - Input:  data/compas.csv  (raw, normalized to have 'two_year_recid')
# - Output: data/compas_model.csv  (selected features + binary target 'label')
# - EDA:    results/eda/* balance tables for target and sensitive attributes

from pathlib import Path
import pandas as pd

# Resolve the project root: two levels above this file when run as a script
# (``__file__`` defined), otherwise the parent of the current working directory.
project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
results_eda = project_root / "results" / "eda"
results_eda.mkdir(parents=True, exist_ok=True)

src = data_dir / "compas.csv"
dst = data_dir / "compas_model.csv"

df = pd.read_csv(src)

# Ensure target column exists and standardize name
if "two_year_recid" not in df.columns and "is_recid" in df.columns:
    # Fallback only: 'is_recid' stands in when the two-year column is absent.
    df = df.rename(columns={"is_recid": "two_year_recid"})
if "two_year_recid" not in df.columns:
    raise KeyError(f"Expected a 'two_year_recid' (or 'is_recid') column in {src}; got {list(df.columns)}")

# Keep only rows with a real binary outcome. Filling missing or unparseable
# targets with 0 would invent negative labels, and any value other than 0/1
# would leak into 'label' below and break the binary-target contract.
recid_numeric = pd.to_numeric(df["two_year_recid"], errors="coerce")
recid_valid = recid_numeric.isin([0, 1])
n_invalid_target = int((~recid_valid).sum())
if n_invalid_target:
    print(f"Dropped {n_invalid_target} rows with missing/invalid two_year_recid.")
df = df.loc[recid_valid].copy()
df["two_year_recid"] = recid_numeric.loc[recid_valid].astype(int)

# Binary target: 1 if re-offended within two years, 0 otherwise
df["label"] = df["two_year_recid"]

# Feature selection (exclude identifiers like name, id, dob, etc.)
candidate_features = [
    "sex", "race", "age", "age_cat", "priors_count",
    "juv_fel_count", "juv_misd_count", "juv_other_count", "decile_score",
]
# Keep only the candidates that are actually present in this copy of the data.
features = [c for c in candidate_features if c in df.columns]
cols = features + ["label"]

# Complete cases only, re-indexed from 0, written as the model-ready file.
clean = df[cols].dropna().reset_index(drop=True)
clean.to_csv(dst, index=False)

# Minimal EDA summaries: counts and fractions for target/sensitive attributes
def counts_and_frac(s: pd.Series) -> pd.DataFrame:
    """Tabulate how often each distinct value of ``s`` occurs.

    Missing values are counted as a category of their own. The result is
    indexed by value and has two columns: ``count`` (number of occurrences)
    and ``fraction`` (share of all rows, rounded to 4 decimals).
    """
    tally = s.value_counts(dropna=False)
    total = tally.sum()
    share = tally.div(total).round(4)
    return pd.DataFrame({"count": tally, "fraction": share})

# Write the balance tables: the target first, then every sensitive attribute
# that is present in the model-ready frame.
counts_and_frac(clean["label"]).to_csv(results_eda / "compas_target_balance.csv")
sensitive_balance_files = {
    "sex": "compas_sensitive_sex_balance.csv",
    "race": "compas_sensitive_race_balance.csv",
}
for sensitive_attr, balance_file in sensitive_balance_files.items():
    if sensitive_attr in clean.columns:
        counts_and_frac(clean[sensitive_attr]).to_csv(results_eda / balance_file)

# Report what was produced and where.
print(f"Saved model-ready file: {dst}  | rows={len(clean)}  cols={len(clean.columns)}")
print("EDA summaries written to:", results_eda)


Saved model-ready file: C:\Users\hana1\Documents\iva-bias-project\data\compas_model.csv  | rows=7214  cols=10
EDA summaries written to: C:\Users\hana1\Documents\iva-bias-project\results\eda


# 3. German Credit dataset

In [6]:
# German Credit Risk dataset
# - Task: predict whether a loan applicant is a good or bad credit risk (OpenML labels: 'good'/'bad'; the original UCI file codes them as 1/2).
# - Importance: widely used in fairness studies (bias across gender, age, etc.).
# - Source: UCI / OpenML.
# - Behavior: load local data/german.csv if present; otherwise fetch from OpenML.
#   Standardizes target column to 'credit_risk'.

from pathlib import Path
import pandas as pd
from sklearn.datasets import fetch_openml

# Resolve the project root: two levels above this file when run as a script
# (``__file__`` defined), otherwise the parent of the current working directory.
project_root = Path(__file__).resolve().parent.parent if "__file__" in globals() else Path.cwd().resolve().parent
data_dir = project_root / "data"
data_dir.mkdir(parents=True, exist_ok=True)
german_csv = data_dir / "german.csv"

if german_csv.exists():
    df = pd.read_csv(german_csv)
    print(f"Loaded existing: {german_csv}")
else:
    german = fetch_openml(name="credit-g", version=1, as_frame=True)
    df = german.frame.copy()
    # Cache the raw download right away, before any normalization can fail.
    df.to_csv(german_csv, index=False)
    print(f"Fetched German Credit from OpenML and saved to: {german_csv}")

# Normalize target
if "credit_risk" not in df.columns and "class" in df.columns:
    df = df.rename(columns={"class": "credit_risk"})
if "credit_risk" not in df.columns:
    raise KeyError(f"Expected a 'credit_risk' (or 'class') column; got {list(df.columns)}")

# Strip whitespace but keep missing targets missing: astype(str) alone would
# turn them into the literal category "nan".
risk_present = df["credit_risk"].notna()
df["credit_risk"] = df["credit_risk"].astype(str).str.strip().where(risk_present)

df.to_csv(german_csv, index=False)
print(f"Saved normalized dataset: {german_csv}")

# Quick overview
print("Shape:", df.shape)
print("Columns:", list(df.columns)[:15], "...")
print("\nTarget distribution (credit_risk):")
print(df["credit_risk"].value_counts().to_frame("count").assign(
    fraction=lambda x: (x["count"] / x["count"].sum()).round(4)
))


Loaded existing: C:\Users\hana1\Documents\iva-bias-project\data\german.csv
Saved normalized dataset: C:\Users\hana1\Documents\iva-bias-project\data\german.csv
Shape: (1000, 21)
Columns: ['checking_status', 'duration', 'credit_history', 'purpose', 'credit_amount', 'savings_status', 'employment', 'installment_commitment', 'personal_status', 'other_parties', 'residence_since', 'property_magnitude', 'age', 'other_payment_plans', 'housing'] ...

Target distribution (credit_risk):
             count  fraction
credit_risk                 
good           700       0.7
bad            300       0.3
