### Bias & Fairness in Data: Bias Mitigation Techniques
**Question**: Use the Adult Income dataset and apply reweighing technique to balance the
class weights based on sensitive attributes (e.g., gender).

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from aif360.datasets import BinaryLabelDataset
from aif360.algorithms.preprocessing import Reweighing
from aif360.metrics import BinaryLabelDatasetMetric

# Step 1: Load the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
           'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
           'hours-per-week', 'native-country', 'income']

df = pd.read_csv(url, names=columns, sep=',\s*', engine='python')
df = df.dropna()

# Step 2: Encode categorical variables
for col in df.select_dtypes(include='object'):
    df[col] = LabelEncoder().fit_transform(df[col])

# Step 3: Define features and labels
X = df.drop('income', axis=1)
y = df['income']  # 0: <=50K, 1: >50K

# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 5: Combine for AIF360
train_df = X_train.copy()
train_df['income'] = y_train

test_df = X_test.copy()
test_df['income'] = y_test

# Step 6: Create BinaryLabelDataset for training data
dataset_orig_train = BinaryLabelDataset(df=train_df,
                                        label_names=['income'],
                                        protected_attribute_names=['sex'])

dataset_orig_test = BinaryLabelDataset(df=test_df,
                                       label_names=['income'],
                                       protected_attribute_names=['sex'])

# Define privileged and unprivileged groups
privileged_groups = [{'sex': 1}]   # male
unprivileged_groups = [{'sex': 0}] # female

# Step 7: Apply Reweighing
RW = Reweighing(unprivileged_groups=unprivileged_groups,
                privileged_groups=privileged_groups)
dataset_transf_train = RW.fit_transform(dataset_orig_train)

# Step 8: Train Logistic Regression with and without reweighing

def train_and_evaluate(X_train, y_train, sample_weight, X_test, y_test, label=""):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_scaled, y_train, sample_weight=sample_weight)
    y_pred = model.predict(X_test_scaled)

    print(f"\nClassification Report {label}:")
    print(classification_report(y_test, y_pred))

# Without reweighing
train_and_evaluate(X_train, y_train, None, X_test, y_test, label="(Original)")

# With reweighing
train_and_evaluate(X_train, y_train, dataset_transf_train.instance_weights,
                   X_test, y_test, label="(With Reweighing)")

# Step 9: Check fairness metric
metric_orig = BinaryLabelDatasetMetric(dataset_orig_train, 
                                       unprivileged_groups=unprivileged_groups,
                                       privileged_groups=privileged_groups)

metric_rw = BinaryLabelDatasetMetric(dataset_transf_train, 
                                     unprivileged_groups=unprivileged_groups,
                                     privileged_groups=privileged_groups)

print("\nFairness Metrics:")
print("Mean difference before reweighing:", metric_orig.mean_difference())
print("Mean difference after reweighing:", metric_rw.mean_difference())



Classification Report (Original):
              precision    recall  f1-score   support

           0       0.85      0.94      0.89      7455
           1       0.71      0.45      0.55      2314

    accuracy                           0.83      9769
   macro avg       0.78      0.70      0.72      9769
weighted avg       0.81      0.83      0.81      9769


Classification Report (With Reweighing):
              precision    recall  f1-score   support

           0       0.83      0.94      0.89      7455
           1       0.69      0.39      0.50      2314

    accuracy                           0.81      9769
   macro avg       0.76      0.67      0.69      9769
weighted avg       0.80      0.81      0.79      9769


Fairness Metrics:
Mean difference before reweighing: -0.19585191128843257
Mean difference after reweighing: 0.0
