### Bias & Fairness in Data: Bias Mitigation Techniques
**Question**: Use the Adult Income dataset and apply reweighing technique to balance the
class weights based on sensitive attributes (e.g., gender).

In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Load Adult dataset from UCI (sampled smaller for speed)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
cols = [
    "age","workclass","fnlwgt","education","education-num",
    "marital-status","occupation","relationship","race","sex",
    "capital-gain","capital-loss","hours-per-week","native-country","income"
]
data = pd.read_csv(url, header=None, names=cols, na_values=" ?", skipinitialspace=True)

# Drop rows with missing values
data.dropna(inplace=True)

# Encode target and sensitive attribute (sex)
data['income'] = data['income'].apply(lambda x: 1 if x == '>50K' else 0)
data['sex'] = data['sex'].apply(lambda x: 1 if x == 'Male' else 0)

# Calculate weights to balance sex distribution relative to income
# Calculate P(sex, income) and then weights = 1 / P(sex, income)
counts = data.groupby(['sex', 'income']).size()
total = len(data)
joint_prob = counts / total
weights = 1 / joint_prob

# Map weights back to rows
data['weight'] = data.apply(lambda row: weights.loc[(row['sex'], row['income'])], axis=1)

# Features: just numeric + one-hot encoding for simplicity
X = pd.get_dummies(data.drop(columns=['income', 'weight']), drop_first=True)
y = data['income']
w = data['weight']

# Train-test split
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, w, test_size=0.3, random_state=42)

# Train Logistic Regression with sample weights
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train, sample_weight=w_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Show distribution of predicted income by sex to assess bias mitigation
pred_df = pd.DataFrame({'sex': data.loc[X_test.index, 'sex'], 'pred': y_pred})
print(pred_df.groupby('sex')['pred'].mean())
#

              precision    recall  f1-score   support

           0       0.91      0.81      0.86      7455
           1       0.55      0.75      0.63      2314

    accuracy                           0.80      9769
   macro avg       0.73      0.78      0.75      9769
weighted avg       0.83      0.80      0.81      9769

sex
0    0.252934
1    0.356760
Name: pred, dtype: float64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
