# 5243 project4

## 1. Data pre-processing

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from scipy import optimize
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from scipy.optimize import minimize

In [6]:
# Load Dataset
url = 'https://raw.githubusercontent.com/LeeMere/ADS-Spring2024-Project4-MachineLearningFairness-Group7/main/data/compas-scores-two-years.csv'
df_raw = pd.read_csv(url)

In [7]:
df_raw.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0


In [8]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 53 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       7214 non-null   int64  
 1   name                     7214 non-null   object 
 2   first                    7214 non-null   object 
 3   last                     7214 non-null   object 
 4   compas_screening_date    7214 non-null   object 
 5   sex                      7214 non-null   object 
 6   dob                      7214 non-null   object 
 7   age                      7214 non-null   int64  
 8   age_cat                  7214 non-null   object 
 9   race                     7214 non-null   object 
 10  juv_fel_count            7214 non-null   int64  
 11  decile_score             7214 non-null   int64  
 12  juv_misd_count           7214 non-null   int64  
 13  juv_other_count          7214 non-null   int64  
 14  priors_count            

In [10]:
# We can see there are some missing data exist, check with the missing data

missing_data = pd.DataFrame(df_raw.isna().sum()).sort_values(by = 0, ascending=False)
missing_data.columns = ["count"]
missing_data[missing_data["count"]!=0]

Unnamed: 0,count
violent_recid,7214
vr_charge_degree,6395
vr_case_number,6395
vr_offense_date,6395
vr_charge_desc,6395
c_arrest_date,6077
r_jail_out,4898
r_jail_in,4898
r_days_from_arrest,4898
r_charge_desc,3801


In [11]:
# Filter the DataFrame for the two races
df_filtered = df_raw[df_raw['race'].isin(['Caucasian', 'African-American'])]

In [12]:
# Feature selection
# Since we do not use the columns with missing data, we can ignore that
features = ['age', 'race', 'sex', 'decile_score', 'priors_count']  # Example features
X = df_filtered[features]
y = df_filtered['two_year_recid']

In [13]:
# Check missing values again
print("Missing values in the dataset:")
print(X.isnull().sum())
print(y.isnull().sum())

Missing values in the dataset:
age             0
race            0
sex             0
decile_score    0
priors_count    0
dtype: int64
0


In [14]:
# Map variable to a binary variable
X['race'] = X['race'].map({'Caucasian': 1, 'African-American': 0})
X['sex'] = X['sex'].map({'Female': 0, 'Male': 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['race'] = X['race'].map({'Caucasian': 1, 'African-American': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sex'] = X['sex'].map({'Female': 0, 'Male': 1})


In [15]:
X.head()

Unnamed: 0,age,race,sex,decile_score,priors_count
1,34,0,1,3,0
2,24,0,1,4,4
3,23,0,1,8,1
6,41,1,1,6,14
8,39,1,0,1,0


In [16]:
# Check the lengths of X and y to make sure they match
assert len(X) == len(y), "The lengths of X and y do not match."

In [17]:
df = pd.concat([X, y], axis=1)
df.head()

Unnamed: 0,age,race,sex,decile_score,priors_count,two_year_recid
1,34,0,1,3,0,1
2,24,0,1,4,4,1
3,23,0,1,8,1,0
6,41,1,1,6,14,1
8,39,1,0,1,0,0


In [18]:
#data splitting for baseline model
train_size = int(len(df) * 0.714)
remainder_size = int(len(df) * 0.143)

train = df[:train_size]
remainder = df[train_size:]

validation = remainder[:remainder_size]
test = remainder[remainder_size:]

label = "two_year_recid"
sensitive = "race"
features = ['age', 'sex', 'decile_score', 'priors_count']
features_race = ['race', 'age', 'race', 'sex', 'decile_score', 'priors_count']

x_train, y_train, race_train = train[features], train[label].to_numpy(), train[sensitive]
x_val, y_val, race_val = validation[features], validation[label].to_numpy(), validation[sensitive]
x_test, y_test, race_test = test[features], test[label].to_numpy(), test[sensitive]

x_train_race, x_test_race, x_val_race = train[features_race], test[features_race], validation[features_race]

## 2. Baseline Model - Logistic Regression (Without Constraints)

In [19]:
bsl = LogisticRegression().fit(x_train, y_train)

In [20]:
def calc_calibration(sensitive_attr, y_pred, y_true):
    cau_index = np.where(sensitive_attr == 1)[0]
    african_index = np.where(sensitive_attr == 0)[0]

    y_pred_cau = y_pred[cau_index]
    y_true_cau = y_true[cau_index]
    Acc_cau = sum(y_pred_cau == y_true_cau)/len(y_pred_cau)

    y_pred_african = y_pred[african_index]
    y_true_african = y_true[african_index]
    Acc_african = sum(y_pred_african == y_true_african)/len(y_pred_african)

    calibration = abs(Acc_cau - Acc_african)
    return(calibration)

In [21]:
#evaluating the baseline model
summary_bsl = {"Set": ["Train", "Validation", "Test"],
               "Accuracy":  [bsl.score(x_train, y_train), bsl.score(x_val, y_val), bsl.score(x_test, y_test)],
               "Calibration": [calc_calibration(race_train, bsl.predict(x_train), y_train),
                               calc_calibration(race_val, bsl.predict(x_val), y_val),
                               calc_calibration(race_test, bsl.predict(x_test), y_test)]}
pd.DataFrame(summary_bsl)

Unnamed: 0,Set,Accuracy,Calibration
0,Train,0.680938,0.012042
1,Validation,0.682594,0.005245
2,Test,0.65,0.047773


## Algorithm 2: Learning Fair Representations

In [22]:
# data split for LFR
df_filtered = df_raw[df_raw['race'].isin(['Caucasian', 'African-American'])]

# Perform data splitting
train_size = int(len(df_filtered) * 0.714)
remainder_size = int(len(df_filtered) * 0.143)

train = df_filtered[:train_size]
remainder = df_filtered[train_size:]
validation = remainder[:remainder_size]
test = remainder[remainder_size:]

# Define the features, label, and sensitive attribute
features = ['age', 'sex', 'decile_score', 'priors_count']
label = "two_year_recid"
sensitive = "race"

# Encode categorical variables
train['sex'] = train['sex'].map({'Female': 0, 'Male': 1})
validation['sex'] = validation['sex'].map({'Female': 0, 'Male': 1})
test['sex'] = test['sex'].map({'Female': 0, 'Male': 1})

train['race'] = train['race'].map({'African-American': 0, 'Caucasian': 1})
validation['race'] = validation['race'].map({'African-American': 0, 'Caucasian': 1})
test['race'] = test['race'].map({'African-American': 0, 'Caucasian': 1})

# Define X_train, X_val, X_test, y_train, y_val, y_test, sensitive_train, sensitive_val, sensitive_test
X_train, y_train, sensitive_train = train[features].values, train[label].values, train[sensitive].values
X_val, y_val, sensitive_val = validation[features].values, validation[label].values, validation[sensitive].values
X_test, y_test, sensitive_test = test[features].values, test[label].values, test[sensitive].values

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 4391
Validation set size: 879
Testing set size: 880


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['sex'] = train['sex'].map({'Female': 0, 'Male': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation['sex'] = validation['sex'].map({'Female': 0, 'Male': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['sex'] = test['sex'].map({'Female': 0, 'Male': 1})
A value is trying to be

In [23]:
# Define the distance function using weighted Euclidean distance
def weighted_euclidean_distance(x, v, alpha):
    return np.sqrt(np.sum(alpha * (x - v)**2, axis=-1))

# Define the probability matrix calculation function
def calculate_probability_matrix(X, V, alpha):
    num_samples, num_features = X.shape
    num_prototypes = V.shape[0]
    P = np.zeros((num_samples, num_prototypes))
    for i in range(num_samples):
        distances = weighted_euclidean_distance(X[i], V, alpha)
        P[i] = np.exp(-distances) / np.sum(np.exp(-distances))
    return P

In [24]:
# Define the function to calculate the reconstruction of X and the associated loss
def reconstruct_x_and_loss(X, P, V):
    X_hat = np.dot(P, V)  # Reconstruction of X
    Loss_X = np.mean((X - X_hat) ** 2)  # Reconstruction loss
    return X_hat, Loss_X

# Define the function to calculate the prediction of Y and the associated loss
def predict_y_and_loss(P, w, y):
    y_hat = np.dot(P, w)  # Prediction of Y
    Loss_Y = -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))  # Prediction loss
    return y_hat, Loss_Y

In [25]:
# Define the objective function for LFR
def LFR_objective(params, X_sensitive, X_nonsensitive, y_sensitive, y_nonsensitive, num_prototypes, A_z, A_x, A_y):
    num_features = X_sensitive.shape[1]
    # Unpack parameters
    alpha_sensitive = params[:num_features]
    alpha_nonsensitive = params[num_features:2*num_features]
    w = params[2*num_features:(2*num_features + num_prototypes)]
    V = params[-(num_features * num_prototypes):].reshape((num_prototypes, num_features))

    # Calculate probability matrices
    P_sensitive = calculate_probability_matrix(X_sensitive, V, alpha_sensitive)
    P_nonsensitive = calculate_probability_matrix(X_nonsensitive, V, alpha_nonsensitive)

    # Calculate losses for sensitive group
    X_hat_sensitive, Loss_X_sensitive = reconstruct_x_and_loss(X_sensitive, P_sensitive, V)
    y_hat_sensitive, Loss_Y_sensitive = predict_y_and_loss(P_sensitive, w, y_sensitive)

    # Calculate losses for nonsensitive group
    X_hat_nonsensitive, Loss_X_nonsensitive = reconstruct_x_and_loss(X_nonsensitive, P_nonsensitive, V)
    y_hat_nonsensitive, Loss_Y_nonsensitive = predict_y_and_loss(P_nonsensitive, w, y_nonsensitive)

    # Calculate fairness loss
    M_k_sensitive = P_sensitive.mean(axis=0)
    M_k_nonsensitive = P_nonsensitive.mean(axis=0)
    Loss_Z = np.abs(M_k_sensitive - M_k_nonsensitive).sum()

    # Combine losses into final objective
    objective = A_z * Loss_Z + A_x * (Loss_X_sensitive + Loss_X_nonsensitive) + A_y * (Loss_Y_sensitive + Loss_Y_nonsensitive)
    return objective

In [26]:
# Set the hyperparameters
num_prototypes = 10
A_z = 1
A_x = 0.01
A_y = 1

# Initialize parameters
num_features = X_train.shape[1]
alpha_sensitive = np.ones(num_features)
alpha_nonsensitive = np.ones(num_features)
w = np.random.rand(num_prototypes)
V = np.random.rand(num_prototypes, num_features)
params_init = np.concatenate((alpha_sensitive, alpha_nonsensitive, w, V.flatten()))

# Split the data into sensitive and nonsensitive groups
X_sensitive_train = X_train[sensitive_train == 1]
X_nonsensitive_train = X_train[sensitive_train == 0]
y_sensitive_train = y_train[sensitive_train == 1]
y_nonsensitive_train = y_train[sensitive_train == 0]

In [27]:
# Train the LFR model
result = minimize(LFR_objective, params_init, args=(X_sensitive_train, X_nonsensitive_train, y_sensitive_train, y_nonsensitive_train, num_prototypes, A_z, A_x, A_y), method='L-BFGS-B')
params_opt = result.x

# Unpack optimized parameters
alpha_sensitive_opt = params_opt[:num_features]
alpha_nonsensitive_opt = params_opt[num_features:2*num_features]
w_opt = params_opt[2*num_features:(2*num_features + num_prototypes)]
V_opt = params_opt[-(num_features * num_prototypes):].reshape((num_prototypes, num_features))

# Make predictions on validation and test sets
P_val = calculate_probability_matrix(X_val, V_opt, alpha_sensitive_opt if sensitive_val.mean() > 0.5 else alpha_nonsensitive_opt)
y_pred_val = np.round(np.dot(P_val, w_opt))

P_test = calculate_probability_matrix(X_test, V_opt, alpha_sensitive_opt if sensitive_test.mean() > 0.5 else alpha_nonsensitive_opt)
y_pred_test = np.round(np.dot(P_test, w_opt))

# Evaluate the model
accuracy_val = accuracy_score(y_val, y_pred_val)
calibration_val = calc_calibration(sensitive_val, y_pred_val, y_val)

accuracy_test = accuracy_score(y_test, y_pred_test)
calibration_test = calc_calibration(sensitive_test, y_pred_test, y_test)

print(f"Validation Accuracy: {accuracy_val:.4f}")
print(f"Validation Calibration: {calibration_val:.4f}")
print(f"Test Accuracy: {accuracy_test:.4f}")
print(f"Test Calibration: {calibration_test:.4f}")

  return np.sqrt(np.sum(alpha * (x - v)**2, axis=-1))
  Loss_Y = -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))  # Prediction loss
  P[i] = np.exp(-distances) / np.sum(np.exp(-distances))


Validation Accuracy: 0.5518
Validation Calibration: 0.0673
Test Accuracy: 0.5170
Test Calibration: 0.1752


The baseline model outperforms the Learning Fair Representations (LFR) model in accuracy across training, validation, and test datasets. LFR's decrease in accuracy suggests a trade-off for improved fairness, which is typical for fairness-focused models. However, the LFR model does not show better calibration compared to the baseline, indicating it may not have improved fairness as intended. Higher calibration values in the LFR model, particularly on the test data, suggest that it might be less fair than the baseline, despite the fairness adjustments. To address these issues, further model tuning and investigation into the balance and representation of the sensitive attribute within the data are necessary.