In [25]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from xgboost import XGBClassifier
import os
import numpy as np
import random
import matplotlib.pyplot as plt
from scipy.stats import uniform
from fairlearn.metrics import *
from sklearn.metrics import *
import scipy.stats.distributions as dists
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from fairmlhealth import report, measure

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [26]:
# Fix the seed of every random number generator the notebook relies on.
seed = 1234
# Exported for processes started later; the hash seed of the interpreter that
# is already running was fixed when it started.
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
random.seed(seed)

In [27]:
# Column groups
LABEL_COLUMN = "income"
IGNORE_COLUMNS = ['is_train']
CATEGORICAL_COLUMNS = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]
NUMERICAL_COLUMNS = [
    'capital-loss', 'capital-gain', 'age',
    'fnlwgt', 'education-num', 'hours-per-week',
]
BIAS_COLUMNS = [
    'age',
    'race_Amer-Indian-Eskimo', 'race_Asian-Pac-Islander', 'race_Black',
    'race_Other', 'race_White',
    'sex_Female', 'sex_Male',
]

# Load the data, keep the raw categorical columns in a frame of their own and
# remove them from the working frame.
path = 'data/census/adult-preprocessed.csv'
file_dataframe = pd.read_csv(path, delimiter=',')
categorical_columns_df = file_dataframe[CATEGORICAL_COLUMNS]
file_dataframe = file_dataframe.drop(columns=CATEGORICAL_COLUMNS)

SyntaxError: invalid syntax (Temp/ipykernel_4564/2106370369.py, line 8)

In [4]:
def min_max_scaling(dataframe):
    """Min-max scale every column listed in NUMERICAL_COLUMNS.

    Each column is fitted and transformed on its own with a default
    ``MinMaxScaler``. The columns are overwritten in ``dataframe`` itself,
    so the caller's frame is modified.

    Args:
        dataframe: Frame containing all NUMERICAL_COLUMNS.

    Returns:
        The same ``dataframe`` object with the scaled columns.
    """
    for column in NUMERICAL_COLUMNS:
        column_values = dataframe[[column]]  # 2-D selection, as the scaler expects
        dataframe[column] = MinMaxScaler().fit_transform(column_values)
    return dataframe

# def standard_scaling(dataframe):
#     scaler = StandardScaler()
#     for i in NUMERICAL_COLUMNS:
#         dataframe[i] = scaler.fit_transform(dataframe[[i]])
#     return dataframe

In [5]:
def predict_all_classifiers(output_dir, X, y, X_test, y_test):
    """Tune three classifiers, evaluate each on the test set and return the best predictions.

    For an SVC, a RandomForestClassifier and an XGBClassifier, a
    RandomizedSearchCV is fitted on ``(X, y)`` with ROC AUC as the selection
    score. The best estimator of each search predicts ``X_test``; the
    predictions are saved to ``{output_dir}/{ClassName}.npy``, the performance
    report is written by ``performance_metrics`` and the fairness measures are
    computed by ``fairmlhealth_metrics``.

    Args:
        output_dir: Directory receiving the prediction and report files.
            It is created when it does not exist yet.
        X: Training features.
        y: Training labels (binary).
        X_test: Test features.
        y_test: Test labels (binary).

    Returns:
        Tuple ``(best_y_pred, best_auc)``: the test predictions of the
        classifier with the highest test ROC AUC, and that ROC AUC.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Negative/positive ratio of the *training* labels only, so the class
    # weighting does not depend on the test set or on a global frame.
    train_labels = np.asarray(y)
    negative_positive_ratio = float(np.sum(train_labels == 0) / np.sum(train_labels == 1))

    # Define the classifiers. The C given to SVC is only a placeholder: the
    # search below samples its own values for C.
    classifiers = [
        SVC(max_iter=1000, C=1e9, class_weight="balanced"),
        RandomForestClassifier(class_weight="balanced", random_state=seed),
        XGBClassifier(objective='binary:logistic', scale_pos_weight=negative_positive_ratio,
                      use_label_encoder=False, seed=seed, eval_metric='auc'),
    ]

    # Hyperparameter search spaces, one per classifier and in the same order
    # as `classifiers`. 'auto' was dropped from max_features: for a random
    # forest classifier it is an alias of 'sqrt' and newer scikit-learn
    # releases reject it.
    hyperparameters = [
        dict(C=uniform(0.1, 100), gamma=['scale', 'auto'], kernel=['linear', 'rbf']),
        dict(bootstrap=[True, False], max_features=['sqrt'], max_depth=dists.randint(5, 50),
             n_estimators=dists.randint(50, 200), min_samples_leaf=[1, 2, 4], min_samples_split=[2, 5, 10]),
        dict(eta=[0.001, 0.1, 0.3, 0.5], min_child_weight=[1, 3, 5], max_depth=dists.randint(5, 50),
             n_estimators=dists.randint(50, 200), subsample=[0.5, 0.8, 1.0]),
    ]

    best_auc, best_y_pred = 0, None

    # Iterate all classifiers together with their search space
    for classifier_id, (classifier, search_space) in enumerate(zip(classifiers, hyperparameters)):
        # Random search with RandomizedSearchCV's default cross-validation;
        # seeded so that re-running the function samples the same candidates.
        search = RandomizedSearchCV(classifier, search_space, n_iter=30, scoring='roc_auc',
                                    n_jobs=-1, verbose=0, random_state=seed)
        search.fit(X, y)
        best_model = search.best_estimator_

        # Predict the labels given the test set's features
        y_pred = best_model.predict(X_test)

        with open(f'{output_dir}/{best_model.__class__.__name__}.npy', 'wb') as f:
            np.save(f, y_pred)

        # Evaluate the performance of the model based on the test set
        performance_metrics(output_dir, classifier_id, y_test, y_pred)

        fairmlhealth_metrics(best_model, X_test, y_test, y_pred)

        # Score this model explicitly. The bare name `auc` is the
        # sklearn.metrics function, not a score, so it must not be compared.
        test_auc = roc_auc_score(y_test, y_pred)
        if best_y_pred is None or test_auc > best_auc:
            best_y_pred = y_pred
            best_auc = test_auc
    return best_y_pred, best_auc

def performance_metrics(output_dir, classifier_id, y_test, y_predictions):
    """Write the test-set performance report of one classifier to a text file.

    The report is written to ``{output_dir}/{name}_results.txt`` and lists, in
    this order: accuracy, weighted precision, weighted recall, weighted F1,
    the classification report, the confusion matrix, the false and true
    positive rates returned by ``roc_curve``, the area under that curve and
    the ROC AUC score.

    Args:
        output_dir: Directory in which the report file is created.
        classifier_id: Index selecting the file name prefix
            (0: SVC, 1: RandomForestClassifier, 2: XGBClassifier).
        y_test: True labels of the test set.
        y_predictions: Predicted labels for the test set.
    """
    classifier_names = ['SVC', 'RandomForestClassifier', 'XGBClassifier']
    report_path = f'{output_dir}/{classifier_names[classifier_id]}_results.txt'

    # (section header, callable producing the section body). The callables are
    # only evaluated while writing, right after their header has been written.
    score_sections = [
        ("--- ACCURACY SCORE ---", lambda: accuracy_score(y_test, y_predictions)),
        ("--- PRECISION SCORE ---", lambda: precision_score(y_test, y_predictions, average='weighted')),
        ("--- RECALL SCORE ---", lambda: recall_score(y_test, y_predictions, average='weighted')),
        ("--- F1 SCORE ---", lambda: f1_score(y_test, y_predictions, average='weighted')),
        ("--- CLASSIFICATION REPORT ---", lambda: classification_report(y_test, y_predictions)),
        ("--- CONFUSION MATRIX ---", lambda: confusion_matrix(y_test, y_predictions)),
    ]

    with open(report_path, 'w') as f:
        def write_section(header, compute_body):
            print(header, file=f)
            print(compute_body(), file=f)

        print("----- PERFORMANCE METRICS -----", file=f)
        for header, compute_body in score_sections:
            write_section(header, compute_body)

        # The ROC curve is computed once and shared by the next three sections.
        false_pos_rate, true_pos_rate, _ = roc_curve(y_test, y_predictions, pos_label=1)
        curve_sections = [
            ("--- FALSE POSITIVE RATE ---", lambda: false_pos_rate),
            ("--- TRUE POSITIVE RATE ---", lambda: true_pos_rate),
            ("--- AREA UNDER CURVE ---", lambda: auc(false_pos_rate, true_pos_rate)),
            ("--- ROC AUC ---", lambda: roc_auc_score(y_test, y_predictions)),
        ]
        for header, compute_body in curve_sections:
            write_section(header, compute_body)
        
def fairmlhealth_metrics(model, X_test, y_test, y_pred):
    """Compute the fairMLHealth reports for one model's test predictions.

    Previously every report was computed and thrown away, so calling the
    function had no visible effect. The results are now returned so that the
    caller can display or store them.

    Args:
        model: Fitted model, passed to ``report.compare`` as ``models``.
        X_test: Test features; must contain every column of BIAS_COLUMNS.
        y_test: True labels of the test set.
        y_pred: Predicted labels for the test set.

    Returns:
        dict with the keys
        ``'comparisons'`` (maps each BIAS_COLUMNS entry to the result of
        ``report.compare`` for that column as protected attribute),
        ``'performance'`` (result of ``measure.performance``) and
        ``'bias'`` (result of ``measure.bias``).
        NOTE(review): the result types are whatever fairmlhealth returns;
        presumably displayable tables, confirm against the installed version.
    """
    comparisons = {
        bias_feature: report.compare(test_data=X_test, targets=y_test, predictions=y_pred, protected_attr=X_test[bias_feature], models=model)
        for bias_feature in BIAS_COLUMNS
    }
    performance_result = measure.performance(X=X_test, y_true=y_test, y_pred=y_pred, features=BIAS_COLUMNS)
    bias_result = measure.bias(X_test[BIAS_COLUMNS], y_test, y_pred)
    return {
        'comparisons': comparisons,
        'performance': performance_result,
        'bias': bias_result,
    }

In [6]:
def _plot_stacked_crosstab(data, colname, label, normalize, ylabel, title_suffix):
    """Draw a stacked bar chart of the `label` values for each value of `colname`."""
    table = pd.crosstab(index=data[colname], columns=data[label], normalize=normalize)
    # Work on the Axes returned by pandas rather than on pyplot's implicit
    # "current axes", so the labels always end up on the chart just drawn.
    ax = table.plot(kind='bar', stacked=True, figsize=(16, 5))
    ax.set_xlabel(colname)
    ax.set_ylabel(ylabel)
    ax.grid(axis='y', linestyle='-')
    ax.set_title(colname + " vs " + label + " " + title_suffix)


def plotOccurence(data, colname, label):
    """Plot, for every value of `colname`, how often each `label` value occurs.

    Args:
        data: DataFrame containing both columns.
        colname: Column whose values form the bars.
        label: Column whose values are stacked within each bar.
    """
    _plot_stacked_crosstab(data, colname, label, normalize=False, ylabel='Count', title_suffix='count')


def plotProportion(data, colname, label):
    """Plot, for every value of `colname`, the share of each `label` value.

    The counts are normalised per `colname` value (``normalize='index'``), so
    every bar sums to 1.

    Args:
        data: DataFrame containing both columns.
        colname: Column whose values form the bars.
        label: Column whose values are stacked within each bar.
    """
    _plot_stacked_crosstab(data, colname, label, normalize='index', ylabel='Proportion', title_suffix='proportion')

def draw_fairlearn_figure(y_test, y_pred, data):
    """Plot per-group fairlearn metrics and print the between-group disparities.

    The eight per-group metrics are evaluated with a ``MetricFrame`` and drawn
    as a 4x2 grid of bar charts, one bar per group of ``data``.

    The demographic-parity and equalized-odds metrics are handled separately.
    They compare groups with each other and require the ``sensitive_features``
    keyword, so they cannot be used as ``MetricFrame`` metrics. Leaving them
    in would also ask for 12 subplots in a grid that holds 8. Their values are
    printed instead.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        data: Sensitive feature values, one per sample, defining the groups.
    """
    per_group_metrics = {
        'accuracy': accuracy_score,
        'f1': f1_score,
        'precision': precision_score,
        'recall': recall_score,
        'false positive rate': false_positive_rate,
        'true positive rate': true_positive_rate,
        'selection rate': selection_rate,
        'count': count}
    metric_frame = MetricFrame(metrics=per_group_metrics,
                            y_true=y_test,
                            y_pred=y_pred,
                            sensitive_features=data)
    metric_frame.by_group.plot.bar(
        subplots=True,
        layout=[4, 2],  # exactly one cell per per-group metric
        legend=False,
        figsize=[12, 8],
        title="Show all metrics",
    )

    # Each disparity metric yields a single number for the whole feature.
    disparity_metrics = {
        'demographic parity difference': demographic_parity_difference,
        'demographic parity ratio': demographic_parity_ratio,
        'equalized odds difference': equalized_odds_difference,
        'equalized odds ratio': equalized_odds_ratio}
    feature_name = getattr(data, 'name', None)
    print(f"--- DISPARITY METRICS ({feature_name}) ---")
    for metric_name, metric in disparity_metrics.items():
        print(f"{metric_name}: {metric(y_test, y_pred, sensitive_features=data)}")

In [4]:
# Split the data into train and test
train = file_dataframe[file_dataframe.is_train == True].drop(columns=IGNORE_COLUMNS)
test = file_dataframe[file_dataframe.is_train == False].drop(columns=IGNORE_COLUMNS)

# Scale the numerical columns. The scaler is fitted on the training set only
# and then applied to the test set, so both sets share one scale and no test
# statistics leak into preprocessing.
scaler = MinMaxScaler()
train[NUMERICAL_COLUMNS] = scaler.fit_transform(train[NUMERICAL_COLUMNS])
test[NUMERICAL_COLUMNS] = scaler.transform(test[NUMERICAL_COLUMNS])

# Prepare train set's features and labels
X = train.drop(columns=[LABEL_COLUMN])
y = train[LABEL_COLUMN].astype(bool)

# Prepare test set's features and labels
X_test = test.drop(columns=[LABEL_COLUMN])
y_test = test[LABEL_COLUMN].astype(bool)

OUTPUT_DIR = 'outputs/census'
os.makedirs(OUTPUT_DIR, exist_ok=True)
best_y_pred, best_auc = predict_all_classifiers(OUTPUT_DIR, X, y, X_test, y_test)

y_pred = best_y_pred
PRED_COLUMN = 'predicted '+ LABEL_COLUMN
INCOME_NAMES = {False: '<=50K', True: '>50K'}
preds = pd.DataFrame(y_pred, columns=[PRED_COLUMN])

# Re-attach the raw categorical columns under a new name. `file_dataframe`
# itself stays untouched so that re-running this cell does not feed string
# columns to the classifiers.
census_with_categoricals = pd.concat([file_dataframe, categorical_columns_df], axis=1)

# Test rows with readable labels and age binned by decade. The values are
# assigned explicitly; in-place edits on a slice are not reliably applied.
gold_data = (
    census_with_categoricals[census_with_categoricals.is_train == False]
    .reset_index(drop=True)
    .assign(**{
        LABEL_COLUMN: lambda frame: frame[LABEL_COLUMN].astype(bool).map(INCOME_NAMES),
        'age': lambda frame: frame['age'] // 10 * 10,
    })
)

# The predictions are joined to the test rows by position.
assert len(preds) == len(gold_data), "number of predictions differs from number of test rows"
pred_data = pd.concat([gold_data, preds], axis=1)
# astype(bool) makes the mapping work for boolean and for 0/1 predictions alike.
pred_data[PRED_COLUMN] = pred_data[PRED_COLUMN].astype(bool).map(INCOME_NAMES)

for col in BIAS_COLUMNS:
    plotOccurence(gold_data, col, LABEL_COLUMN)
    plotOccurence(pred_data, col, PRED_COLUMN)
    plotProportion(gold_data, col, LABEL_COLUMN)
    plotProportion(pred_data, col, PRED_COLUMN)
    draw_fairlearn_figure(y_test, y_pred, gold_data[col])
    # Render this column's figures now instead of keeping all of them open.
    plt.show()

