In [282]:
import pandas as pd
import numpy as np 
from scipy.stats import chi2_contingency
from scipy.stats import ks_2samp
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from statsmodels.stats.proportion import proportions_ztest


## Read the Data

In [283]:
# Reading the data for training and testing
train = pd.read_csv("../Output/compass_cleaned_train.csv")
test = pd.read_csv("../Output/compass_cleaned_test.csv")

In [284]:
# Incase I need to know who is who
train['training'] = 1
test['training'] = 0
data_cleaned = pd.concat([train, test])
data_cleaned.to_csv("../Output/compass_cleaned.csv", index=False)

In [285]:
# Creating a full dataset 
df = pd.concat([train, test])
df

Unnamed: 0,id,sex_cat_sen,age_sen,race_cat_sen,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree_cat,went_to_jail_cat,days_jail,predict_feat,age_sen_cat,training
0,10698,Male,24,African-American,0,0,3,0,F,True,0,0,18_to_25,1
1,4464,Male,37,African-American,0,0,0,3,M,False,2,0,25_to_45,1
2,5262,Female,48,Caucasian,0,0,0,2,F,False,33,0,45_to_65,1
3,6866,Female,23,African-American,0,1,0,1,M,False,0,0,18_to_25,1
4,7242,Female,22,African-American,0,0,0,0,M,False,2,0,18_to_25,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1438,6500,Female,24,Caucasian,0,0,0,0,F,False,1,0,18_to_25,0
1439,7538,Female,49,African-American,0,0,0,1,F,False,0,0,45_to_65,0
1440,8641,Male,77,Caucasian,0,0,0,1,M,False,4,0,65_and_over,0
1441,6383,Male,55,Caucasian,0,0,0,0,M,True,0,0,45_to_65,0


In [286]:
# Collecting any sensitive variable 
sensitive = []
categorical = []
continuous = []

for column in df.columns:
    if "sen" in column.lower():
        sensitive.append(column)
        
    if 'cat' in column.lower():
        categorical.append(column)
        
    if ("cat" not in column.lower()) and ("predict_feat" not in column.lower()):
        continuous.append(column)
        
print("Sensitive:", sensitive)
print("Categorical:", categorical)
print("Continuous:", continuous)


Sensitive: ['sex_cat_sen', 'age_sen', 'race_cat_sen', 'age_sen_cat']
Categorical: ['sex_cat_sen', 'race_cat_sen', 'c_charge_degree_cat', 'went_to_jail_cat', 'age_sen_cat']
Continuous: ['id', 'age_sen', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count', 'days_jail', 'training']


## Distribution Test Functions

In [287]:
# Function for testing distribution across categorical values 
def test_chi_squared(train, test, column):
    contingency_table = pd.crosstab(train[column], test[column])

    # Performing the Chi-Squared test
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    

    print(f'Chi-squared statistic for {column}:', chi2)
    print('p-value:', p)

    # Interpret the p-value
    alpha = 0.05  # Significance level
    if p < alpha:
        print(f"The distributions of the variable {column} are significantly different.\n")
    else:
        print(f"There is no significant difference in the distributions of the variable {column}.\n")

In [288]:
# Function for testing distribution of a continuous variable across two datasets
def test_kolmogorov(train, test, column):
    # Perform the Kolmogorov-Smirnov test
    statistic, p_value = ks_2samp(train[column].dropna(), test[column].dropna())

    print(f'Kolmogorov-Smirnov statistic for {column}:', statistic)
    print(f'p-value for {column}:', p_value)

    # Interpret the p-value
    alpha = 0.05  # Significance level
    if p_value < alpha:
        print(f"The distributions of the variable {column} are significantly different.\n")
    else:
        print(f"There is no significant difference in the distributions of the variable {column}.\n")


In [289]:
def test_proportion_diffs(train, test, category):
    categories = train[category].unique()
    print(f"Testing differences in proportions for: {category}\n")
    
    for cat in categories:
        count_train = sum(train[category] == cat)
        count_test = sum(test[category] == cat)
        nobs_train = train.shape[0]
        nobs_test = test.shape[0]
        
        # Proportion of the category in each dataset
        prop_train = count_train / nobs_train
        prop_test = count_test / nobs_test
        
        # Perform Z-test for the difference between two proportions
        stat, pval = proportions_ztest([count_train, count_test], [nobs_train, nobs_test])
        
        print(f"Category: {cat}")
        print(f"Proportion in train: {prop_train:.4f}, Proportion in test: {prop_test:.4f}")
        print(f"Z-stat: {stat:.4f}, p-value: {pval:.4f}\n")
        
        # Interpret the p-value
        alpha = 0.05  # Significance level
        if pval < alpha:
            print(f"There is a significant difference in the proportion of {cat} between train and test.\n")
        else:
            #print(f"No significant difference in the proportion of {cat} between train and test.\n")
            pass


## Testing Splits across data

First level distributions across sensible features

In [290]:
# Test splits accross main variable
for column in continuous:
    if column in sensitive:
        #test_dis_cat(train, temp_test, feature_test)
        test_kolmogorov(train, test, column)

for column in categorical:
    if column in sensitive:
        #test_dis_cat(temp_train, temp_test, feature_test)
        test_kolmogorov(train, test, column)     

Kolmogorov-Smirnov statistic for age_sen: 0.018046597842127213
p-value for age_sen: 0.8387619415000188
There is no significant difference in the distributions of the variable age_sen.

Kolmogorov-Smirnov statistic for sex_cat_sen: 0.003498626787484871
p-value for sex_cat_sen: 1.0
There is no significant difference in the distributions of the variable sex_cat_sen.

Kolmogorov-Smirnov statistic for race_cat_sen: 0.026592685750544007
p-value for race_cat_sen: 0.3800297428438196
There is no significant difference in the distributions of the variable race_cat_sen.

Kolmogorov-Smirnov statistic for age_sen_cat: 0.014071600625057565
p-value for age_sen_cat: 0.9734312402112565
There is no significant difference in the distributions of the variable age_sen_cat.



In [291]:
train["race_cat_sen"].value_counts(normalize=True)

race_cat_sen
African-American    0.507018
Caucasian           0.344481
Hispanic            0.087160
Other               0.053717
Asian               0.004852
Native American     0.002772
Name: proportion, dtype: float64

In [292]:
test["race_cat_sen"].value_counts(normalize=True)

race_cat_sen
African-American    0.533611
Caucasian           0.322938
Hispanic            0.092862
Other               0.046431
Asian               0.002772
Native American     0.001386
Name: proportion, dtype: float64

## Testing Splits for Sub-Groups

In [293]:
for column in sensitive:
    if 'cat' in column:
        print(f"********* TESTING FEATURE: {column}*******************")
        for value in df[column].unique():
            print(f"----Sub Group: {value}")
            for feature_test in categorical:
                if (feature_test != column) and ('cat' in feature_test):
                    temp_train = train[train[column] == value]
                    temp_test = test[test[column] == value]
                    try:
                        #test_dis_cat(temp_train, temp_test, feature_test)
                        test_kolmogorov(temp_train, temp_test, feature_test)
                    except:
                        print(f"There is significant difference in the distributions of the variable {feature_test}.\n")

                    # if feature_test in continuous:
                    #     test_dis_cont(temp_train, temp_test, feature_test)
                        
                    # if feature_test in categorical:
                        
                    

********* TESTING FEATURE: sex_cat_sen*******************
----Sub Group: Male
Kolmogorov-Smirnov statistic for race_cat_sen: 0.025711255202151253
p-value for race_cat_sen: 0.5572106369487201
There is no significant difference in the distributions of the variable race_cat_sen.

Kolmogorov-Smirnov statistic for c_charge_degree_cat: 0.0291031461786094
p-value for c_charge_degree_cat: 0.3987563216968265
There is no significant difference in the distributions of the variable c_charge_degree_cat.

Kolmogorov-Smirnov statistic for went_to_jail_cat: 0.001690423034669227
p-value for went_to_jail_cat: 1.0
There is no significant difference in the distributions of the variable went_to_jail_cat.

Kolmogorov-Smirnov statistic for age_sen_cat: 0.01726834411807153
p-value for age_sen_cat: 0.9379754041627556
There is no significant difference in the distributions of the variable age_sen_cat.

----Sub Group: Female
Kolmogorov-Smirnov statistic for race_cat_sen: 0.03939935064935065
p-value for race_cat_

In [294]:
train[train["age_sen_cat"] == "18_to_25"]["sex_cat_sen"].value_counts(normalize=True)

sex_cat_sen
Male      0.810024
Female    0.189976
Name: proportion, dtype: float64

In [295]:
test[test["age_sen_cat"] == "18_to_25"]["sex_cat_sen"].value_counts(normalize=True)

sex_cat_sen
Male      0.818493
Female    0.181507
Name: proportion, dtype: float64

## Testing Splits Group by Outcome and Sensitive Features

In [296]:
for column in sensitive:
    if 'cat' in column:
        print(f"********* TESTING FEATURE: {column}*******************")
        for value in df["predict_feat"].unique():
            temp_train = train[train["predict_feat"] == value]
            temp_test = test[test["predict_feat"] == value]
            try:
                print(f"When prediction is {value}")
                #test_dis_cat(temp_train, temp_test, feature_test)
                #test_dis_cont(temp_train, temp_test, feature_test)
                test_kolmogorov(temp_train, temp_test, feature_test)
            except:
                print(f"There is significant difference in the distributions of the variable {column}.\n")

********* TESTING FEATURE: sex_cat_sen*******************
When prediction is 0
Kolmogorov-Smirnov statistic for age_sen_cat: 0.009856467939175193
p-value for age_sen_cat: 0.9999832090084211
There is no significant difference in the distributions of the variable age_sen_cat.

When prediction is 1
Kolmogorov-Smirnov statistic for age_sen_cat: 0.042882245101210976
p-value for age_sen_cat: 0.7841432589613637
There is no significant difference in the distributions of the variable age_sen_cat.

********* TESTING FEATURE: race_cat_sen*******************
When prediction is 0
Kolmogorov-Smirnov statistic for age_sen_cat: 0.009856467939175193
p-value for age_sen_cat: 0.9999832090084211
There is no significant difference in the distributions of the variable age_sen_cat.

When prediction is 1
Kolmogorov-Smirnov statistic for age_sen_cat: 0.042882245101210976
p-value for age_sen_cat: 0.7841432589613637
There is no significant difference in the distributions of the variable age_sen_cat.

********* T

In [297]:
train[train["predict_feat"] == 1]["race_cat_sen"].value_counts(normalize=True)

race_cat_sen
African-American    0.715686
Caucasian           0.203209
Hispanic            0.053476
Other               0.020499
Native American     0.004456
Asian               0.002674
Name: proportion, dtype: float64

In [298]:
test[test["predict_feat"] == 1]["race_cat_sen"].value_counts(normalize=True)

race_cat_sen
African-American    0.790036
Caucasian           0.170819
Hispanic            0.024911
Other               0.010676
Native American     0.003559
Name: proportion, dtype: float64

## Prediction Outcomes Across Groups

In [299]:
outcome = pd.read_csv("../Output/test_outcome.csv")

In [300]:
# Create a dictionary that keeps the positive outcomes for all the data and each sensitive feature split
Prediction = {}

temp_df = outcome["predict_feat"].value_counts(normalize=True).to_frame().reset_index()
Prediction["General_Pop"] = temp_df[temp_df["predict_feat"] == 1]

Prediction["General_Pop"]

Unnamed: 0,predict_feat,proportion
1,1,0.194733


In [301]:
# Create the positive outcomes for each split
for column in sensitive:
    if column in categorical:
        temp_df = outcome.groupby([column])["predict_feat"].value_counts(normalize=True).to_frame().reset_index()
        print(temp_df[temp_df["predict_feat"] == 1])
        Prediction[column] = temp_df[temp_df["predict_feat"] == 1]

  sex_cat_sen  predict_feat  proportion
1      Female             1    0.116364
3        Male             1    0.213185
        race_cat_sen  predict_feat  proportion
1   African-American             1    0.288312
4          Caucasian             1    0.103004
6           Hispanic             1    0.052239
8    Native American             1    0.500000
10             Other             1    0.044776
  age_sen_cat  predict_feat  proportion
1    18_to_25             1    0.294521
3    25_to_45             1    0.210094
5    45_to_65             1    0.056738


## Testing outcome variations across Groups

In [302]:
population_value = Prediction["General_Pop"]["proportion"].iloc[0]

for column in sensitive:
    if column in categorical:
        print(f"\n\n********* TESTING FEATURE: {column}*******************")
        for group in Prediction[column][column].unique():
            temp = Prediction[column]
            group_value = temp[temp[column] == group]["proportion"].iloc[0]
            print(f"For the group {group}: Percentage difference from General Population: {((group_value - population_value) * 100).round(2)}")




********* TESTING FEATURE: sex_cat_sen*******************
For the group Female: Percentage difference from General Population: -7.84
For the group Male: Percentage difference from General Population: 1.85


********* TESTING FEATURE: race_cat_sen*******************
For the group African-American: Percentage difference from General Population: 9.36
For the group Caucasian: Percentage difference from General Population: -9.17
For the group Hispanic: Percentage difference from General Population: -14.25
For the group Native American: Percentage difference from General Population: 30.53
For the group Other: Percentage difference from General Population: -15.0


********* TESTING FEATURE: age_sen_cat*******************
For the group 18_to_25: Percentage difference from General Population: 9.98
For the group 25_to_45: Percentage difference from General Population: 1.54
For the group 45_to_65: Percentage difference from General Population: -13.8


## Metric Evaluation of Predictions Across Groups

In [303]:

# Calculate confusion matrix
def calculate_confusion_matrix(true_values, predicted_values):
    tn, fp, fn, tp = confusion_matrix(true_values, predicted_values, labels=[0,1]).ravel()
    return tn, fp, fn, tp

# Calculate precision
def calculate_precision(true_values, predicted_values):
    return precision_score(true_values, predicted_values)

# Calculate recall
def calculate_recall(true_values, predicted_values):
    return recall_score(true_values, predicted_values)

# Calculate F1 score
def calculate_f1_score(true_values, predicted_values):
    return f1_score(true_values, predicted_values)

# Calculate ratios
def calculate_ratios(tn, fp, fn, tp):
    total_samples = tn + fp + fn + tp
    
    tn_ratio = tn / total_samples
    fp_ratio = fp / total_samples
    fn_ratio = fn / total_samples
    tp_ratio = tp / total_samples
    
    return tn_ratio, fp_ratio, fn_ratio, tp_ratio

# Pipeline function
def evaluation_pipeline(true_values, predicted_values):
    tn, fp, fn, tp = calculate_confusion_matrix(true_values, predicted_values)
    precision = calculate_precision(true_values, predicted_values)
    recall = calculate_recall(true_values, predicted_values)
    f1 = calculate_f1_score(true_values, predicted_values)
    tn_ratio, fp_ratio, fn_ratio, tp_ratio = calculate_ratios(tn, fp, fn, tp)
    
    return {
        # 'True Negatives': tn,
        # 'False Positives': fp,
        # 'False Negatives': fn,
        # 'True Positives': tp,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'True Negatives Ratio': tn_ratio,
        'False Positives Ratio': fp_ratio,
        'False Negatives Ratio': fn_ratio,
        'True Positives Ratio': tp_ratio
    }


In [304]:
Results = {}

predicted_values = outcome["prediction"]
true_values = outcome["predict_feat"]

valuation_results = evaluation_pipeline(true_values, predicted_values)

# Convert valuation_results to a DataFrame with an index
result_df = pd.DataFrame(valuation_results, index=[0]).transpose()
result_df.columns = ['Full_Data']
result_df = result_df
Results['Full_Data'] = result_df
result_df

Unnamed: 0,Full_Data
Precision,0.540107
Recall,0.359431
F1 Score,0.431624
True Negatives Ratio,0.745669
False Positives Ratio,0.059598
False Negatives Ratio,0.12474
True Positives Ratio,0.069993


Getting each one as a Data Frame

In [306]:
# getting the metrics across each split
for column in sensitive:
    if 'cat' in column:
        print(f"\n\n********** FEATURE {column}**************")
        # Initialize an empty DataFrame to store valuation results
        result_df = pd.DataFrame()

        for x in outcome[column].unique():
            predicted_values = outcome[outcome[column] == x]["prediction"]
            true_values = outcome[outcome[column] == x]["predict_feat"]
            valuation_results = evaluation_pipeline(true_values, predicted_values)
            
            # Add valuation results as a new column with the name as x
            result_df[x] = pd.Series(valuation_results)

        # Store the resulting DataFrame in the Results dictionary with column name as key
        Results[column] = result_df
        print(result_df)




********** FEATURE sex_cat_sen**************
                           Male    Female
Precision              0.549133  0.428571
Recall                 0.381526  0.187500
F1 Score               0.450237  0.260870
True Negatives Ratio   0.720034  0.854545
False Positives Ratio  0.066781  0.029091
False Negatives Ratio  0.131849  0.094545
True Positives Ratio   0.081336  0.021818


********** FEATURE race_cat_sen**************
                       Caucasian  African-American     Other  Hispanic  \
Precision               0.434783          0.580000  0.333333  0.200000   
Recall                  0.208333          0.391892  0.333333  0.285714   
F1 Score                0.281690          0.467742  0.333333  0.235294   
True Negatives Ratio    0.869099          0.629870  0.925373  0.888060   
False Positives Ratio   0.027897          0.081818  0.029851  0.059701   
False Negatives Ratio   0.081545          0.175325  0.029851  0.037313   
True Positives Ratio    0.021459          0.112987 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))


## Testing Metric Differences Across Groups 

In [307]:
population_value = Results['Full_Data'].iloc[-4:,0]

for column in sensitive:
    if column in categorical:
        print(f"\n\n********* TESTING FEATURE: {column}*******************")
        for group in Prediction[column][column].unique():
            group_value = Results[column].iloc[-4:][group]
            print(f"For the group {group}: Percentage difference from General Population:\n{((group_value - population_value) * 100).round(2)}\n")



********* TESTING FEATURE: sex_cat_sen*******************
For the group Female: Percentage difference from General Population:
True Negatives Ratio     10.89
False Positives Ratio    -3.05
False Negatives Ratio    -3.02
True Positives Ratio     -4.82
dtype: float64

For the group Male: Percentage difference from General Population:
True Negatives Ratio    -2.56
False Positives Ratio    0.72
False Negatives Ratio    0.71
True Positives Ratio     1.13
dtype: float64



********* TESTING FEATURE: race_cat_sen*******************
For the group African-American: Percentage difference from General Population:
True Negatives Ratio    -11.58
False Positives Ratio     2.22
False Negatives Ratio     5.06
True Positives Ratio      4.30
dtype: float64

For the group Caucasian: Percentage difference from General Population:
True Negatives Ratio     12.34
False Positives Ratio    -3.17
False Negatives Ratio    -4.32
True Positives Ratio     -4.85
dtype: float64

For the group Hispanic: Percentage d