## Import

In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score


In [None]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

### In this notebook we create a confusion matrix for each feature, for each dataset, and for each model
- Steps:
    - Split the datasets
    - Create a confusion matrix per feature

## Load data

In [None]:
# Raw string on purpose: in a normal string literal "\f" is a form feed and
# "\202" an octal escape, which would silently corrupt this Windows-style path.
df = pd.read_excel(r"\data\final_bias_data\20240203_bias_lhr_processed.xlsx")

## EDA + global variables

In [None]:
# for column in df.columns:
#     print(column)
#     print(df[column].nunique())

In [None]:
# Display the loaded frame as a quick visual check of the raw data
df

In [None]:
# Columns that are inspected for bias, grouped by theme.
# The parenthood columns still carry their original snake_case names here.
bias_features = [
    # Gender
    'geslacht',
    # Parenthood
    'is_parttime_parent',
    'is_fulltime_parent',
    # Age
    'Leeftijd<30',
    'Leeftijd<40',
    'Leeftijd<50',
    # Nationality
    'IsNederlands',
    'IsWesters',
]

In [None]:
# Print each bias column name followed by the distinct values it contains
for feature in bias_features:
    print(feature, df[feature].unique(), sep='\n')

In [None]:
# privileged_unprivileged = {'geslacht' : {'privileged' : 'M',
#                                          'unprivileged' : 'V'}, # Gender
#                  'is_parttime_parent': {'privileged' : True,
#                                          'unprivileged' : False}, # Parenthood
#                  'is_fulltime_parent': {'privileged' : True,
#                                          'unprivileged' : False}, 
#                  'Leeftijd<30': {'privileged' : 1,
#                                          'unprivileged' : 0}, # Age
#                  'Leeftijd<40': {'privileged' : 1,
#                                          'unprivileged' : 0}, 
#                  'Leeftijd<50': {'privileged' : 1,
#                                          'unprivileged' : 0}, 
#                  'IsNederlands': {'privileged' : 1,
#                                          'unprivileged' : 0}, # Nationality
#                  'IsWesters': {'privileged' : 1,
#                                          'unprivileged' : 0}
#                  }

In [None]:
# Per bias feature: the two group values that each get their own confusion matrix.
# Gender is coded 'M'/'V'; every other feature is a 1/0 flag.
bias_features_values = {
    'geslacht': ['M', 'V'],  # Gender
    **{
        binary_feature: [1, 0]
        for binary_feature in (
            'IsFulltimeParent', 'IsParttimeParent',  # Parenthood
            'Leeftijd<30', 'Leeftijd<40', 'Leeftijd<50',  # Age
            'IsNederlands', 'IsWesters',  # Nationality
        )
    },
}

In [None]:
# Give the prediction columns short names:
# BR = model before reweighing, AR = model after reweighing
df = df.rename(
    {
        'model_before_reweighing_prediction': 'BR',
        'model_after_reweighing_prediction': 'AR',
    },
    axis='columns',
)

In [None]:
# Underscores cause issues when the resulting df is built later on (the
# confusion-matrix keys are split on "_"), so dataset names and column names
# that contain them are replaced by underscore-free variants.
df.loc[df['dataset'] == 'Training_test', 'dataset'] = 'TrainingTest'
df.loc[df['dataset'] == 'Training_train', 'dataset'] = 'TrainingTrain'

df = df.rename(
    {
        'is_fulltime_parent': 'IsFulltimeParent',
        'is_parttime_parent': 'IsParttimeParent',
    },
    axis='columns',
)

In [None]:
# The other binary columns are coded 1/0 rather than as booleans, so the two
# parenthood columns are brought in line: True -> 1, False -> 0.
for parent_column in ('IsFulltimeParent', 'IsParttimeParent'):
    for boolean_value, numeric_value in ((True, 1), (False, 0)):
        df.loc[df[parent_column] == boolean_value, parent_column] = numeric_value

In [None]:
# Prediction columns to evaluate; one set of confusion matrices is built per entry
model_pred_columns = ['BR', 'AR']

In [None]:
# The label column is not coded uniformly: map the Dutch text labels onto 1/0.
# Values that are not a key of the mapping are left as they are by `replace`.
dict_map_label = {'Onderzoekswaardig': 1,
                    'Niet onderzoekswaardig': 0}

df['Label'] = df['Label'].replace(dict_map_label)
# Show the resulting label distribution as a sanity check
df['Label'].value_counts()

## Split the datasets

In [None]:
# # Lets do some psuedo code for our structure
# def calculate_CM(
#     data: pd.DataFrame = df,
#     feature: str,
#     feature_value
#     ) -> list:
#     """
#     Creates the confusion matrix for the feature, feature_value pair

#         Parameters
#         ----------
#         data
#             Data to be analyzed.
#         feature
#             The feature we want to calculate the CMs for.
#         feature_values
#             The feature values we want to calculate the CMs for.
#             We don't want to do this for all values of the features since there are 'Onbekend' and additional values.
#             We want binary grouping.

#         Returns:
#         Confusion Matrix as an (2, 2) array with order tn, fp, fn, tp
#     """
   
#     df_temp = df.loc[df[feature] == value]
#     CM_feature_value = confusion_matrix()
        
        
#     return CM_dict
       

In [None]:
# Show the dataset names present in the data; confusion matrices are built per dataset
df['dataset'].unique()

In [None]:
# Build one confusion matrix per combination of dataset, model, feature and
# feature value. Quick-and-dirty nested loops; a candidate for a rewrite.
#
# Keys have the form "<dataset>_<model>_<feature>_<feature value>" and are split
# on "_" again when the long-format table is built, so none of the four parts
# may contain an underscore itself.
CM_dict = {}

for dataset in df['dataset'].unique():
    print(dataset)
    df_temp = df.loc[df['dataset'] == dataset]
    for model in model_pred_columns:
        for feature, feature_values in bias_features_values.items():
            for feature_value in feature_values:
                df_CM = df_temp.loc[df_temp[feature] == feature_value]
                # labels=[0, 1] pins the result to a (2, 2) matrix in tn, fp, fn, tp
                # order. Without it a subgroup in which only one class occurs yields
                # a (1, 1) matrix, and an empty subgroup raises instead of giving zeros.
                CM_feature_value_model = confusion_matrix(
                    df_CM['Label'], df_CM[model], labels=[0, 1]
                )
                CM_dict[f"{dataset}_{model}_{feature}_{feature_value}"] = CM_feature_value_model

In [None]:
# Reshape the confusion matrices into long format: one row per
# (dataset, model, feature, feature value, metric).

# Counts below this threshold are not communicated and are replaced by 0
MIN_REPORTABLE_COUNT = 10
# Cell order of a flattened binary confusion matrix as returned by sklearn;
# it looks unintuitive but matches the row-major layout [[tn, fp], [fn, tp]]
CM_CELL_NAMES = ["TN", "FP", "FN", "TP"]

long_data_dict = {
    'Dataset': [],
    'Model': [],
    'Feature': [],
    'Feature_Value': [],
    'Metric': [],
    'Value': []
}

for key, value in CM_dict.items():
    # The key holds dataset, model, feature and feature value separated by "_".
    # Fail loudly when a part contains an underscore itself, because the
    # positional split would otherwise assign the wrong pieces silently.
    parts = key.split('_')
    if len(parts) != 4:
        raise ValueError(
            f"Cannot split key {key!r} into dataset, model, feature and feature value"
        )
    dataset, model, feature, feature_value = parts

    # The matrix is (2, 2); flatten it row by row into a plain list
    metric_values = [val for sublist in value for val in sublist]
    if len(metric_values) != len(CM_CELL_NAMES):
        # zip() below would otherwise truncate and attach the wrong metric names
        raise ValueError(
            f"Confusion matrix for {key!r} has {len(metric_values)} cells, "
            f"expected {len(CM_CELL_NAMES)}"
        )

    # Numbers below the threshold are not communicated, so blank them out
    metric_values = [0 if val < MIN_REPORTABLE_COUNT else val for val in metric_values]
    # If exactly one cell is now below the threshold, it could be derived from
    # the group size and the other three cells, so the two lowest values are
    # removed instead of one
    if sum(val < MIN_REPORTABLE_COUNT for val in metric_values) == 1:
        min_values = sorted(metric_values)[:2]
        metric_values = [0 if val in min_values else val for val in metric_values]

    # Add one row per metric, repeating the identifying information
    for metric, metric_value in zip(CM_CELL_NAMES, metric_values):
        long_data_dict['Dataset'].append(dataset)
        long_data_dict['Model'].append(model)
        long_data_dict['Feature'].append(feature)
        long_data_dict['Feature_Value'].append(feature_value)
        long_data_dict['Metric'].append(metric)
        long_data_dict['Value'].append(metric_value)

# Convert the long data dictionary to a DataFrame
df_cms = pd.DataFrame(long_data_dict)

display(df_cms)


In [None]:
# Keep only the rows that belong to the 'Pilot' dataset
df_pilot = df.loc[df['dataset'] == 'Pilot']

In [None]:
# Rows of the long-format table whose value is below 10 (after the suppression
# step above these are the cells that were set to 0)
df_cms_under10 = df_cms.loc[df_cms['Value'] < 10]

In [None]:
# Spot check: the unsuppressed matrix for dataset 'Prepilot', model 'AR', IsNederlands == 1
CM_dict['Prepilot_AR_IsNederlands_1']

In [None]:
# Raw string on purpose: in a normal string literal "\b" is a backspace and
# "\202" an octal escape, which would silently corrupt this Windows-style path.
df_cms.to_excel(r"\bias_analysis\20240228_CMs_LHR_SlimmeCheck.xlsx", index=False)

## Create count for most important features

In [None]:
# Display floats without decimals and with a thousands separator
pd.set_option('display.float_format', '{:,.0f}'.format)

In [None]:
# Names of the three "Belangrijkste feature" (most important feature) columns
belangrijkste_feature_columnnames = ['Belangrijkste feature 1', 'Belangrijkste feature 2', 'Belangrijkste feature 3']

In [None]:
# For the counts below, gender is recoded to the same 1/0 scheme as the other
# bias columns: 'M' -> 1, 'V' -> 0.
# NOTE(review): this reuses the name `dict_map_label`, which held the label
# mapping earlier in the notebook.
dict_map_label = {'M': 1,
                    'V': 0}

df['geslacht'] = df['geslacht'].replace(dict_map_label)
# Show the resulting distribution as a sanity check
df['geslacht'].value_counts()

In [None]:
import numpy as np

In [None]:
# Make every bias column numeric: anything that is not a 1/0 flag becomes NaN,
# after which the column is cast to float. The distinct values are printed
# before and after the clean-up for inspection.
for column in bias_features_values:
    print(df[column].unique())
    not_binary = ~df[column].isin([1, 0])
    df.loc[not_binary, column] = np.nan
    print(df[column].unique())
    df[column] = df[column].astype(float)

In [None]:
# Split per dataset again, this time for the important-feature counts
is_pilot = df['dataset'] == 'Pilot'
is_prepilot = df['dataset'] == 'Prepilot'
# The most important features are only known for part of the prepilot dataset,
# so only the rows where they are known are kept
has_known_features = df['Belangrijkste feature 1'] != 'Onbekend'

df_fimp_pilot = df.loc[is_pilot]
df_fimp_prepilot = df.loc[is_prepilot & has_known_features]

In [None]:
# We want a function to calculate the count of each of the "most important feature" communicated to HH.
# Here we disregard the order (because the group sizes would become too small for sharing with LHR)
def get_imp_feature_counts(df: pd.DataFrame,
                           belangrijkste_features: list = None
                           ) -> pd.DataFrame:
    """
    Count how often each important feature occurs within every bias group.

    For every column in the module-level `bias_features_values` and for the
    values 0 and 1 of that column, the rows of `df` in that group are selected
    and the occurrences of each feature name across the columns listed in the
    module-level `belangrijkste_feature_columnnames` are counted, regardless of
    which of those columns the name appears in.

        Parameters
        ----------
        df
            Data to be analyzed. Must contain the bias columns and the
            important-feature columns.
        belangrijkste_features
            Optional extra feature names that must appear in the result even
            when they do not occur in `df`. The list is not modified.

        Returns
        -------
        DataFrame with columns 'Feature', 'Value', 'Important Feature' and
        'Count': one row per (bias column, 0/1 value, important feature).
        Counts below 10 are replaced by 0 because such small numbers are not
        shared.
    """
    min_count = 10
    result_columns = ['Feature', 'Value', 'Important Feature', 'Count']

    # Collect the feature names from the frame that was passed in (not from a
    # notebook global), starting from a copy of the caller's list so neither
    # that list nor a shared default is mutated. Missing values are not names.
    feature_names = set(belangrijkste_features or [])
    for column in belangrijkste_feature_columnnames:
        feature_names.update(df[column].dropna().unique())
    # Sorted so the row order is reproducible between runs
    feature_names = sorted(feature_names, key=str)

    group_frames = []
    for column in bias_features_values.keys():
        for value in [0, 1]:
            df_temp = df.loc[df[column] == value]
            # Pool the important-feature columns into one series; value_counts
            # skips missing values
            mentions = pd.Series(
                df_temp[belangrijkste_feature_columnnames].to_numpy().ravel(),
                dtype=object,
            )
            # reindex gives every feature a fresh count for this group, 0 when
            # it does not occur, so nothing carries over from a previous group
            counts = mentions.value_counts().reindex(feature_names, fill_value=0)
            group_frames.append(pd.DataFrame({
                'Feature': column,
                'Value': value,
                'Important Feature': feature_names,
                'Count': counts.to_numpy(),
            }, columns=result_columns))

    if not group_frames:
        return pd.DataFrame(columns=result_columns)

    # Single concat instead of growing the frame inside the loop
    df_imp_feature_counts = pd.concat(group_frames, ignore_index=True)
    df_imp_feature_counts.loc[df_imp_feature_counts['Count'] < min_count, 'Count'] = 0

    return df_imp_feature_counts

In [None]:
# Important-feature counts per bias group, computed separately for the pilot
# and the prepilot selection
df_fimp_pilot_counts = get_imp_feature_counts(df_fimp_pilot)
df_fimp_prepilot_counts = get_imp_feature_counts(df_fimp_prepilot)

In [None]:
# Tag each result with the dataset it came from, then stack both into one table
df_fimp_pilot_counts['dataset'] = 'Pilot'
df_fimp_prepilot_counts['dataset'] = 'Prepilot'

df_fimp_counts = pd.concat([df_fimp_pilot_counts, df_fimp_prepilot_counts])

In [None]:
# Raw string on purpose: in a normal string literal "\b" is a backspace and
# "\202" an octal escape, which would silently corrupt this Windows-style path.
df_fimp_counts.to_excel(r"\bias_analysis\20240308_Important_Features_Counts.xlsx", index=False)