In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import joblib
from collections import defaultdict
import json
import datetime as dt
from pathlib import Path
import os

# Show long string values in full, so that BSNs are not truncated
pd.options.display.max_colwidth = 1000

# Keep rendered tables to a manageable size
pd.options.display.max_rows = 50
pd.options.display.max_columns = 50

import logging
# Root logger: let INFO-level messages (and above) through
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [None]:
from wpi_uitkeringsfraude.project_paths import ARTIFACT_PATH, DATA_PATH, CONFIG_PATH, INFO_PATH
from wpi_uitkeringsfraude.model.manage_model_info import load_feature_list
# from wpi_uitkeringsfraude.model.build_model import filter_application_handling
from wpi_uitkeringsfraude.settings.settings import WPISettings
# from wpi_uitkeringsfraude.components import SocratesDienstPersoonJoin, SocratesAdresFeatures
from wpi_uitkeringsfraude.scorer import Scorer

In [None]:
import os
import sys

# Put the parent directory on the import path (once), so that modules living
# one level above this notebook can be imported.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
# Lift the column cap set earlier: wide frames are displayed with all columns
pd.options.display.max_columns = None

In [None]:
# Read the base pre-pilot data files.
# The paths are raw strings on purpose: in a normal string literal "\202..." is
# parsed as an octal escape (and "\d" / "\p" are invalid escapes), which silently
# corrupts the path before pandas ever sees it.
# header=1: the column names are taken from the second row of the sheet.
df_prepilot = pd.read_excel(r"\data\pre-pilot\2022.05.23_Merged_pre-pilot_templates.xlsx", header=1)
df_prepilot_bias = pd.read_csv(r"\data\pre-pilot\20220523_data_for_bias_analysis.csv")
# df_prepilot_data = pd.read_excel(r"\data\pre-pilot\pre_pilot_data.xlsx")

In [None]:
# Quick look at both raw frames: dimensions first, then the first rows
for df in (df_prepilot, df_prepilot_bias):
    print(df.shape)
    display(df.head())

### Preprocess

In [None]:
# df_prepilot_bias.loc[df_prepilot_bias['srp_id'].isin(srp_id_onderzoekswaardig), 'onderzoekswaardig'] = 1

# # We want all of the is_onderzoek_hh ones for the analysis
# df_prepilot_bias_hh = df_prepilot_bias.loc[~(df_prepilot_bias['srp_id'].isna())
#                     #  & (df_prepilot_bias['is_screening_hh'] == False) 
#                      ]

In [None]:
# Keep every row flagged as is_onderzoek_hh for the analysis.
# (An additional filter on is_screening_hh == False is currently disabled.)
hh_mask = df_prepilot_bias['is_onderzoek_hh'].eq(True)
df_prepilot_bias_hh = df_prepilot_bias.loc[hh_mask]

In [None]:
# Recode the textual label into a numeric target: 1 for 'Onderzoekswaardig',
# 0 for 'Niet onderzoekswaardig'. Values not in the mapping are left untouched.
dict_map_onderzoekswaardig = {
    'Onderzoekswaardig': 1,
    'Niet onderzoekswaardig': 0,
}

label_as_target = df_prepilot['Label'].replace(dict_map_onderzoekswaardig)
df_prepilot['onderzoekswaardig'] = label_as_target

### Merge datasets

In [None]:
# Left join: keep every pre-pilot row and attach the bias-analysis columns
# wherever 'Dienstnummer' equals 'application_dienstnr'.
df_prepilot_merged = df_prepilot.merge(
    df_prepilot_bias,
    how='left',
    left_on='Dienstnummer',
    right_on='application_dienstnr',
)

In [None]:
# Use the left-hand (pre-pilot template) version of the target as the unsuffixed
# column. The '_x' suffix is what the merge gives the left frame's column when
# both inputs contain 'onderzoekswaardig' -- this line assumes that is the case.
df_prepilot_merged['onderzoekswaardig'] = df_prepilot_merged['onderzoekswaardig_x']

In [None]:
# Append the is_onderzoek_hh rows below the merged rows. The original index
# labels are kept (no ignore_index), so index labels may repeat afterwards.
# NOTE(review): rows present in both inputs would appear twice -- confirm this is intended.
df_prepilot_merged = pd.concat([df_prepilot_merged, df_prepilot_bias_hh])

In [None]:
# Distribution of the target after appending the extra rows (NaN is not counted)
df_prepilot_merged['onderzoekswaardig'].value_counts()

In [None]:
# Make 'Label' carry the same values as the 'onderzoekswaardig' target column
df_prepilot_merged['Label'] = df_prepilot_merged['onderzoekswaardig']

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, roc_auc_score

from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

# df_training = df_training.sort_values(by='dtaanvraag')

# Everything except the target is handed to the models. The explicit copy makes X
# an independent frame, so adding the score columns below does not raise
# SettingWithCopyWarning.
X = df_prepilot_merged.loc[:, df_prepilot_merged.columns != 'onderzoekswaardig'].copy()
y = df_prepilot_merged['onderzoekswaardig']

seed = 42
# np.random.seed(seed)
# No train/test split here: both models are evaluated on the complete data set.
X_test, y_test = X, y
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=True, random_state=seed)

# Load models
# Raw strings are required: in a normal string literal '\202...' is an octal
# escape (and '\M' an invalid escape), which silently corrupts the file name.
model_file_prepilot = Path(r'\Models\20220523_model_used_in_prepilot.pkl')
model_file_pilot = Path(r'\Models\20240228_wpi_model_pilot_31.pkl')
models = {'model_before_reweighing': model_file_prepilot,
          'model_after_reweighing': model_file_pilot}
# Set threshold: a score at or above this value counts as a positive prediction
thr = 0.63

# Independent of the model being scored, so loaded once instead of per iteration
num_cols, cat_cols = load_feature_list()
label = "onderzoekswaardig"

# Collect the outputs first and attach them afterwards, so that every model is
# scored on the same input frame (without the previous model's output columns).
model_outputs = {}
for model_name, model_file in models.items():
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code.
    # Only load model files from a trusted location.
    model_dict = joblib.load(model_file)
    model = model_dict["model"]
    prep = model[:-1]  # all but the last pipeline step, hence all transformers, but not the model
    clf = model[-1]  # the actual model
    # Model scores: probability of the positive class, computed once per model
    scores = model.predict_proba(X_test)[:, 1]
    model_outputs[f"{model_name}_score"] = scores
    # Check if above threshold
    model_outputs[f"{model_name}_prediction"] = (scores >= thr).astype('int')

for column_name, values in model_outputs.items():
    X_test[column_name] = values

# X_train['onderzoekswaardig'] = y_train
X_test['onderzoekswaardig'] = y_test

# Confusion matrices: rows are the true labels, columns the predicted labels
conf_before_reweigh = confusion_matrix(X_test['onderzoekswaardig'], X_test['model_before_reweighing_prediction'])
conf_after_reweigh = confusion_matrix(X_test['onderzoekswaardig'], X_test['model_after_reweighing_prediction'])

print("Conf matrix before reweighing")
display(conf_before_reweigh)

print("Conf matrix after reweighing")
display(conf_after_reweigh)

print("Precision before reweighing")
print(precision_score(X_test['onderzoekswaardig'], X_test['model_before_reweighing_prediction']))

print("Precision after reweighing")
print(precision_score(X_test['onderzoekswaardig'], X_test['model_after_reweighing_prediction']))
In [None]:
# Continue with the scored frame: X_test holds the input columns plus the
# per-model score/prediction columns and the 'onderzoekswaardig' target.
df_prepilot_merged = X_test

## Complete / Merge data before preprocessing

In [None]:
# Columns carried over into the bias-analysis frame, grouped by role
prepilot_bias_cols = (
    # identifier and label
    ['application_dienstnr', 'Label']
    # application history / address information
    + [
        'received_same_product_last_year',
        'applied_for_same_product_last_year',
        'days_since_last_dienst_end',
        'days_since_last_relocation',
        'active_address_count',
    ]
    # application date and personal attributes
    + [
        'dtaanvraag',
        'geslacht',
        'geboortejaar',
        'is_parttime_parent',
        'is_fulltime_parent',
    ]
    # target and model outputs
    + [
        'onderzoekswaardig',
        'model_prob',
        'model_before_reweighing_score',
        'model_before_reweighing_prediction',
        'model_after_reweighing_score',
        'model_after_reweighing_prediction',
    ]
    # most important features reported per application
    + [
        'Belangrijkste feature 1',
        'Belangrijkste feature 2',
        'Belangrijkste feature 3',
    ]
)

In [None]:
# Restrict to the bias-analysis columns. The explicit copy makes this an
# independent frame: columns are assigned on it further down, which on a plain
# column selection raises SettingWithCopyWarning.
df_prepilot_bias = df_prepilot_merged[prepilot_bias_cols].copy()

In [None]:
# Two-column subset (application number and application date);
# presumably the input for the enrichment export -- TODO confirm.
enrichment_columns = ['application_dienstnr', 'dtaanvraag']
df_prepilot_to_enrich = df_prepilot_bias[enrichment_columns]

In [None]:
# df_prepilot_to_enrich.to_csv("\data\interim_data\Enrichment_files\20240306_prepilot_data_verrijking.csv")

In [None]:
# Parse the application date to datetime so it can be used in date arithmetic
dtaanvraag_parsed = pd.to_datetime(df_prepilot_bias['dtaanvraag'])
df_prepilot_bias['dtaanvraag'] = dtaanvraag_parsed

In [None]:
# # Still need to add nationality from enriched_dataset
# df_enriched = pd.read_excel("\data\interim_data\Enrichment_files\20240130_Enriched_dataset.xlsx")
# df_enriched_prepilot = df_enriched.loc[df_enriched['LABELDATA'] == 'prepilot']

# We have a new enriched dataset including the negatives.
# Raw string on purpose: in a normal string literal "\202..." is an octal escape
# (and "\d", "\i", "\E" are invalid escapes), which silently corrupts the path.
df_enriched_prepilot = pd.read_excel(r"\data\interim_data\Enrichment_files\20240307_Enriched_dataset_prepilot.xlsx")


In [None]:
# df_enriched_prepilot

In [None]:
# Keep only the join key plus the nationality and date-of-birth columns
enriched_columns = ['DIENSTNR', 'NATIONALITEIT1', 'NATIONALITEIT1_OMSCHRIJVING', 'DTGEBOORTE']
df_enriched_prepilot = df_enriched_prepilot[enriched_columns]

In [None]:
# Left join: attach the enriched columns to every bias-analysis row where
# 'application_dienstnr' equals 'DIENSTNR'.
df_prepilot_bias = df_prepilot_bias.merge(
    df_enriched_prepilot,
    how='left',
    left_on='application_dienstnr',
    right_on='DIENSTNR',
)

In [None]:
# Calculate age (in whole years at application date) and store it in 'Leeftijd'.
# np.timedelta64(1, 'Y') is rejected by pandas 2.x because the 'Y' unit is
# ambiguous; 365.2425 days is the same mean year length it stood for, so the
# resulting values are unchanged. Missing dates yield NaN.
one_year = pd.Timedelta(days=365.2425)
df_prepilot_bias['Leeftijd'] = np.floor((df_prepilot_bias['dtaanvraag'] - df_prepilot_bias['DTGEBOORTE']) / one_year)

In [None]:
# Display the processed frame for a visual check before exporting it
df_prepilot_bias

In [None]:
# Export the processed bias-analysis frame (the index is written as first column).
# Raw string on purpose: in a normal string literal "\202..." is an octal escape
# (and "\d", "\p" are invalid escapes), which silently corrupts the output path.
df_prepilot_bias.to_excel(r"\data\processed_bias_data\20240131_prepilot_processed.xlsx")