## Imports

In [None]:
import os
import sys
# Put the parent directory of the working directory on sys.path (once),
# presumably so the project package imported further down can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import joblib
from collections import defaultdict
import json
import datetime as dt
from pathlib import Path
import pyarrow

# Show long cell values (such as BSNs) without truncation
pd.options.display.max_colwidth = 1000

# Cap how many rows and columns are rendered, for convenience
pd.options.display.max_rows = 50
pd.options.display.max_columns = 50

# Configure the root logger so INFO-level messages are emitted
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [None]:
from wpi_uitkeringsfraude.project_paths import ARTIFACT_PATH, DATA_PATH, CONFIG_PATH, INFO_PATH
from wpi_uitkeringsfraude.model.manage_model_info import load_feature_list
# from wpi_uitkeringsfraude.model.build_model import filter_application_handling
from wpi_uitkeringsfraude.settings.settings import WPISettings
# from wpi_uitkeringsfraude.components import SocratesDienstPersoonJoin, SocratesAdresFeatures
from wpi_uitkeringsfraude.scorer import Scorer

In [None]:
# This is the entire dataset for the period of the training data (before the
# filters needed to obtain the actual training set are applied).
# The path is a raw string: in a normal string literal "\t" (in "\training" and
# "\transformed_data") is interpreted as a TAB character and "\d" is an invalid
# escape sequence, so the file would never be found.
df = pd.read_parquet(r"\data\training\transformed_data.parquet")

In [None]:
# Remove the column display limit so wide frames are rendered in full
pd.options.display.max_columns = None

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

In [None]:
# According to the documentation the final dataset holds around 3400 applications,
# about 55% of which carry a positive label, so matching counts here make it
# fairly certain that this is indeed the final training dataset.
onderzoek_mask = df['is_onderzoek_hh'] == 1
display(df.loc[onderzoek_mask, 'onderzoekswaardig'].value_counts())
# NOTE(review): 1860 and 1538 look like the two label counts shown above, typed in
# by hand -- confirm they still match when the data changes.
print(1860 / (1860 + 1538))

In [None]:
# Keep only the rows flagged with is_onderzoek_hh == 1; these form the training data
is_training_row = df['is_onderzoek_hh'] == 1
df_training = df[is_training_row]

In [None]:
# Disable pandas' chained-assignment (SettingWithCopy) warning for the rest of the
# session; later cells add columns to frames that were derived from other frames.
pd.set_option('mode.chained_assignment', None)

## Add Slimme Check Score

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

# Sort on subject number first so the row order going into the split is fixed
df_training = df_training.sort_values(by='subjectnr')

# Target is the 'onderzoekswaardig' label; every other column goes into X
y = df_training['onderzoekswaardig']
X = df_training.drop(columns='onderzoekswaardig')

# A fixed random_state makes the shuffled 85/15 split reproducible
seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=seed,
    shuffle=True,
)

In [None]:
# Locations of the two pickled model artifacts that are compared below.
# Raw strings are required: in a normal string literal "\202" is an octal escape
# (character U+0082), which silently corrupts the file names "2022..." and
# "2024...", and "\M" is an invalid escape sequence.
model_file_prepilot = Path(r'\Models\20220523_model_used_in_prepilot.pkl')
model_file_pilot = Path(r'\Models\20240228_wpi_model_pilot_31.pkl')

In [None]:
# Test set: add the score and thresholded prediction of both models to X_test
models = {'model_before_reweighing': model_file_prepilot,
          'model_after_reweighing': model_file_pilot}
# Decision threshold applied to the positive-class probability
thr = 0.63

# Freeze the input columns before anything is added, so that the score and
# prediction columns created in the loop are never passed into a model.
test_feature_columns = list(X_test.columns)

# Loop-invariant, so evaluated once. These names are not used in this cell; they
# are kept because other notebook cells may read them.
num_cols, cat_cols = load_feature_list()
label = "onderzoekswaardig"

for model_name, model_file in models.items():
    # NOTE(review): joblib.load unpickles arbitrary objects -- only load trusted files.
    model_dict = joblib.load(model_file)
    model = model_dict["model"]
    prep = model[:-1]  # all but the last pipeline step, i.e. the transformers only
    clf = model[-1]  # the last pipeline step, i.e. the actual model
    # Run inference once and derive both columns from the same probabilities
    scores = model.predict_proba(X_test[test_feature_columns])[:, 1]
    X_test[f"{model_name}_score"] = scores
    # 1 when the score reaches the threshold, otherwise 0
    X_test[f"{model_name}_prediction"] = (scores >= thr).astype('int')

In [None]:
# Training set: add the score and thresholded prediction of both models to X_train.
# Relies on `thr` having been set by the test-set cell.
models = {'model_before_reweighing': model_file_prepilot,
          'model_after_reweighing': model_file_pilot}

# Freeze the input columns before anything is added, so that the score and
# prediction columns created in the loop are never passed into a model.
train_feature_columns = list(X_train.columns)

# Loop-invariant, so evaluated once. These names are not used in this cell; they
# are kept because other notebook cells may read them.
num_cols, cat_cols = load_feature_list()
label = "onderzoekswaardig"

for model_name, model_file in models.items():
    # NOTE(review): joblib.load unpickles arbitrary objects -- only load trusted files.
    model_dict = joblib.load(model_file)
    model = model_dict["model"]
    prep = model[:-1]  # all but the last pipeline step, i.e. the transformers only
    clf = model[-1]  # the last pipeline step, i.e. the actual model
    # Run inference once and derive both columns from the same probabilities
    scores = model.predict_proba(X_train[train_feature_columns])[:, 1]
    X_train[f"{model_name}_score"] = scores
    # 1 when the score reaches the threshold, otherwise 0
    X_train[f"{model_name}_prediction"] = (scores >= thr).astype('int')

In [None]:
# Put the true label back next to the features of each split
for split_features, split_labels in ((X_train, y_train), (X_test, y_test)):
    split_features['onderzoekswaardig'] = split_labels

In [None]:
from sklearn.metrics import confusion_matrix, precision_score

# Compare both models on the test split: confusion matrices first, then precision
y_true_test = X_test['onderzoekswaardig']
pred_before = X_test['model_before_reweighing_prediction']
pred_after = X_test['model_after_reweighing_prediction']

conf_before_reweigh = confusion_matrix(y_true_test, pred_before)
conf_after_reweigh = confusion_matrix(y_true_test, pred_after)

for heading, matrix in (
    ("Conf matrix before reweighing", conf_before_reweigh),
    ("Conf matrix after reweighing", conf_after_reweigh),
):
    print(heading)
    print(matrix)

print()
for heading, predictions in (
    ("Precision before reweighing", pred_before),
    ("Precision after reweighing", pred_after),
):
    print(heading)
    print(precision_score(y_true_test, predictions))


In [None]:
# Inspect the dimensions of the confusion matrix of the pre-reweighing model.
conf_before_reweigh.shape

## Process for bias analysis

In [None]:
# Record which split every row came from, then stack both splits into one frame
for split_frame, split_name in ((X_train, 'Training_train'), (X_test, 'Training_test')):
    split_frame['dataset'] = split_name
df_training = pd.concat((X_train, X_test))

In [None]:
# Only the identifier, application date, label and model outputs are needed for the bias analysis
columns_filter_training = [
    'application_dienstnr',
    'dtaanvraag',
    'onderzoekswaardig',
    'model_before_reweighing_score',
    'model_before_reweighing_prediction',
    'model_after_reweighing_score',
    'model_after_reweighing_prediction',
]

df_training_interim = df_training.loc[:, columns_filter_training]

In [None]:
# df_training_interim = df_training_interim.rename(columns = {'application_dienstnr':'dienstnr'})

In [None]:
# Columns exported for enrichment ("verrijking"): service number and application date
columns_verrijking = [
    'application_dienstnr',
    'dtaanvraag',
]
df_training_interim_verrijking = df_training_interim.loc[:, columns_verrijking]

In [None]:
# Write the enrichment columns to a CSV file (without the index) at a path relative
# to the current working directory.
df_training_interim_verrijking.to_csv("20240111_training_data_verrijking.csv", index=False)

In [None]:
# Preview the first rows of the training frame.
df_training.head()

## Merge enriched data and training data for preprocessing

In [None]:
# Columns carried into the bias analysis: identifier, a selection of features,
# both models' outputs, the label and the split marker
bias_columns = [
    'application_dienstnr',
    'received_same_product_last_year',
    'applied_for_same_product_last_year',
    'days_since_last_dienst_end',
    'days_since_last_relocation',
    'active_address_count',
    'dtaanvraag',
    'is_parttime_parent',
    'is_fulltime_parent',
    'model_before_reweighing_score',
    'model_before_reweighing_prediction',
    'model_after_reweighing_score',
    'model_after_reweighing_prediction',
    'onderzoekswaardig',
    'dataset',
]
df_training_bias = df_training.loc[:, bias_columns]

In [None]:
# Nationality (and the other enrichment attributes) still has to be added from the
# enriched dataset.
# The path is a raw string: in a normal string literal "\202" is an octal escape
# (character U+0082) that corrupts the file name "2024...", and "\d", "\i" and
# "\E" are invalid escape sequences.
df_enriched = pd.read_excel(r"\data\interim_data\Enrichment_files\20240201_Enriched_dataset.xlsx")

In [None]:
# Preview the first rows of the enriched dataset.
df_enriched.head()

In [None]:
# Keep the enrichment rows labelled 'training', limited to the join key and the
# personal attributes (date of birth, nationality, gender)
is_training_label = df_enriched['LABELDATA'] == 'training'
enriched_columns = [
    'DIENSTNR',
    'DTGEBOORTE',
    'NATIONALITEIT1',
    'NATIONALITEIT1_OMSCHRIJVING',
    'GESLACHT',
]
df_enriched_training = df_enriched.loc[is_training_label, enriched_columns]

In [None]:
# Cast the join key to int before it is merged against DIENSTNR.
# NOTE(review): presumably needed to match DIENSTNR's dtype in the Excel file -- confirm.
df_training_bias['application_dienstnr'] = df_training_bias['application_dienstnr'].astype(int)

In [None]:
# Left join: every training row is kept and gets the enrichment attributes where
# application_dienstnr matches DIENSTNR
df_training_bias = df_training_bias.merge(
    df_enriched_training,
    how='left',
    left_on='application_dienstnr',
    right_on='DIENSTNR',
)

In [None]:
# Display the merged bias-analysis frame.
df_training_bias

In [None]:
# Count the rows per label value in the bias-analysis frame.
df_training_bias['onderzoekswaardig'].value_counts()

In [None]:
import datetime as dt
from dateutil.relativedelta import relativedelta
import numpy as np

In [None]:
# Calculate the age in whole years at the application date and store it in a new
# column 'Leeftijd'.
# The age is derived from calendar components instead of dividing the date
# difference by np.timedelta64(1, 'Y'): that unit is an average year length, which
# misstates the age around birthdays, and recent pandas versions reject the
# ambiguous 'Y' unit altogether.
aanvraag_dates = pd.to_datetime(df_training_bias['dtaanvraag'])
geboorte_dates = pd.to_datetime(df_training_bias['DTGEBOORTE'])

# True where the birthday has not yet occurred in the year of the application
birthday_pending = (aanvraag_dates.dt.month < geboorte_dates.dt.month) | (
    (aanvraag_dates.dt.month == geboorte_dates.dt.month)
    & (aanvraag_dates.dt.day < geboorte_dates.dt.day)
)

# Kept as float, as before, so rows with a missing date stay NaN
df_training_bias['Leeftijd'] = (
    aanvraag_dates.dt.year - geboorte_dates.dt.year - birthday_pending.astype(int)
).astype(float)


In [None]:
# Persist the processed bias-analysis frame to Excel.
# The path is a raw string: in a normal string literal "\202" is an octal escape
# (character U+0082) that corrupts the file name "2024...", and "\d" and "\p" are
# invalid escape sequences.
df_training_bias.to_excel(r"\data\processed_bias_data\20240131_training_processed.xlsx")

In [None]:
# Count the rows per value of the GESLACHT column.
df_training_bias['GESLACHT'].value_counts()