In [None]:
import numpy as np
import config as cfg
import pandas as pd
from tools import file_reader, file_writer, explainer
from utility import metrics
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import xgboost as xgb
import os
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn import preprocessing

from sklearn.model_selection import KFold

In [None]:
from utils_Copy import *

In [None]:
def LFR_custom(df_train,y_train,lfr=None):
    """Run the AIF360 ``LFR`` pre-processor on the numeric feature columns.

    The columns ``Gender``, ``BirthYear``, ``LoanPeriod`` and ``NumberAts``
    together with the ``Fall`` label are wrapped in a ``BinaryLabelDataset``
    and passed through ``LFR``. All remaining columns are passed through
    unchanged and re-attached afterwards.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Feature frame. Must contain the four columns listed above.
    y_train : pandas.DataFrame or pandas.Series
        Labels with a column (or name) ``"Fall"``. Must share its index with
        ``df_train`` because the two are joined on the index.
    lfr : fitted LFR instance or None
        ``None`` fits a new ``LFR`` on this data (``fit_transform``);
        otherwise the given instance is only applied (``transform``).

    Returns
    -------
    (pandas.DataFrame, LFR)
        The transformed features without ``Gender`` and without ``Fall``,
        carrying the same index as the input, and the ``LFR`` instance used.
    """
    # Imported lazily so the notebook can run without aif360 when this
    # mitigation is switched off.
    from aif360.algorithms.preprocessing import LFR
    from aif360.datasets import BinaryLabelDataset

    # Columns handed to LFR; every other column is passed through as-is.
    X_col_names_f=['Gender', 'BirthYear', 'LoanPeriod', 'NumberAts']

    # Join features and labels (aligned on the index).
    data=pd.concat([df_train,y_train],axis=1)

    passthrough=data.drop(columns=X_col_names_f).copy() # everything except the LFR columns (still holds "Fall")
    lfr_input=data[X_col_names_f+["Fall"]].copy() # LFR columns plus the label

    # Create the BinaryLabelDataset
    df_BLD = BinaryLabelDataset(favorable_label='1',
                                unfavorable_label='0',
                                df=lfr_input,
                                label_names=['Fall'],
                                protected_attribute_names=["Gender"],
                                unprivileged_protected_attributes=['0'])

    # Fit a new transformer, or reuse the one fitted on the training data
    if lfr is None:
        lfr = LFR(privileged_groups=[{"Gender": 1}],
                  unprivileged_groups=[{"Gender": 0}])
        rp_df = lfr.fit_transform(df_BLD)
    else:
        rp_df = lfr.transform(df_BLD)

    # Column names in the order the dataset stores them
    all_col_names=df_BLD.feature_names+df_BLD.label_names

    # Rebuild a DataFrame from the transformed arrays. The input index is
    # reused so that the index-aligned concat below pairs each transformed row
    # with its own pass-through columns even when the caller did not reset the
    # index beforehand.
    rp_df_pd = pd.DataFrame(np.hstack([rp_df.features,rp_df.labels]),
                            columns=all_col_names,
                            index=data.index)

    # OBS: Somehow gender is also transformed! So we drop it!
    rp_df_pd = rp_df_pd.drop(columns=["Gender"])

    # Re-attach the pass-through columns
    transformed_data = pd.concat([rp_df_pd,passthrough], axis=1)

    # "Fall" is present twice here (transformed + original); drop both.
    transformed_data=transformed_data.drop(columns=["Fall"])

    return transformed_data,lfr

In [None]:
def DI_remove_custom(df_train,RP_level=1.0,drop_d=False,y_train=None):
    """Repair the numeric feature columns with AIF360's ``DisparateImpactRemover``.

    The columns ``Gender``, ``BirthYear``, ``LoanPeriod`` and ``NumberAts`` are
    wrapped in a ``BinaryLabelDataset`` (with a constant dummy label, since
    the remover does not use the label) and repaired. All remaining columns
    are passed through unchanged and re-attached afterwards.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Feature frame. Must contain ``BirthYear``, ``LoanPeriod`` and
        ``NumberAts``; must also contain ``Gender`` unless it is supplied via
        ``y_train``.
    RP_level : float
        Passed to ``DisparateImpactRemover(repair_level=...)``.
    drop_d : bool
        When true, ``y_train`` is joined onto ``df_train`` before the repair
        and the ``Gender`` column is removed from the returned frame.
    y_train : pandas.DataFrame or None
        Despite the name, the only caller in this notebook passes the
        ``Gender`` column here (not the target). Must share its index with
        ``df_train``. Only used when ``drop_d`` is true.

    Returns
    -------
    pandas.DataFrame
        Repaired columns followed by the pass-through columns, carrying the
        same index as the input.
    """
    # Imported lazily so the notebook can run without aif360 when this
    # mitigation is switched off.
    from aif360.algorithms.preprocessing import DisparateImpactRemover
    from aif360.datasets import BinaryLabelDataset

    data=df_train
    if drop_d:
        # Bring the protected attribute back in so the remover can use it.
        data=pd.concat([df_train,y_train],axis=1)

    # Columns handed to the remover; every other column is passed through.
    X_col_names_f=['Gender', 'BirthYear', 'LoanPeriod', 'NumberAts']
    passthrough=data.drop(columns=X_col_names_f).copy()
    repair_input=data[X_col_names_f].copy()

    repair_input["dummy"]=1 # this is a dummy variable, since DI remover dont use y.

    # Create the BinaryLabelDataset
    df_BLD = BinaryLabelDataset(favorable_label='1',
                                unfavorable_label='0',
                                df=repair_input,
                                label_names=['dummy'],
                                protected_attribute_names=["Gender"],
                                unprivileged_protected_attributes=['0'])

    # Repair the data
    di = DisparateImpactRemover(repair_level=RP_level)
    rp_df = di.fit_transform(df_BLD)

    # Column names in the order the dataset stores them
    all_col_names=df_BLD.feature_names+df_BLD.label_names

    # Rebuild a DataFrame from the repaired arrays. The input index is reused
    # so that the index-aligned concat below pairs each repaired row with its
    # own pass-through columns even when the caller did not reset the index.
    rp_df_pd = pd.DataFrame(np.hstack([rp_df.features,rp_df.labels]),
                            columns=all_col_names,
                            index=data.index)

    # Re-attach the pass-through columns and remove the dummy label
    transformed_data = pd.concat([rp_df_pd,passthrough], axis=1)
    transformed_data_train=transformed_data.drop(columns=["dummy"])

    if drop_d:
        transformed_data_train=transformed_data_train.drop(columns=["Gender"])

    return transformed_data_train


In [None]:
# Column holding the protected attribute, and the column the model predicts.
procted_col_name = "Gender"
target_name = y_col_name = "Fall"

In [None]:
# Read the input table from the data folder (path is relative to the notebook).
pathRoot = "../../Data_air/"
pathFall = f"{pathRoot}Fall_count_clusterOHE_std.csv"
df = pd.read_csv(pathFall)

In [None]:
# Label of the mitigation setup being run (example value).
titel_mitigation = "DI remove no gender"

In [None]:
# Switches selecting which mitigation steps the training loop applies.
dropping_D = True
gender_swap = False
DI_remove = True
# For LFR: set dropping_D=True, but without removing Gender from X.
LFR_mitigation = False

In [None]:
# Output folder for this mitigation setup.
PATH_orig = f"../Xgboost/models/{titel_mitigation}/"
PATH = PATH_orig

# Create the output folder the first time this setup is run.
if not os.path.exists(PATH):
    os.makedirs(PATH)
    print("Created new path!: ",PATH)

In [None]:

# Feature columns fed to the model, in the order they are selected from the
# data frame: three numeric columns, the "Ats_*" indicator columns and the
# twenty "Cluster_*" columns. "Gender" and "Ats_0" are deliberately left out.
X_col_names = [
    # "Gender",
    "BirthYear",
    "LoanPeriod",
    "NumberAts",
    "Ats_Polstring",
    "Ats_Mobilitystokke",
    "Ats_Belysning",
    "Ats_Underlag",
    "Ats_ToiletforhøjereStativ",
    "Ats_Signalgivere",
    "Ats_EldrevneKørestole",
    "Ats_Forstørrelsesglas",
    "Ats_Nødalarmsystemer",
    "Ats_MobilePersonløftere",
    "Ats_TrappelifteMedPlatforme",
    "Ats_Badekarsbrætter",
    "Ats_Albuestokke",
    "Ats_MaterialerOgRedskaberTilAfmærkning",
    "Ats_Ryglæn",
    # "Ats_0",
    "Ats_GanghjælpemidlerStøtteTilbehør",
    "Ats_Støttebøjler",
    "Ats_Lejringspuder",
    "Ats_Strømpepåtagere",
    "Ats_Dørtrin",
    "Ats_Spil",
    "Ats_BordePåStole",
    "Ats_Drejeskiver",
    "Ats_Toiletstole",
    "Ats_LøftereStationære",
    "Ats_Madmålingshjælpemidler",
    "Ats_Fodbeskyttelse",
    "Ats_Ståløftere",
    "Ats_Stole",
    "Ats_Sengeborde",
    "Ats_Toiletter",
    "Ats_ToiletforhøjereFaste",
    "Ats_Påklædning",
    "Ats_Brusere",
    "Ats_VævsskadeLiggende",
    "Ats_Døråbnere",
    "Ats_ServeringAfMad",
    "Ats_TrappelifteMedSæder",
    "Ats_SæderTilMotorkøretøjer",
    "Ats_KørestoleManuelleHjælper",
    "Ats_Gangbukke",
    "Ats_Rollatorer",
    "Ats_TryksårsforebyggendeSidde",
    "Ats_Fastnettelefoner",
    "Ats_Bækkener",
    "Ats_Vendehjælpemidler",
    "Ats_Sanseintegration",
    "Ats_Kørestolsbeskyttere",
    "Ats_Arbejdsstole",
    "Ats_Løftesejl",
    "Ats_KørestoleForbrændingsmotor",
    "Ats_Løftestropper",
    "Ats_Stiger",
    "Ats_TransportTrapper",
    "Ats_DrivaggregaterKørestole",
    "Ats_Emballageåbnere",
    "Ats_ToiletforhøjereLøse",
    "Ats_Hårvask",
    "Ats_PersonløftereStationære",
    "Ats_Madrasser",
    "Ats_Vinduesåbnere",
    "Ats_Læsestativer",
    "Ats_KørestoleManuelleDrivringe",
    "Ats_Sædepuder",
    "Ats_UdstyrCykler",
    "Ats_Karkludsvridere",
    "Ats_Vaskeklude",
    "Ats_Sengeudstyr",
    "Ats_Madlavningshjælpemidler",
    "Ats_Skohorn",
    "Ats_GribetængerManuelle",
    "Ats_Hvilestole",
    "Ats_EldrevneKørestoleStyring",
    "Ats_BærehjælpemidlerTilKørestole",
    "Ats_LøftegalgerSeng",
    "Ats_Høreforstærkere",
    "Ats_Kalendere",
    "Ats_Stokke",
    "Ats_Løftegalger",
    "Ats_Ure",
    "Ats_StøttegrebFlytbare",
    "Ats_Forflytningsplatforme",
    "Ats_RamperFaste",
    "Ats_Rygehjælpemidler",
    "Ats_Personvægte",
    "Ats_Manøvreringshjælpemidler",
    "Ats_Overtøj",
    "Ats_Lydoptagelse",
    "Ats_Gangborde",
    "Ats_Ståstøttestole",
    "Ats_RamperMobile",
    "Ats_Bærehjælpemidler",
    "Ats_Badekarssæder",
    "Ats_Siddemodulsystemer",
    "Ats_Videosystemer",
    "Ats_Siddepuder",
    "Ats_Sengeheste",
    "Ats_Stolerygge",
    "Ats_Rulleborde",
    "Ats_Sengeforlængere",
    "Ats_Madningsudstyr",
    "Ats_Brusestole",
    "Ats_Flerpunktsstokke",
    "Ats_SengebundeMedMotor",
    "Ats_Cykler",
    "Ats_CykelenhederKørestole",
    "Ats_Stokkeholdere",
    "Ats_Toiletarmstøtter",
    "Ats_Coxitstole",
    "Ats_Toiletsæder",
    "Ats_Rebstiger",
    "Ats_Forhøjerklodser",
    # Cluster_0 ... Cluster_19
    *[f"Cluster_{cluster_id}" for cluster_id in range(20)],
]

In [None]:
# Repeated 5-fold cross-validation (10 shuffles x 5 folds = 50 models).
# For every fold the selected mitigation is applied, an XGBoost classifier is
# trained, and the held-out predictions are written to one CSV per model and
# pooled into a single "all_test_data.csv".
modelcounter=0

# Leading column order of the pooled results; columns that only appear in the
# per-fold frames (protected attribute, "Model") are placed after these.
result_columns=list(X_col_names)+["Fall"]+["output"]+["output_prob"]

if LFR_mitigation==True:
    # LFR_custom strips Gender from the features, so it is not a leading
    # result column (it is re-attached per fold when dropping_D is set).
    result_columns=[col for col in result_columns if col!="Gender"]

# One frame per fold, concatenated once after the loop
# (DataFrame.append is gone in pandas >= 2.0 and was quadratic here).
fold_frames=[]

# Training settings that do not change between folds
optimize_rounds = True
early_stopping_rounds = 50

for new_seed in range(1,11):
    # NOTE: df is reassigned, so every seed reshuffles the previous shuffle and
    # re-running this cell without reloading df yields different splits.
    df = df.sample(frac=1, random_state=new_seed).reset_index(drop=True)

    X = df[X_col_names]
    y = df[target_name].to_frame()

    # Class-imbalance weight: negatives per positive (expects exactly two classes)
    neg, pos = np.bincount(y[target_name])
    scale_pos_weight = neg / pos

    params = {"n_estimators": 400,
            "objective": "binary:logistic",
            "scale_pos_weight": scale_pos_weight,
            "use_label_encoder": False,
            "learning_rate": 0.1,
            "eval_metric": "logloss",
            "seed": 0
    }

    model = xgb.XGBClassifier(**params)
    #skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=new_seed)
    skf=KFold(n_splits=5, random_state=new_seed, shuffle=True)

    # Out-of-fold probabilities. Created as float so that assigning
    # predict_proba output does not rely on an implicit int -> float upcast.
    y_valid_pred = pd.Series(0.0, index=y.index, name=target_name)
    valid_acc, valid_pre, valid_recall, valid_roc_auc = list(), list(), list(), list()
    #for train_index, valid_index in skf.split(X_train, y_train):
    for train_index, valid_index in skf.split(X):
        print(f"Running model {modelcounter}")

        X_train_split, X_valid_split = X.iloc[train_index,:], X.iloc[valid_index,:]
        y_train_split, y_valid_split = y.iloc[train_index], y.iloc[valid_index]

        if gender_swap==True:
            # Augment the training data with a copy whose Gender is flipped (0 <-> 1).
            # Requires "Gender" to be part of X_col_names.
            X_train_split_copy=X_train_split.copy()
            y_train_split_copy=y_train_split.copy()

            X_train_split_copy["Gender"]=(X_train_split_copy["Gender"]-1)*(-1)

            X_train_split=pd.concat([X_train_split,X_train_split_copy])

            y_train_split=pd.concat([y_train_split,y_train_split_copy])

        if DI_remove==True:
            # The protected attribute is passed through the y_train argument;
            # train and validation folds are repaired independently.
            X_train_split=DI_remove_custom(X_train_split.reset_index(drop=True),drop_d=dropping_D,y_train=df[procted_col_name].iloc[train_index].to_frame().reset_index(drop=True))
            X_valid_split=DI_remove_custom(X_valid_split.reset_index(drop=True),drop_d=dropping_D,y_train=df[procted_col_name].iloc[valid_index].to_frame().reset_index(drop=True))

        if LFR_mitigation==True:
            # Fit LFR on the training fold, then apply the fitted transformer
            # to the validation fold.
            X_train_split,lfr=LFR_custom(X_train_split.reset_index(drop=True),
                                         y_train_split.reset_index(drop=True),
                                         lfr=None)
            X_valid_split,lfr=LFR_custom(X_valid_split.reset_index(drop=True),
                                         y_valid_split.reset_index(drop=True),
                                         lfr)

        if optimize_rounds:
            # NOTE(review): the validation fold drives early stopping and is
            # also the fold the reported predictions come from, so the scores
            # are likely optimistic.
            # NOTE(review): passing eval_metric / early_stopping_rounds to
            # fit() is deprecated since xgboost 1.6 and dropped in later
            # releases - move them to the constructor when upgrading.
            eval_set=[(X_valid_split, y_valid_split)]
            fit_model = model.fit(X_train_split, y_train_split,
                                    eval_set=eval_set,
                                    eval_metric=metrics.gini_xgb,
                                    early_stopping_rounds=early_stopping_rounds,
                                    verbose=False)

        else:
            fit_model = model.fit(X_train_split, y_train_split)

        # Probability of the positive class for the held-out rows
        pred = fit_model.predict_proba(X_valid_split)[:,1]
        y_valid_pred.iloc[valid_index] = pred

        # Hard labels at a 0.5 threshold
        y_valid_scores = (y_valid_pred.iloc[valid_index] > 0.5)


        #### SAVE DATA####
        y_true_pd=y_valid_split.reset_index(drop=True)
        y_pred_pd=y_valid_scores.astype(int).to_frame().reset_index(drop=True).rename(columns={target_name:"output"})
        y_pred_prob_pd=pd.DataFrame(pred, columns = ["output_prob"])


        df_subset=pd.concat([X_valid_split.reset_index(drop=True),y_true_pd,y_pred_pd,y_pred_prob_pd],axis=1)


        if dropping_D==True:
            # Re-attach the protected attribute so the saved results can be
            # analysed per group.
            df_subset[procted_col_name]=list(df[procted_col_name].iloc[valid_index])

        df_subset["Model"]="Model"+str(modelcounter)

        df_subset.to_csv(PATH+"model"+str(modelcounter)+"_test_data.csv")

        fold_frames.append(df_subset)

        valid_acc.append(accuracy_score(y_valid_split, y_valid_scores))
        valid_pre.append(precision_score(y_valid_split, y_valid_scores))
        valid_recall.append(recall_score(y_valid_split, y_valid_scores))
        valid_roc_auc.append(roc_auc_score(y_valid_split, y_valid_pred.iloc[valid_index]))

        modelcounter=modelcounter+1

# Pool all folds, keeping the template columns first and any extra per-fold
# columns after them.
df_test=pd.concat(fold_frames, ignore_index=True)
extra_columns=[col for col in df_test.columns if col not in result_columns]
df_test=df_test.reindex(columns=result_columns+extra_columns)

df_test.to_csv(PATH+"all_test_data.csv")
print("The full test data lies here:",PATH+"all_test_data.csv")