<a href="https://colab.research.google.com/github/MDankloff/Cluster-Bias-Disco/blob/main/BAFV2data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [167]:
import pickle
import lightgbm as lgbm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import glob
import random
import os
import joblib
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, roc_auc_score
from sklearn.utils import resample

In [168]:
!pip install dask[dataframe]



In [169]:
from google.colab import drive
drive.mount('/content/drive')
! cd '/content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# LOAD DATA & MODEL

In [170]:
base_path = '/content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF/parquet data'

extension = "parquet" #for smaller "parquet" depending on the downloaded file
data_paths = glob.glob(f"{base_path}/*.{extension}")

def read_dataset(path, ext = extension):
    if ext == "csv":
      return pd.read_csv(path)
    elif ext == "parquet":
      return pd.read_parquet(path)
    else:
      raise ValueError(f"Unsupported file extension: {ext}")

# Extract variant name from the file path (without the extension)
def get_variant(path):
    return os.path.basename(path).split(".")[0]

# Dictionary comprehension to read all CSV files into a dictionary of DataFrames
dataframes = {
    get_variant(path): read_dataset(path) for path in data_paths
}
print(f"Loaded datasets: {list(dataframes.keys())}")

datasets_paths = {
    "Base": base_path + "/Base.parquet", # sampled to best represent original dataset
    "Variant I": base_path + "/Variant I.parquet", # higher group size disparity than base - reducing the size of the minority group from approx 20 - 10% of the dataset
    "Variant II": base_path + "/Variant II.parquet", # higher prevalence disparity than base - one group has 5 x the fraud detection rate of the other while group sizes are equal
    "Variant III": base_path + "/Variant III.parquet", # better separability for one of the groups -
    "Variant IV": base_path + "/Variant IV.parquet", # higher prevalence disparity in train
    "Variant V": base_path + "/Variant V.parquet", # better separability in train for one of the groups
}


Loaded datasets: ['Base', 'Variant I', 'Variant II', 'Variant III', 'Variant IV', 'Variant V']


load best light gbm model from variant 2

In [171]:
# directory containing the model files
model_dir = '/content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF/Best Model per Variant/Accuracy'

# Get list of all model files in the directory
model_files = glob.glob(os.path.join(model_dir, '*.pkl'))

# Dictionary to store loaded models
models = {}

# Load all models from the directory and save them to the dictionary
for model_file in model_files:
    # Load the model
    with open(model_file, 'rb') as f:
        model = joblib.load(f)

    # Extract the model name from the file path (without extension)
    model_name = os.path.basename(model_file).split('.')[0]

    # Add the model to the dictionary
    models[model_name] = model

    # Optional: Save the model back (though it seems redundant here)
    save_path = os.path.join(model_dir, f'{model_name}.pkl')
    joblib.dump(model, save_path)

    print(f"Model '{model_name}' loaded and saved to: {save_path}")

# Accessing the best model for variant 2
modelv2 = models.get("model_Variant II_top_4")

print(modelv2)

Model 'model_Variant II_top_4' loaded and saved to: /content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF/Best Model per Variant/Accuracy/model_Variant II_top_4.pkl
Model 'model_Base_top_4' loaded and saved to: /content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF/Best Model per Variant/Accuracy/model_Base_top_4.pkl
Model 'model_Variant III_top_0' loaded and saved to: /content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF/Best Model per Variant/Accuracy/model_Variant III_top_0.pkl
Model 'model_Variant I_top_4' loaded and saved to: /content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF/Best Model per Variant/Accuracy/model_Variant I_top_4.pkl
Model 'model_Variant V_top_0' loaded and saved to: /content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF/Best Model per Variant/Accuracy/model_Variant V_top_0.pkl
Model 'model_Variant IV_top_6' loaded and saved to: /content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF/Best Model per Variant/Accuracy/model_Variant IV_top_6.pkl
LGBMClassifier(boo

# Explore data

In [172]:
#rename dataset variant 2 and create a copy
v2 = dataframes["Variant II"]
v2_old = v2.copy()

In [173]:
pd.set_option('display.max_columns', 50) # Increase the maximum number of columns displayed in Pandas to 50
pd.set_option('display.max_rows', 50)
v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 32 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   fraud_bool                        1000000 non-null  int64  
 1   income                            1000000 non-null  float64
 2   name_email_similarity             1000000 non-null  float64
 3   prev_address_months_count         1000000 non-null  int64  
 4   current_address_months_count      1000000 non-null  int64  
 5   customer_age                      1000000 non-null  int64  
 6   days_since_request                1000000 non-null  float64
 7   intended_balcon_amount            1000000 non-null  float64
 8   payment_type                      1000000 non-null  object 
 9   zip_count_4w                      1000000 non-null  int64  
 10  velocity_6h                       1000000 non-null  float64
 11  velocity_24h                      1000

In [174]:
v2['payment_type'].unique() #remove because anonymized
v2['days_since_request'].unique() #remove because unclear values

array([0.01673985, 0.01900183, 0.04706417, ..., 0.03555377, 0.02066269,
       0.0074784 ])

In [175]:
print(v2['days_since_request'])

0         0.016740
1         0.019002
2         0.047064
3         0.008007
4         2.513544
            ...   
999995    0.001835
999996    0.023952
999997    0.035554
999998    0.020663
999999    0.007478
Name: days_since_request, Length: 1000000, dtype: float64


In [176]:
v2_old['source'].unique()
v2_old['device_os'].unique()
v2_old['keep_alive_session'].unique()
#v2_old['device_fraud_count'].unique()

array([0, 1])

In [177]:
1 in v2_old['device_fraud_count'].unique()

False

how to deal with missing values

In [178]:
v2_old['device_os'].unique()

array(['other', 'macintosh', 'linux', 'windows', 'x11'], dtype=object)

Master dataset

In [179]:
#FEATURES
META = ['clusters', 'new_clusters']
ERROR = ['errors', 'TP', 'TN', 'FN', 'FP']
REG = ['bank_branch_count_8w', 'credit_risk_score', 'device_os','month', 'session_length_in_minutes', 'email_is_free', 'proposed_credit_limit', 'name_email_similarity',
      'zip_count_4w', 'date_of_birth_distinct_emails_4w', 'phone_mobile_valid', 'has_other_cards', 'foreign_request']
SEN = ['customer_age', 'income'] #protected attributes in BAF paper
DUMMY = ['source_INTERNET', 'Source_TELEAPP', 'device_os_other', 'device_os_macintosh','device_os_linux','device_os_windows', 'device_os_x11']

#FEATURES SCALED
ERROR_scaled = ['errors_scaled']
REG_scaled = ['bank_branch_count_8w_scaled', 'credit_risk_score_scaled', 'device_os_scaled','month_scaled', 'session_length_in_minutes_scaled',
              'email_is_free_scaled', 'proposed_credit_limit_scaled', 'name_email_similarity_scaled', 'zip_count_4w_scaled',
              'date_of_birth_distinct_emails_4w_scaled', 'phone_home_valid_scaled','has_other_cards_scaled', 'foreign_request_scaled']
SEN_scaled = ['customer_age_scaled', 'income_scaled']
DUMMY_scaled = ['source_INTERNET_scaled', 'Source_TELEAPP_scaled', 'device_os_other_scaled', 'device_os_macintosh_scaled', 'device_os_linux_scaled', 'device_os_windows_scaled', 'device_os_x11_scaled']


#SHAP FEATURES
SHAP_REG = ['bank_branch_count_8w_shap', 'credit_risk_score_shap', 'device_os_shap', 'month_shap', 'session_length_in_minutes_shap', 'email_is_free_shap', 'proposed_credit_limit_shap', 'name_email_similarity_shap',
 'zip_count_4w_shap', 'date_of_birth_distinct_emails_4w_shap', 'phone_mobile_valid_shap', 'has_other_cards_shap', 'foreign_request_shap']
SHAP_SEN = ['customer_age_shap', 'income_shap']
SHAP_DUMMY = ['source_INTERNET_shap', 'Source_TELEAPP_shap', 'device_os_other_shap', 'device_os_macintosh_shap','device_os_linux_shap','device_os_windows_shap', 'device_os_x11_shap']

#SHAP FEATURES SCALED
SHAP_REG_scaled = ['bank_branch_count_8w_shap_scaled', 'credit_risk_score_shap_scaled', 'device_os_shap_scaled', 'month_shap_scaled', 'session_length_in_minutes_shap_scaled',
 'email_is_free_shap_scaled', 'proposed_credit_limit_shap_scaled', 'name_email_similarity_shap_scaled', 'zip_count_4w_shap_scaled',
 'date_of_birth_distinct_emails_4w_shap_scaled', 'phone_mobile_valid_shap_scaled', 'has_other_cards_shap_scaled', 'foreign_request_shap_scaled']
SHAP_SEN_scaled = ['customer_age_shap_scaled', 'income_shap_scaled']
SHAP_DUMMY_scaled = ['source_INTERNET_shap_scaled', 'Source_TELEAPP_shap_scaled', 'device_os_other_shap_scaled', 'device_os_macintosh_shap_scaled',
 'device_os_linux_shap_scaled', 'device_os_windows_shap_scaled', 'device_os_x11_shap_scaled']

'''removed features = 'device_fraud_count', 'intended_balcon_amount', 'payment_type', 'days_since_request',
'velocity_6h', 'velocity_24h', 'velocity_4w', 'keep_alive_session', 'prev_address_months_count', 'current_address_months_count', 'phone_home_valid', 'bank_months_count', 'device_distinct_emails_8w', 'housing_status', 'employment_status' '''

"removed features = 'device_fraud_count', 'intended_balcon_amount', 'payment_type', 'days_since_request', \n'velocity_6h', 'velocity_24h', 'velocity_4w', 'keep_alive_session', 'prev_address_months_count', 'current_address_months_count', 'phone_home_valid', 'bank_months_count', 'device_distinct_emails_8w', 'housing_status', 'employment_status' "

In [180]:
v2_old['housing_status'].unique()

array(['BC', 'BA', 'BE', 'BB', 'BD', 'BF', 'BG'], dtype=object)

In [181]:
v2_old['employment_status'].unique()

array(['CB', 'CA', 'CE', 'CD', 'CC', 'CF', 'CG'], dtype=object)

In [182]:
v2_old['source'].unique()

array(['INTERNET', 'TELEAPP'], dtype=object)

In [183]:
v2_old['device_os'].unique()

array(['other', 'macintosh', 'linux', 'windows', 'x11'], dtype=object)

Remove features

In [184]:
features_to_remove = ['device_fraud_count', 'intended_balcon_amount', 'payment_type', 'days_since_request',
'velocity_6h', 'velocity_24h', 'velocity_4w', 'keep_alive_session', 'prev_address_months_count', 'current_address_months_count', 'phone_home_valid', 'bank_months_count', 'device_distinct_emails_8w', 'housing_status', 'employment_status' ]

v2 = v2.drop(columns = features_to_remove)

In [185]:
v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 17 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   fraud_bool                        1000000 non-null  int64  
 1   income                            1000000 non-null  float64
 2   name_email_similarity             1000000 non-null  float64
 3   customer_age                      1000000 non-null  int64  
 4   zip_count_4w                      1000000 non-null  int64  
 5   bank_branch_count_8w              1000000 non-null  int64  
 6   date_of_birth_distinct_emails_4w  1000000 non-null  int64  
 7   credit_risk_score                 1000000 non-null  int64  
 8   email_is_free                     1000000 non-null  int64  
 9   phone_mobile_valid                1000000 non-null  int64  
 10  has_other_cards                   1000000 non-null  int64  
 11  proposed_credit_limit             1000

In [186]:
#get a smaller sample
v2_sample = v2.sample(n= 8000, random_state = 42)

In [187]:
v2_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 987231 to 534950
Data columns (total 17 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   fraud_bool                        8000 non-null   int64  
 1   income                            8000 non-null   float64
 2   name_email_similarity             8000 non-null   float64
 3   customer_age                      8000 non-null   int64  
 4   zip_count_4w                      8000 non-null   int64  
 5   bank_branch_count_8w              8000 non-null   int64  
 6   date_of_birth_distinct_emails_4w  8000 non-null   int64  
 7   credit_risk_score                 8000 non-null   int64  
 8   email_is_free                     8000 non-null   int64  
 9   phone_mobile_valid                8000 non-null   int64  
 10  has_other_cards                   8000 non-null   int64  
 11  proposed_credit_limit             8000 non-null   float64
 12  fore

In [188]:
'''missing_values = v2_sample.isna()
print(missing_values)'''

'missing_values = v2_sample.isna()\nprint(missing_values)'

In [189]:
#print(v2_sample.loc[534950])

Get dummies for objects

In [190]:
dummy_cols = ['source', 'device_os']
v2_sample = pd.get_dummies(v2_sample, columns = dummy_cols)
v2_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 987231 to 534950
Data columns (total 22 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   fraud_bool                        8000 non-null   int64  
 1   income                            8000 non-null   float64
 2   name_email_similarity             8000 non-null   float64
 3   customer_age                      8000 non-null   int64  
 4   zip_count_4w                      8000 non-null   int64  
 5   bank_branch_count_8w              8000 non-null   int64  
 6   date_of_birth_distinct_emails_4w  8000 non-null   int64  
 7   credit_risk_score                 8000 non-null   int64  
 8   email_is_free                     8000 non-null   int64  
 9   phone_mobile_valid                8000 non-null   int64  
 10  has_other_cards                   8000 non-null   int64  
 11  proposed_credit_limit             8000 non-null   float64
 12  fore

# Train RF

In [191]:
X = v2_sample.drop(columns = ['fraud_bool'], axis = 1)
Y = v2_sample['fraud_bool']
Y.value_counts()

Unnamed: 0_level_0,count
fraud_bool,Unnamed: 1_level_1
0,7918
1,82


In [192]:
X = X.astype(int)
Y = Y.astype(int)
X.iloc[7]

Unnamed: 0,732057
income,0
name_email_similarity,0
customer_age,20
zip_count_4w,990
bank_branch_count_8w,9
date_of_birth_distinct_emails_4w,6
credit_risk_score,116
email_is_free,1
phone_mobile_valid,1
has_other_cards,1


In [193]:
# Split the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.7, shuffle=True, stratify=Y)

Have to use the different split here

Upsample minority class

In [204]:
df_majority = v2_sample[v2_sample['fraud_bool'] == 0]
df_minority = v2_sample[v2_sample['fraud_bool'] == 1]

#  minority_ratio (float): The desired ratio of minority samples to majority samples. Default is 0.1.
minority_ratio = 0.1
# Calculate the number of samples for upsampling
n_samples = int(minority_ratio * len(df_majority) / (1 - minority_ratio)) #proportional group sizes
#n_samples = len(df_majority) # equal class sizes

df_minority_upsampled = resample(df_minority, replace=True, n_samples=n_samples, random_state=42)

# Combine the upsampled minority class with the majority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

v2_upsampled = df_upsampled

Change booleans to numeric for shap to work later

Create scaled versions of the features

In [205]:
scaler = StandardScaler()
X_scaled_train = scaler.fit_transform(X_train)
X_scaled_test = scaler.transform(X_test)

#X_scaled = pd.DataFrame(X_scaled, columns = X.columns)

# Name columns as '_scaled'
#X_scaled = X_scaled.rename(columns={col: col + '_scaled' for col in X_scaled.columns})

# Join the DataFrames
#X_all = X.join(X_scaled)

# Display updated dataframe
#X.info()
#

In [196]:
#X_scaled.info()

Train on scaled X (unnecessary for RF but good for SHAP later on)

In [206]:
#training on a scaled X unnecessary for random forest but good for shap later
X_scaled_train, X_scaled_test, Y_train, Y_test = train_test_split(X, Y,
                                                    #test_size = 0.3,
                                                    test_size = 0.7,
                                                                  shuffle = True, stratify = Y)

model = RandomForestClassifier()
#model = modelv2
model.fit(X_scaled_train, Y_train)

#predictions
Yhat = model.predict(X_scaled_test)
len(Yhat)

5600

In [207]:
Y_test.info()

<class 'pandas.core.series.Series'>
Index: 5600 entries, 245149 to 788369
Series name: fraud_bool
Non-Null Count  Dtype
--------------  -----
5600 non-null   int64
dtypes: int64(1)
memory usage: 87.5 KB


# Add Errors

In [208]:
#Create df for predicted and true classes - converting y_hat np to df
predictions = pd.DataFrame()
predictions['predicted_class'] = Yhat.tolist()
predictions = predictions.set_index(X_scaled_test.index)
predictions['true_class'] = Y_test.values

#Calculating the errors with the absolute value
predictions['errors'] = abs(predictions['predicted_class'] - predictions['true_class'])

#adding predictions to test data
compas_w_error = X_scaled_test.merge(predictions, how= 'outer', left_index = True, right_index = True)

errors = compas_w_error['errors']
df_out = compas_w_error.drop(['predicted_class', 'true_class', 'errors'], axis=1)
compas_w_error[compas_w_error['errors']==1]

#accuracy & confusion matrix
accuracy = accuracy_score(predictions['true_class'], predictions['predicted_class'])
print(f"Accuracy: {accuracy:.4f}")
cm = confusion_matrix(predictions['true_class'], predictions['predicted_class'])
print(cm)

Accuracy: 0.9898
[[5543    0]
 [  57    0]]
