<a href="https://colab.research.google.com/github/MDankloff/Cluster-Bias-Disco/blob/main/BAF_exploratory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

import glob
import os

# Load Bank Account Fraud (BAF) Tabular Dataset Suite

In [76]:
dir_path = '/content/drive/MyDrive/Mirthe Supervision Map /Paper 3a: FC Bias Disco/BAF'

'''if os.path.exists(dir_path):
    print("Directory exists")
    print("Files in directory:")
    print(os.listdir(dir_path))  # This will list all files in the directory
else:
    print("Directory does not exist")'''

extension = "csv"
data_paths = glob.glob(f"{dir_path}/*.{extension}")

'''# Check if any CSV files are found
if not data_paths:
    raise FileNotFoundError(f"No files with extension '{extension}' found in the directory.")
else:
    print("CSV files found:")
    print(data_paths)'''

def read_dataset(path):
    return pd.read_csv(path)

# Extract variant name from the file path (without the extension)
def get_variant(path):
    return os.path.basename(path).split(".")[0]

# Dictionary comprehension to read all CSV files into a dictionary of DataFrames
dataframes = {
    get_variant(path): read_dataset(path) for path in data_paths
}
print(f"Loaded datasets: {list(dataframes.keys())}")

Loaded datasets: ['Base', 'Variant I', 'Variant II', 'Variant III', 'Variant IV', 'Variant V']


In [77]:
base = dataframes['Base'] # sampled to best represent original dataset
variant1 = dataframes['Variant I'] # higher group size disparity than base
variant2 = dataframes['Variant II'] # higher prevalance disparity than base
variant3 = dataframes['Variant III'] # better separability of one of the groups
variant4 = dataframes['Variant IV'] # higher prevalance disparity in train
variant5 = dataframes['Variant V'] # better separability in train for one of the groups
base.info()
#variant1.info()
#base.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 32 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   fraud_bool                        1000000 non-null  int64  
 1   income                            1000000 non-null  float64
 2   name_email_similarity             1000000 non-null  float64
 3   prev_address_months_count         1000000 non-null  int64  
 4   current_address_months_count      1000000 non-null  int64  
 5   customer_age                      1000000 non-null  int64  
 6   days_since_request                1000000 non-null  float64
 7   intended_balcon_amount            1000000 non-null  float64
 8   payment_type                      1000000 non-null  object 
 9   zip_count_4w                      1000000 non-null  int64  
 10  velocity_6h                       1000000 non-null  float64
 11  velocity_24h                      1000

In [81]:
'fraud_bool' in base.columns
'fraud_bool' in variant1.columns
'fraud_bool' in variant2.columns
'fraud_bool' in variant3.columns
'fraud_bool' in variant4.columns
'fraud_bool' in variant5.columns

True

# **Possible preprocessing steps:**

-Handling missing values

-Getting rid of redundant features

-Converting objects to integers

-Grouping aka one hot (dummy) encoding

binning certain features together

-Deciding on distance measures (euclidean, manhattan, gower)

-Scaling

-how to ‘measure’ error: variance vs mean vs accuracy, 1 vs rest, 1 vs best, 1 to 1

-min points and epsilon in DBSCAN

-model choice clustering


# Handling missing values

In [59]:
'''base_no_mv= base.dropna()
#base_no_mv.info()

base_no_mv = base_no_mv[base_no_mv['prev_address_months_count'] != -1]
base_no_mv.info()

print(base_no_mv['prev_address_months_count'])
'''

<class 'pandas.core.frame.DataFrame'>
Index: 287080 entries, 0 to 0
Data columns (total 31 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   income                            287080 non-null  float64
 1   name_email_similarity             287080 non-null  float64
 2   prev_address_months_count         287080 non-null  int64  
 3   current_address_months_count      287080 non-null  int64  
 4   customer_age                      287080 non-null  int64  
 5   days_since_request                287080 non-null  float64
 6   intended_balcon_amount            287080 non-null  float64
 7   payment_type                      287080 non-null  object 
 8   zip_count_4w                      287080 non-null  int64  
 9   velocity_6h                       287080 non-null  float64
 10  velocity_24h                      287080 non-null  float64
 11  velocity_4w                       287080 non-null  float64
 12

# One-hot-encode categorical features for collecting the errors
scaling or one-hot-encoding

In [93]:
Categorical_cols = base.select_dtypes(include=['object']).columns
print(Categorical_cols)

Index(['payment_type', 'employment_status', 'housing_status', 'source',
       'device_os'],
      dtype='object')


In [94]:
print(base['payment_type'].unique())
print(base['employment_status'].unique())
print(base['housing_status'].unique())
print(base['source'].unique())
print(base['device_os'].unique())

['AA' 'AD' 'AB' 'AC' 'AE']
['CB' 'CA' 'CC' 'CF' 'CD' 'CE' 'CG']
['BC' 'BE' 'BD' 'BA' 'BB' 'BF' 'BG']
['INTERNET' 'TELEAPP']
['linux' 'other' 'windows' 'x11' 'macintosh']


In [97]:
base_onehot = pd.get_dummies(base, Categorical_cols)
print(base_onehot['payment_type_AA'].unique())

[ True False]


# TRAIN

In [98]:
X_base = base_onehot.drop(['fraud_bool'], axis =1)
Y_base = base_onehot['fraud_bool']

In [101]:
X_base_train, X_base_test, Y_base_train, Y_base_test = train_test_split(X_base, Y_base, test_size=0.2, shuffle = True, stratify = Y_base) # Corrected the order of variables returned by train_test_split
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_base_train, Y_base_train)

Y_base_hat = model.predict(X_base_test)
accuracy = accuracy_score(Y_base_test, Y_base_hat)
print("Accuracy:", accuracy)

Accuracy: 0.98895


Collect errors

In [102]:
predictions = pd.DataFrame()
predictions['predicted_class'] = Y_base_hat.tolist()
predictions = predictions.set_index(X_base_test.index)
predictions['true_class'] = Y_base_test

#calculating the errors with the absolute value
predictions['errors'] = abs(predictions['predicted_class'] - predictions['true_class'])

#adding predictions to the test data
base_onehot_error = X_base_test.merge(predictions, how= 'outer', left_index = True, right_index = True)

errors = base_onehot_error['errors']
df_out = base_onehot_error.drop(['predicted_class', 'true_class', 'errors'], axis=1)
base_onehot_error[base_onehot_error['errors']==1]

cm = confusion_matrix(predictions['true_class'], predictions['predicted_class'])
print(cm)

[[197788      6]
 [  2204      2]]
