<a href="https://colab.research.google.com/github/MDankloff/Cluster-Bias-Disco/blob/main/baf_exploratory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!git clone https://github.com/MDankloff/Cluster-Bias-Disco.git
#!cd Cluster-Bias-Disco/

In [2]:
from google.colab import drive
drive.mount('/content/drive')
! cd '/content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF'

Mounted at /content/drive


In [19]:
import pickle
import lightgbm as lgbm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import glob
import os
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [4]:
base_path = '/content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF/parquet data'

extension = "parquet" #for smaller "parquet" depending on the downloaded file
data_paths = glob.glob(f"{base_path}/*.{extension}")

def read_dataset(path, ext = extension):
    if ext == "csv":
      return pd.read_csv(path)
    elif ext == "parquet":
      return pd.read_parquet(path)
    else:
      raise ValueError(f"Unsupported file extension: {ext}")

# Extract variant name from the file path (without the extension)
def get_variant(path):
    return os.path.basename(path).split(".")[0]

# Dictionary comprehension to read all CSV files into a dictionary of DataFrames
dataframes = {
    get_variant(path): read_dataset(path) for path in data_paths
}
print(f"Loaded datasets: {list(dataframes.keys())}")

datasets_paths = {
    "Base": base_path + "/Base.parquet", # sampled to best represent original dataset
    "Variant I": base_path + "/Variant I.parquet", # higher group size disparity than base - reducing the size of the minority group from approx 20 - 10% of the dataset
    "Variant II": base_path + "/Variant II.parquet", # higher prevalence disparity than base - one group has 5 x the fraud detection rate of the other while group sizes are equal
    "Variant III": base_path + "/Variant III.parquet", # better separability for one of the groups -
    "Variant IV": base_path + "/Variant IV.parquet", # higher prevalence disparity in train
    "Variant V": base_path + "/Variant V.parquet", # better separability in train for one of the groups
}

Loaded datasets: ['Base', 'Variant I', 'Variant II', 'Variant III', 'Variant IV', 'Variant V']


In [5]:
#Rename datasets
base = dataframes['Base']
variant1 = dataframes['Variant I']
variant2 = dataframes['Variant II']
variant3 = dataframes['Variant III']
variant4 = dataframes['Variant IV']
variant5 = dataframes['Variant V']

In [16]:
# directory containing the model files
model_dir = '/content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF/Best 2 models per variant'

# Get list of all model files in the directory
model_files = glob.glob(os.path.join(model_dir, '*.pkl'))

# Dictionary to store loaded models
models = {}

# Load all models from the directory and save them to the dictionary
for model_file in model_files:
    # Load the model
    with open(model_file, 'rb') as f:
        model = joblib.load(f)

    # Extract the model name from the file path (without extension)
    model_name = os.path.basename(model_file).split('.')[0]

    # Add the model to the dictionary
    models[model_name] = model

    # Optional: Save the model back (though it seems redundant here)
    save_path = os.path.join(model_dir, f'{model_name}.pkl')
    joblib.dump(model, save_path)

    print(f"Model '{model_name}' loaded and saved to: {save_path}")

# Example: Accessing specific models
modelb1 = models.get("model_Base_4")  # Replace key with the desired model name
modelb2 = models.get("model_Base_7")
modelv11 = models.get("model_Variant I_4")
modelv12 = models.get("model_Variant I_7")
modelv21 = models.get("model_Variant II_4")
modelv22 = models.get("model_Variant II_7")
modelv31 = models.get("model_Variant III_4")
modelv32 = models.get("model_Variant III_7")
modelv41 = models.get("model_Variant IV_4")
modelv42 = models.get("model_Variant IV_7")
modelv51 = models.get("model_Variant V_4")
modelv52 = models.get("model_Variant V_7")

Model 'model_Base_4' loaded and saved to: /content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF/Best 2 models per variant/model_Base_4.pkl
Model 'model_Variant III_7' loaded and saved to: /content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF/Best 2 models per variant/model_Variant III_7.pkl
Model 'model_Variant II_4' loaded and saved to: /content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF/Best 2 models per variant/model_Variant II_4.pkl
Model 'model_Variant I_4' loaded and saved to: /content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF/Best 2 models per variant/model_Variant I_4.pkl
Model 'model_Base_7' loaded and saved to: /content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF/Best 2 models per variant/model_Base_7.pkl
Model 'model_Variant II_7' loaded and saved to: /content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF/Best 2 models per variant/model_Variant II_7.pkl
Model 'model_Variant IV_4' loaded and saved to: /content/drive/MyDrive/Mirthe_Supervision /Paper#3/BAF/Best 2 mo

In [18]:
variant1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 32 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   fraud_bool                        1000000 non-null  int64  
 1   income                            1000000 non-null  float64
 2   name_email_similarity             1000000 non-null  float64
 3   prev_address_months_count         1000000 non-null  int64  
 4   current_address_months_count      1000000 non-null  int64  
 5   customer_age                      1000000 non-null  int64  
 6   days_since_request                1000000 non-null  float64
 7   intended_balcon_amount            1000000 non-null  float64
 8   payment_type                      1000000 non-null  object 
 9   zip_count_4w                      1000000 non-null  int64  
 10  velocity_6h                       1000000 non-null  float64
 11  velocity_24h                      1000

In [21]:
X = variant1.drop(['fraud_bool'], axis=1)
Y = variant1['fraud_bool']

In [25]:
#Try out for modelv11, modelv12
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.7, shuffle = True, stratify = Y)

model = models.get("model_Variant I_4_model")
model.fit(X_train, Y_train)

Yhat = model.predict(X_test)
len(Yhat)

AttributeError: 'dict' object has no attribute 'fit'

In [26]:
#Try out for modelv11, modelv12
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.7, shuffle = True, stratify = Y)

# The original line was: model = models["model_Variant I_4"]
# This likely accesses a dictionary of model metrics instead of the model itself.
# Assuming your models are stored with keys like "model_Variant I_4_model" (or similar),
# you should access the actual model object.

# Assuming that the model name follows the pattern "model_Variant I_4_model"
model = models.get("model_Variant I_4_model")
# If the above fails, you need to find the correct key for the model
# Print out the keys of the "models" dictionary to see available model names
print(models.keys())


# Alternatively, if the model is stored within a nested dictionary under a key like 'model', try:
# model = models["model_Variant I_4"]["model"]

# Ensure 'model' now holds the actual model object before proceeding
if model is not None:
    model.fit(X_train, Y_train)

    Yhat = model.predict(X_test)
    len(Yhat)
else:
    print("Model not found. Please check the key used to access the model.")

dict_keys(['model_Base_4', 'model_Variant III_7', 'model_Variant II_4', 'model_Variant I_4', 'model_Base_7', 'model_Variant II_7', 'model_Variant IV_4', 'model_Variant I_7', 'model_Variant III_4', 'model_Variant V_4', 'model_Variant IV_7', 'model_Variant V_7'])
Model not found. Please check the key used to access the model.


In [27]:
#Try out for modelv11, modelv12
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.7, shuffle = True, stratify = Y)

model_data = models.get("model_Variant I_4") # Get the dictionary containing the model and other data
if model_data is not None and "model" in model_data:
    model = model_data["model"] # Access the model object from within the dictionary
    model.fit(X_train, Y_train)

    Yhat = model.predict(X_test)
    len(Yhat)
else:
    print("Model not found or structured differently. Check the contents of 'models'.")

Model not found or structured differently. Check the contents of 'models'.


In [2]:
import pprint # Import pprint for better formatting

# Print the contents of "model_Variant I_4" for inspection
pprint.pprint(["/content/model_Base_top_0.pkl"])

['/content/model_Base_top_0.pkl']


In [37]:
pprint.pprint(models["model_Variant V_7"])

{'fpr Older': 0.08309047555967677,
 'fpr Younger': 0.015993436855103352,
 'recall': 0.33295625942684764,
 'recall Older': 0.582962962962963,
 'recall Younger': 0.07296466973886329}
