# Preparation of data

Here we prepare the data that will be fed into our models.

## Imports

In [1]:
import os
from pathlib import Path

import pandas as pd
import pandas.api.types as pdt
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib 

from src import paths, utils
from src.io_ops import read_csv_safely


import scipy.stats as stats

## Load the data

In [2]:
# Location of the raw dataset file inside the project's raw-data directory.
BASE_PATH = paths.RAW_DATA_DIR / "Base.csv"

# Load through the project helper (its exact behaviour lives in src.io_ops and is not shown here).
df_base = read_csv_safely(BASE_PATH)
# Display the column index to confirm what was loaded.
df_base.columns

Index(['fraud_bool', 'income', 'name_email_similarity',
       'prev_address_months_count', 'current_address_months_count',
       'customer_age', 'days_since_request', 'intended_balcon_amount',
       'payment_type', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
       'velocity_4w', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'employment_status',
       'credit_risk_score', 'email_is_free', 'housing_status',
       'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
       'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
       'session_length_in_minutes', 'device_os', 'keep_alive_session',
       'device_distinct_emails_8w', 'device_fraud_count', 'month'],
      dtype='object')

## Remove unwanted columns

As we saw in notebook 01, the column "device_fraud_count" is constant and therefore irrelevant for our model, so we remove it.

In [3]:

# Drop the constant column by reassignment (no in-place mutation) and tolerate
# its absence, so that re-running this cell does not raise a KeyError.
df_base = df_base.drop(columns=["device_fraud_count"], errors="ignore")
df_base.columns

Index(['fraud_bool', 'income', 'name_email_similarity',
       'prev_address_months_count', 'current_address_months_count',
       'customer_age', 'days_since_request', 'intended_balcon_amount',
       'payment_type', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
       'velocity_4w', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'employment_status',
       'credit_risk_score', 'email_is_free', 'housing_status',
       'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
       'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
       'session_length_in_minutes', 'device_os', 'keep_alive_session',
       'device_distinct_emails_8w', 'month'],
      dtype='object')

## Train, Test, Val split.

We will divide the data into 3 splits, using the month column to define the division (months 0–4 for training, 5–6 for validation and 7 for testing). We first list the categorical columns separately, as they have to be treated differently when fed into the model.

In [4]:
# Column roles: categorical features, the label, and the month used only for splitting.
categorical_cols = ['device_os', 'employment_status', 'housing_status', 'payment_type', 'source']
target_col = 'fraud_bool'
index_col = 'month'

# Every remaining column is treated as numerical.
non_numeric = set(categorical_cols) | {target_col, index_col}
num_cols = [name for name in df_base.columns if name not in non_numeric]
print(f"Numerical columns size: {len(num_cols)}")

Numerical columns size: 24


In [5]:
# Chronological split on the month column (bounds inclusive):
# months 0-4 -> train, 5-6 -> validation, 7 -> test.
month_values = df_base[index_col]
train_df = df_base.loc[(month_values >= 0) & (month_values <= 4)].copy()
val_df = df_base.loc[(month_values >= 5) & (month_values <= 6)].copy()
test_df = df_base.loc[month_values == 7].copy()

Next we encode the categorical columns and scale the numerical columns.

In [6]:
# Fit one LabelEncoder per categorical column on the training split only and
# keep it in a dictionary so the same mapping can be reused later if needed.
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))

    # Build the label -> code lookup once instead of calling le.transform()
    # for every single row, which is needlessly slow on large splits.
    code_by_label = {str(label): code for code, label in enumerate(le.classes_)}

    for split_df in (val_df, test_df):
        # Cast to str exactly as for the training split so the lookup keys
        # match; labels not seen during training are encoded as -1.
        split_df[col] = (
            split_df[col].astype(str).map(code_by_label).fillna(-1).astype("int64")
        )
    encoders[col] = le

In [7]:
# Standardize the numerical features with a StandardScaler fitted on the
# training split only, then apply that same fitted transformation to every split.
scaler = StandardScaler()
scaler.fit(train_df[num_cols])
for split_df in (train_df, val_df, test_df):
    split_df[num_cols] = scaler.transform(split_df[num_cols])


In [8]:
# The month column only served to build the splits, so remove it from each one.
train_df, val_df, test_df = [
    split.drop(columns=[index_col]) for split in (train_df, val_df, test_df)
]

In [9]:
# Preview the first rows of the processed training split.
train_df.head(n=5)

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,phone_mobile_valid,bank_months_count,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w
0,0,-0.825977,1.658176,-0.409867,-0.680442,0.52219,-0.205566,4.546606,0,-0.630271,...,0.36885,-0.155614,-0.523999,1.907532,-0.164372,0,0.99291,0,0.877526,-0.113902
1,0,0.883512,0.391289,-0.409867,0.024036,-1.131329,-0.204957,-0.485509,3,-0.066291,...,0.36885,-0.732435,-0.523999,1.907532,-0.164372,0,-0.54686,2,0.877526,-0.113902
2,0,0.883512,1.69319,-0.187747,-0.801524,0.52219,-0.204554,-0.516725,1,-0.596375,...,0.36885,1.574849,-0.523999,-0.659191,-0.164372,0,1.771801,3,-1.139567,-0.113902
3,0,0.199716,-0.097252,-0.143323,-0.801524,-0.304569,-0.205519,-0.534881,1,1.652012,...,0.36885,-0.814838,-0.523999,-0.659191,-0.164372,0,0.872105,0,0.877526,-0.113902
4,0,1.225409,1.163204,-0.409867,-0.636412,0.52219,0.834317,1.85277,0,0.574895,...,0.36885,1.245237,-0.523999,-0.659191,-0.164372,0,-0.501462,2,-1.139567,-0.113902


## Extracting the splits

In [10]:
# Make sure the output directory exists (it may be missing on a fresh checkout),
# then write each split to its own parquet file without the index.
processed_dir = Path(paths.PROCESSED_DATA_DIR)
processed_dir.mkdir(parents=True, exist_ok=True)
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    split_df.to_parquet(processed_dir / f"{split_name}.parquet", index=False)

## Extra

In [11]:
# Optionally, save the fitted encoders and scaler for later use (uncomment to enable)
#joblib.dump(encoders, paths.PROCESSED_DATA_DIR / "label_encoders.joblib")
#joblib.dump(scaler, paths.PROCESSED_DATA_DIR / "scaler.joblib")

In [13]:
# To load the splits elsewhere, add this code to the beginning of any script
# where the data is needed. It is self-contained, so it carries its own imports.
import joblib
import pandas as pd
import torch

from src import paths

# Optional: restore the fitted preprocessors. These files only exist if the
# corresponding joblib.dump lines were uncommented and run beforehand.
# encoders = joblib.load(paths.PROCESSED_DATA_DIR / "label_encoders.joblib")
# scaler = joblib.load(paths.PROCESSED_DATA_DIR / "scaler.joblib")

train_df = pd.read_parquet(paths.PROCESSED_DATA_DIR / "train.parquet")
val_df = pd.read_parquet(paths.PROCESSED_DATA_DIR / "val.parquet")
test_df = pd.read_parquet(paths.PROCESSED_DATA_DIR / "test.parquet")

categorical_cols = ['device_os', 'employment_status', 'housing_status', 'payment_type', 'source']
target_col = 'fraud_bool'
num_cols = [c for c in train_df.columns if c not in categorical_cols + [target_col]]


def _split_to_tensors(df):
    """Convert one split into (continuous, categorical, target) tensors.

    Continuous features and the target are returned as float32 tensors and
    the categorical codes as long tensors, using the column lists defined
    above in this cell.
    """
    return (
        torch.tensor(df[num_cols].values, dtype=torch.float32),
        torch.tensor(df[categorical_cols].values, dtype=torch.long),
        torch.tensor(df[target_col].values, dtype=torch.float32),
    )


# Then prepare the tensors for each split:
X_train_cont, X_train_cat, y_train = _split_to_tensors(train_df)
X_val_cont, X_val_cat, y_val = _split_to_tensors(val_df)
X_test_cont, X_test_cat, y_test = _split_to_tensors(test_df)


In [14]:
# Sanity check: shapes of the continuous, categorical and target training tensors.
tuple(t.shape for t in (X_train_cont, X_train_cat, y_train))

(torch.Size([675666, 24]), torch.Size([675666, 5]), torch.Size([675666]))