# Preprocessing the fraud dataset

In [12]:
# Enable the Intel(R) Extension for Scikit-learn; the call prints a
# confirmation message when the patch is active.
# NOTE(review): presumably this has to run before any `sklearn` import for the
# patched estimators to be picked up — confirm against the sklearnex docs.
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [13]:
import pandas as pd

# Load the raw fraud dataset from disk and preview its first rows.
path_data_fraud = '../data/data_fraud.csv'
df_fraud = pd.read_csv(path_data_fraud)
df_fraud.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [14]:
# (number of rows, number of columns) of the freshly loaded table.
df_fraud.shape

(590540, 434)

In [15]:
# Remove the TransactionID and TransactionDT columns before modelling.
# Rebinding the name (instead of `inplace=True`) together with
# `errors='ignore'` keeps this cell idempotent: running it a second time no
# longer raises KeyError because the columns are already gone.
df_fraud = df_fraud.drop(columns=['TransactionID', 'TransactionDT'], errors='ignore')

## Splitting

In [16]:
from sklearn.model_selection import train_test_split

# Separate the target from the features, then hold out 20% of the rows.
# The fixed seed makes the split reproducible.
y = df_fraud['isFraud']
X = df_fraud.drop(columns='isFraud')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Re-attach the target as the last column so each split travels as one frame.
df_train = X_train.assign(isFraud=y_train)
df_test = X_test.assign(isFraud=y_test)

## Encoding

In [17]:
from sklearn.preprocessing import OneHotEncoder
import os
import pickle

import warnings
warnings.filterwarnings('ignore')

path_data_train_encoded = '../pickle/fraud/data/encoded_train.pkl'
path_data_test_encoded = '../pickle/fraud/data/encoded_test.pkl'

# One-hot encode the object-dtype columns. The encoded frames are cached on
# disk; the encoding is only recomputed when at least one cache file is missing.
if not os.path.exists(path_data_train_encoded) or not os.path.exists(path_data_test_encoded):
    # The set of categorical columns is decided on the training split and the
    # very same columns are then taken from the test split.
    categorical_columns = df_train.select_dtypes(include=['object']).columns
    object_df_train = df_train[categorical_columns]
    object_df_test = df_test[categorical_columns]

    # The encoder is fitted on the training split only; categories that appear
    # only in the test split are encoded as all zeros (handle_unknown='ignore').
    encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
    encoder.fit(object_df_train)

    # Ask the encoder for its output names ("<column>_<category>") rather than
    # rebuilding them by hand: this stays correct whatever category the
    # encoder actually dropped for each column.
    encoded_columns = list(encoder.get_feature_names_out())

    encoded_df_train = pd.DataFrame(encoder.transform(object_df_train).toarray(), columns=encoded_columns)
    encoded_df_test = pd.DataFrame(encoder.transform(object_df_test).toarray(), columns=encoded_columns)

    # The encoded frames carry a fresh 0..n-1 index, so the remaining columns
    # are re-indexed the same way before the column-wise concatenation.
    df_train_non_categorical = df_train.drop(columns=categorical_columns).reset_index(drop=True)
    df_test_non_categorical = df_test.drop(columns=categorical_columns).reset_index(drop=True)

    df_train = pd.concat([df_train_non_categorical, encoded_df_train], axis=1)
    df_test = pd.concat([df_test_non_categorical, encoded_df_test], axis=1)

    # Make sure the cache directory exists, and close every file handle
    # deterministically so the pickles are fully flushed to disk.
    for cache_path in (path_data_train_encoded, path_data_test_encoded):
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(path_data_train_encoded, 'wb') as cache_file:
        pickle.dump(df_train, cache_file)
    with open(path_data_test_encoded, 'wb') as cache_file:
        pickle.dump(df_test, cache_file)
else:
    # NOTE: pickle files must come from a trusted source (this notebook).
    with open(path_data_train_encoded, 'rb') as cache_file:
        df_train = pickle.load(cache_file)
    with open(path_data_test_encoded, 'rb') as cache_file:
        df_test = pickle.load(cache_file)

In [18]:
# Print the (rows, columns) of the encoded training frame, then the test frame.
for encoded_frame in (df_train, df_test):
    print(encoded_frame.shape)

(472432, 2718)
(118108, 2718)


Let's look at the absolute linear correlation between each column and the output (`isFraud`).

In [19]:
# Absolute correlation of every feature with the target, strongest first.
correlation = (
    df_train.drop(columns='isFraud')
    .corrwith(df_train['isFraud'])
    .abs()
    .sort_values(ascending=False)
)
correlation.head()

V257    0.387404
V246    0.371294
V244    0.368243
V242    0.364785
V201    0.332932
dtype: float64

Some columns have a fairly high correlation with the output. We can try to eliminate the variables with the lowest correlation.

In [20]:
import numpy as np
import plotly.graph_objects as go

# Candidate thresholds from 0 up to the strongest observed correlation.
# `correlation[0]` was a positional lookup on a label-indexed Series, which is
# deprecated in pandas 2.x and will be treated as a label lookup; `.max()`
# gives the same value without depending on positional indexing.
thresholds = np.arange(0, correlation.max(), 0.005)

# For each threshold, the number of columns of `df_train` that would remain
# after dropping every feature whose correlation is below it.
nb_col_remaining = [
    df_train.shape[1] - int((correlation < threshold).sum())
    for threshold in thresholds
]

fig = go.Figure()
fig.add_trace(go.Scatter(x=thresholds, y=nb_col_remaining, mode='lines+markers'))
fig.update_layout(title='Number of columns remaining depending on the correlation threshold',
                  xaxis_title='Threshold',
                  yaxis_title='Number of columns remaining')

If we eliminate all the columns whose correlation is lower than 0.05, we still keep a lot of columns, yet we eliminate more than 2000 variables. Most of them come from the encoder, which means that many of the encoded values have no linear correlation with the output.

In [10]:
path_data_train_filtered = '../pickle/fraud/data/filtered_train.pkl'
path_data_test_filtered = '../pickle/fraud/data/filtered_test.pkl'

# Drop every feature whose absolute correlation with the target is below 0.05.
# The filtered frames are cached on disk and only recomputed when at least one
# cache file is missing.
if not os.path.exists(path_data_train_filtered) or not os.path.exists(path_data_test_filtered):
    # Features with an undefined (NaN) correlation compare False here and are
    # therefore kept.
    columns_to_drop = correlation[correlation < 0.05].index
    df_train_filtered = df_train.drop(columns=columns_to_drop)
    df_test_filtered = df_test.drop(columns=columns_to_drop)

    # Make sure the cache directory exists, and close every file handle
    # deterministically so the pickles are fully flushed to disk.
    for cache_path in (path_data_train_filtered, path_data_test_filtered):
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(path_data_train_filtered, 'wb') as cache_file:
        pickle.dump(df_train_filtered, cache_file)
    with open(path_data_test_filtered, 'wb') as cache_file:
        pickle.dump(df_test_filtered, cache_file)
else:
    # NOTE: pickle files must come from a trusted source (this notebook).
    with open(path_data_train_filtered, 'rb') as cache_file:
        df_train_filtered = pickle.load(cache_file)
    with open(path_data_test_filtered, 'rb') as cache_file:
        df_test_filtered = pickle.load(cache_file)

print(df_train_filtered.shape)
print(df_test_filtered.shape)

(472432, 213)
(118108, 213)


## Imputation

### Evaluation of the imputation

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline
from utils import model_evaluation_clf
from time import time
import os

import warnings
warnings.filterwarnings("ignore")

path_results_imputation = "../pickle/fraud/results_imputation.pkl"

# Grid-search the IterativeImputer settings (n_nearest_features x max_iter) by
# scoring a small random forest trained on the imputed data. The result table
# is cached on disk so the (slow) search only runs once.
if not os.path.exists(path_results_imputation):
    results_imputation = pd.DataFrame(columns=['features', 'max_iters', 'CPU time', "Accuracy", "Precision", "Recall", "f1-score", "AUC"])
    nb_res = 0
    features = [5, 10, 20, 50]
    iters = [5, 10, 20, 50]

    # The evaluation uses an inner split of the training data only, so the
    # real test set is never touched while choosing the imputer settings.
    X_train, y_train = df_train_filtered.drop('isFraud', axis=1), df_train_filtered['isFraud']
    X_train_imputation, X_test_imputation, y_train_imputation, y_test_imputation = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    for max_iter in iters:
        for n_nearest_features in features:
            print(f"Training with n_nearest_features: {n_nearest_features}, max_iter: {max_iter}")
            imputer = IterativeImputer(n_nearest_features=n_nearest_features, max_iter=max_iter, random_state=42)
            pipeline = Pipeline(steps=[('i', imputer), ('m', RandomForestClassifier(n_estimators=20, max_depth=10, random_state=42))])
            # Only the fit (imputation + forest training) is timed.
            start_time = time()
            pipeline.fit(X_train_imputation, y_train_imputation)
            end_time = time()
            y_pred = pipeline.predict(X_test_imputation)
            # Named `scores` rather than `eval` so the builtin is not shadowed
            # for the rest of the notebook.
            # NOTE(review): assumes model_evaluation_clf returns a mapping with
            # the keys read below — confirm in utils.
            scores = model_evaluation_clf(y_test_imputation, y_pred)
            # Rows are appended one by one so partial results survive an
            # interrupted run.
            results_imputation.loc[nb_res] = [n_nearest_features, max_iter, round(end_time - start_time, 1), scores['accuracy'], scores['precision'], scores['recall'], scores['f1'], scores['roc_auc']]
            nb_res += 1
            print()

    # Persist the table: without this the cache check above could never
    # succeed and the whole grid search would be repeated on every run.
    os.makedirs(os.path.dirname(path_results_imputation), exist_ok=True)
    with open(path_results_imputation, 'wb') as cache_file:
        pickle.dump(results_imputation, cache_file)
else:
    # NOTE: pickle files must come from a trusted source (this notebook).
    with open(path_results_imputation, 'rb') as cache_file:
        results_imputation = pickle.load(cache_file)

In [None]:
# Order the evaluated imputer settings from highest to lowest AUC and print
# the table as plain text, without the index column.
results_imputation = results_imputation.sort_values('AUC', ascending=False)
print(results_imputation.to_string(index=False))

In [21]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import warnings
warnings.filterwarnings('ignore')

path_data_train_imputed = '../pickle/fraud/data/imputed_train.pkl'
path_data_test_imputed = '../pickle/fraud/data/imputed_test.pkl'

# Fill the missing values with an IterativeImputer fitted on the training
# split. The imputed frames are cached on disk and only recomputed when at
# least one cache file is missing.
if not os.path.exists(path_data_train_imputed) or not os.path.exists(path_data_test_imputed):
    # Impute the features only. Leaving `isFraud` in the matrix would let the
    # imputer use the label as a predictor, i.e. leak the test labels into the
    # test features; the evaluation cell also imputes without the target.
    feature_columns = df_train_filtered.columns.drop('isFraud')

    imputer = IterativeImputer(n_nearest_features=20, random_state=42)
    imputer.fit(df_train_filtered[feature_columns])

    df_fraud_train_imputed = pd.DataFrame(imputer.transform(df_train_filtered[feature_columns]), columns=feature_columns)
    df_fraud_test_imputed = pd.DataFrame(imputer.transform(df_test_filtered[feature_columns]), columns=feature_columns)

    # Re-attach the untouched target by position (the imputed frames have a
    # fresh 0..n-1 index) and restore the original column order.
    df_fraud_train_imputed['isFraud'] = df_train_filtered['isFraud'].to_numpy()
    df_fraud_test_imputed['isFraud'] = df_test_filtered['isFraud'].to_numpy()
    df_fraud_train_imputed = df_fraud_train_imputed[df_train_filtered.columns]
    df_fraud_test_imputed = df_fraud_test_imputed[df_train_filtered.columns]

    # Make sure the cache directory exists, and close every file handle
    # deterministically so the pickles are fully flushed to disk.
    for cache_path in (path_data_train_imputed, path_data_test_imputed):
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(path_data_train_imputed, 'wb') as cache_file:
        pickle.dump(df_fraud_train_imputed, cache_file)
    with open(path_data_test_imputed, 'wb') as cache_file:
        pickle.dump(df_fraud_test_imputed, cache_file)
else:
    # The cached training frame must be bound to the same name as in the
    # branch above (`df_fraud_train_imputed`); it used to be loaded into
    # `df_fraud_imputed`, leaving the name used by the next cell undefined.
    # NOTE: pickle files must come from a trusted source (this notebook).
    with open(path_data_train_imputed, 'rb') as cache_file:
        df_fraud_train_imputed = pickle.load(cache_file)
    with open(path_data_test_imputed, 'rb') as cache_file:
        df_fraud_test_imputed = pickle.load(cache_file)

### Remove multicollinearity with VIF

In [22]:
from utils import delete_multicollinearity

import warnings
warnings.filterwarnings('ignore')

path_data_train_preprocessed = '../pickle/fraud/data/preprocessed_train.pkl'
path_data_test_preprocessed = '../pickle/fraud/data/preprocessed_test.pkl'

# Select a set of non-collinear features and cache the final train/test
# frames; the selection is only recomputed when a cache file is missing.
if not os.path.exists(path_data_train_preprocessed) or not os.path.exists(path_data_test_preprocessed):
    # The selection runs on a 0.5% random sample of the training rows
    # (fixed seed) rather than on the full frame.
    df_sampled = df_fraud_train_imputed.sample(frac=0.005, random_state=42)

    # NOTE(review): presumably removes features whose VIF exceeds 10 and
    # returns the remaining feature columns — confirm in utils.
    df_fraud_train_preprocessed = delete_multicollinearity(df_sampled, 'isFraud', 10)

    # Keep the selected features plus the target, guarding against a
    # duplicated 'isFraud' column should the helper return the target too.
    columns_to_keep = [column for column in df_fraud_train_preprocessed.columns if column != 'isFraud'] + ['isFraud']

    df_fraud_train_final = df_fraud_train_imputed[columns_to_keep]
    df_fraud_test_final = df_fraud_test_imputed[columns_to_keep]

    # Make sure the cache directory exists, and close every file handle
    # deterministically so the pickles are fully flushed to disk.
    for cache_path in (path_data_train_preprocessed, path_data_test_preprocessed):
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(path_data_train_preprocessed, 'wb') as cache_file:
        pickle.dump(df_fraud_train_final, cache_file)
    with open(path_data_test_preprocessed, 'wb') as cache_file:
        pickle.dump(df_fraud_test_final, cache_file)

else:
    # NOTE: pickle files must come from a trusted source (this notebook).
    with open(path_data_train_preprocessed, 'rb') as cache_file:
        df_fraud_train_final = pickle.load(cache_file)
    with open(path_data_test_preprocessed, 'rb') as cache_file:
        df_fraud_test_final = pickle.load(cache_file)

In [23]:
# Size of the final training frame, followed by the names of the columns
# that survived the whole preprocessing (read from the test frame).
print(df_fraud_train_final.shape)
print(list(df_fraud_test_final.columns))

(472432, 53)
['D2', 'D4', 'D5', 'D8', 'D10', 'D15', 'V18', 'V30', 'V34', 'V36', 'V56', 'V62', 'V74', 'V75', 'V87', 'V142', 'V145', 'V147', 'V162', 'V165', 'V169', 'V176', 'V184', 'V220', 'V222', 'V232', 'V239', 'V251', 'V261', 'V281', 'V282', 'V283', 'V303', 'id_01', 'id_04', 'id_07', 'card6_credit', 'R_emaildomain_gmail.com', 'M2_nan', 'M3_T', 'M4_M2', 'M4_nan', 'M6_T', 'M6_nan', 'M9_T', 'id_12_NotFound', 'id_31_chrome 64.0 for android', 'id_31_chrome generic', 'id_37_T', 'DeviceType_mobile', 'DeviceInfo_SM-A300H Build/LRX22G', 'DeviceInfo_hi6210sft Build/MRA58K', 'isFraud']
