# Preprocessing fraud dataset

In [16]:
import pandas as pd

df_fraud = pd.read_csv('data/data_fraud.csv')
df_fraud.head(10)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
5,2987005,0,86510,49.0,W,5937,555.0,150.0,visa,226.0,...,,,,,,,,,,
6,2987006,0,86522,159.0,W,12308,360.0,150.0,visa,166.0,...,,,,,,,,,,
7,2987007,0,86529,422.5,W,12695,490.0,150.0,visa,226.0,...,,,,,,,,,,
8,2987008,0,86535,15.0,H,2803,100.0,150.0,visa,226.0,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
9,2987009,0,86536,117.0,W,17399,111.0,150.0,mastercard,224.0,...,,,,,,,,,,


In [17]:
df_fraud.shape

(590540, 434)

We have of lot of columns so we have to reduce the dimension

In [19]:
df_fraud.drop(columns=['TransactionID', 'TransactionDT'], inplace=True)

In [20]:
df_fraud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 432 entries, isFraud to DeviceInfo
dtypes: float64(399), int64(2), object(31)
memory usage: 1.9+ GB


We have a lot of object columns, which we will need to convert to numerical columns.

In [21]:
nb_missing_values = df_fraud.isnull().sum().sum()
percentage_missing_values = round(100*nb_missing_values / (df_fraud.shape[0] * df_fraud.shape[1]), 1)
print(f'Missing values: {nb_missing_values} ({percentage_missing_values}%)')

Missing values: 115523073 (45.3%)


We have a lot of missing values: more than 45%

In [22]:
missing_data_count = df_fraud.isnull().sum()
missing_data_percentage = round((missing_data_count / len(df_fraud)) * 100, 1)

missing_data_stats = pd.DataFrame({
    'Missing data (count)': missing_data_count,
    'Missing data (%)': missing_data_percentage
})

missing_data_stats.sort_values(by='Missing data (%)', ascending=False).head(10)

Unnamed: 0,Missing data (count),Missing data (%)
id_24,585793,99.2
id_08,585385,99.1
id_23,585371,99.1
id_07,585385,99.1
id_21,585381,99.1
id_22,585371,99.1
id_27,585371,99.1
id_26,585377,99.1
id_25,585408,99.1
dist2,552913,93.6


For some columns, the missing data percentage is very high. We will have to see if we delete them or if the missing values give information.

## Encoding

To encode object data, we use a OneHotEncoder

In [23]:
from sklearn.preprocessing import OneHotEncoder

object_df = df_fraud.select_dtypes(include=['object'])

encoder = OneHotEncoder(drop='first')
encoded_data = encoder.fit_transform(object_df)

categories = encoder.categories_

encoded_columns = []
for i, col in enumerate(object_df.columns):
    encoded_columns.extend([f"{col}_{category}" for category in categories[i][1:]])

encoded_df = pd.DataFrame(encoded_data.toarray(), columns=encoded_columns)

df_fraud = df_fraud.drop(columns=object_df.columns)
df_fraud = pd.concat([df_fraud, encoded_df], axis=1)

In [24]:
df_fraud.shape

(590540, 2832)

Now we have an even higher number of columns.

Let's see the linear correlation between the columns and the output.

In [25]:
correlation = df_fraud.drop('isFraud', axis=1).corrwith(df_fraud['isFraud']).apply(abs).sort_values(ascending=False)
correlation.head()

V257    0.383060
V246    0.366878
V244    0.364129
V242    0.360590
V201    0.328005
dtype: float64

Some columns have a quite high correlation with the output. We can try to eliminate the variables with the lowest correlation.

In [26]:
import numpy as np
import plotly.graph_objects as go

fig = go.Figure()
thresholds = np.arange(0, correlation[0], 0.005)
nb_col_remaining = []

for threshold in thresholds:
    columns_to_drop = correlation[correlation < threshold].index
    nb_col_remaining.append(df_fraud.shape[1] - len(columns_to_drop))

fig.add_trace(go.Scatter(x=thresholds, y=nb_col_remaining, mode='lines+markers'))
fig.update_layout(title='Number of columns remaining depending on the correlation threshold',
                  xaxis_title='Threshold',
                  yaxis_title='Number of columns remaining')


If we eliminate all the columns which have a correlation lower than 0.05, we still keep a lot of columns. But we eliminate more than 2000 variable, most of them are from the encoder which mean some values encoded don't have correlation with the output.

In [27]:
threshold = 0.05
columns_to_drop = correlation[correlation < threshold].index

In [28]:
df_fraud_filtered = df_fraud.drop(columns=columns_to_drop)

In [29]:
df_fraud_filtered.shape

(590540, 216)

In [30]:
df_fraud_filtered.to_csv('data/data_fraud_filtered1.csv', index=False)

## Preprocessing

### Load encoded data

In [31]:
import pandas as pd

df_fraud_filtered = pd.read_csv('data/data_fraud_filtered1.csv')
df_fraud_filtered.head()

Unnamed: 0,isFraud,card3,D1,D2,D4,D5,D6,D7,D8,D10,...,id_35_nan,id_36_nan,id_37_T,id_37_nan,id_38_nan,DeviceType_mobile,DeviceType_nan,DeviceInfo_SM-A300H Build/LRX22G,DeviceInfo_hi6210sft Build/MRA58K,DeviceInfo_nan
0,0,150.0,14.0,,,,,,,13.0,...,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
1,0,150.0,0.0,,0.0,,,,,0.0,...,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
2,0,150.0,0.0,,0.0,,,,,0.0,...,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
3,0,150.0,112.0,112.0,94.0,0.0,,,,84.0,...,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
4,0,150.0,0.0,,,,,,,,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


Now, we are going to impute values to train models on it.

### Imputation of data

In [32]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import warnings
warnings.filterwarnings('ignore')

imputer = IterativeImputer(n_nearest_features=20, random_state=42)
imputer.fit(df_fraud_filtered)
df_fraud_imputed = imputer.transform(df_fraud_filtered)
df_fraud_imputed = pd.DataFrame(df_fraud_imputed, columns=df_fraud_filtered.columns)

In [33]:
df_fraud_imputed.to_csv('data/data_fraud_imputed1.csv', index=False)

Now we can reduce again the dimension eliminating multicolinearity. To do this, we are going to use the Variance Inflation Factor (VIF). A VIF higher than 10 reveals a high multicolinearity so we can delete these variables.

Calculating the VIF require a lot of time so we are going to use a random sample to eliminate the multicolinearity.

### Delete multicolinearity with VIF on a sample

In [34]:
import pandas as pd

df_fraud_imputed = pd.read_csv('data/data_fraud_imputed1.csv')

In [35]:
size_sample = 5000
df_sampled = df_fraud_imputed.sample(n=size_sample, random_state=42)

In [36]:
from utils import delete_multicollinearity

import warnings
warnings.filterwarnings('ignore')

df_fraud_preprocessed = delete_multicollinearity(df_sampled, 'isFraud', 10)

Dropped column id_15_nan with VIF: inf (23s)
Dropped column M3_nan with VIF: inf (24s)
Dropped column M7_nan with VIF: inf (24s)
Dropped column M2_nan with VIF: inf (24s)
Dropped column M9_nan with VIF: inf (24s)
Dropped column id_29_nan with VIF: inf (23s)
Dropped column id_38_nan with VIF: inf (24s)
Dropped column id_37_nan with VIF: inf (23s)
Dropped column id_36_nan with VIF: inf (23s)
Dropped column id_35_nan with VIF: inf (23s)
Dropped column id_18 with VIF: 11525.2 (22s)
Dropped column DeviceType_nan with VIF: 5303.4 (22s)
Dropped column V155 with VIF: 4703.2 (22s)
Dropped column V247 with VIF: 4043.5 (21s)
Dropped column D7 with VIF: 3432.2 (21s)
Dropped column id_26 with VIF: 2923.8 (22s)
Dropped column V111 with VIF: 2291.9 (21s)
Dropped column id_22 with VIF: 2042.4 (20s)
Dropped column V186 with VIF: 1997.3 (20s)
Dropped column id_32 with VIF: 1974.7 (20s)
Dropped column V153 with VIF: 1918.6 (19s)
Dropped column V249 with VIF: 1760.6 (19s)
Dropped column V197 with VIF: 158

In [37]:
columns_to_keep = list(df_fraud_preprocessed.columns) + ['isFraud']
df_fraud_final = df_fraud_imputed[columns_to_keep]

In [38]:
df_fraud_final.shape

(590540, 57)

Now we have 57 columns to train our models.

In [39]:
print(df_fraud.columns)

Index(['isFraud', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5',
       'addr1', 'addr2', 'dist1', 'dist2',
       ...
       'DeviceInfo_verykoolS5524', 'DeviceInfo_verykoolS5525',
       'DeviceInfo_verykoolS5530 Build/LMY47I', 'DeviceInfo_verykools4009',
       'DeviceInfo_verykools5004', 'DeviceInfo_verykools5034',
       'DeviceInfo_verykools5035', 'DeviceInfo_vivo',
       'DeviceInfo_xs-Z47b7VqTMxs', 'DeviceInfo_nan'],
      dtype='object', length=2832)


In [40]:
df_fraud_final.to_csv('data/data_preprocessed/data_fraud1.csv', index=False)