# Preprocessing fraud dataset

In [1]:
import pandas as pd

df_fraud = pd.read_csv('../data/data_fraud.csv')
df_fraud.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [2]:
df_fraud.drop(columns=['TransactionID', 'TransactionDT'], inplace=True)

In [3]:
missing_data_count = df_fraud.isnull().sum()
missing_data_percentage = round((missing_data_count / len(df_fraud)) * 100, 1)

missing_data_stats = pd.DataFrame({
    'Missing data (count)': missing_data_count,
    'Missing data (%)': missing_data_percentage
})

missing_data_stats.sort_values(by='Missing data (%)', ascending=False, inplace=True)

In [4]:
col2drop = missing_data_percentage[missing_data_percentage > 75].index
df_fraud.drop(columns=col2drop, inplace=True)

We have a lot of object columns, which we will need to convert to numerical columns.

In [5]:
object_df = df_fraud.select_dtypes(include=['object'])

object_df.nunique().sort_values(ascending=False)

P_emaildomain    59
ProductCD         5
card4             4
card6             4
M4                3
M1                2
M2                2
M3                2
M5                2
M6                2
M7                2
M8                2
M9                2
dtype: int64

## Encoding

To encode object data, we use a OneHotEncoder

In [6]:
from sklearn.preprocessing import OneHotEncoder

object_df = df_fraud.select_dtypes(include=['object'])

encoder = OneHotEncoder(drop='first')
encoded_data = encoder.fit_transform(object_df)

categories = encoder.categories_

encoded_columns = []
for i, col in enumerate(object_df.columns):
    encoded_columns.extend([f"{col}_{category}" for category in categories[i][1:]])

encoded_df = pd.DataFrame(encoded_data.toarray(), columns=encoded_columns)

df_fraud = df_fraud.drop(columns=object_df.columns)
df_fraud = pd.concat([df_fraud, encoded_df], axis=1)

In [7]:
df_fraud.shape

(590540, 301)

## Imputation

In [8]:
from sklearn.model_selection import train_test_split

df_fraud = df_fraud.sample(frac=1, random_state=42)

X = df_fraud.drop('isFraud', axis=1)
y = df_fraud['isFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
df_train = pd.concat([X_train, y_train], axis=1)
X_dev, X_test, y_dev, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)
df_dev = pd.concat([X_dev, y_dev], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

### Evaluation of the imputation

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import recall_score
from time import time

features = [5, 10, 20, 50]
iters = [5, 10, 20, 50]

for max_iter in iters:
    for n_nearest_features in features:
        print(f"n_nearest_features: {n_nearest_features}, max_iter: {max_iter}")
        imputer = IterativeImputer(n_nearest_features=n_nearest_features, max_iter=max_iter, random_state=42)
        pipeline = Pipeline(steps=[('i', imputer), ('m', RandomForestClassifier(n_estimators=20, max_depth=10, random_state=42))])
        start = time()
        pipeline.fit(X_train, y_train)
        end = time()
        y_pred = pipeline.predict(X_dev)
        recall = recall_score(y_dev, y_pred)
        print(f"recall: {round(recall, 3)} ({round(end - start, 1)}s)")
        print()

n_nearest_features: 5, max_iter: 5




recall: 0.254 (311.9s)

n_nearest_features: 10, max_iter: 5




recall: 0.252 (448.2s)

n_nearest_features: 20, max_iter: 5




recall: 0.256 (699.6s)

n_nearest_features: 50, max_iter: 5




recall: 0.253 (2091.7s)

n_nearest_features: 5, max_iter: 10




recall: 0.251 (483.0s)

n_nearest_features: 10, max_iter: 10




recall: 0.246 (741.3s)

n_nearest_features: 20, max_iter: 10




recall: 0.255 (1108.5s)

n_nearest_features: 50, max_iter: 10




recall: 0.247 (3915.4s)

n_nearest_features: 5, max_iter: 20




recall: 0.248 (928.5s)

n_nearest_features: 10, max_iter: 20




recall: 0.242 (1419.1s)

n_nearest_features: 20, max_iter: 20




recall: 0.247 (2116.4s)

n_nearest_features: 50, max_iter: 20




recall: 0.256 (12304.0s)

n_nearest_features: 5, max_iter: 50




recall: 0.258 (2333.4s)

n_nearest_features: 10, max_iter: 50




recall: 0.245 (3506.1s)

n_nearest_features: 20, max_iter: 50




recall: 0.247 (5216.3s)

n_nearest_features: 50, max_iter: 50




recall: 0.24 (19058.4s)



In [9]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import warnings
warnings.filterwarnings('ignore')

imputer = IterativeImputer(n_nearest_features=5, max_iter=20, random_state=42)
imputer.fit(df_train)
df_train_imputed = pd.DataFrame(imputer.transform(df_train), columns=df_train.columns)
df_test_imputed = pd.DataFrame(imputer.transform(df_test), columns=df_test.columns)

## Delete multicolinearity with VIF

In [10]:
df_sampled = df_train_imputed.sample(frac=1, random_state=42)

cols_V = [col for col in df_sampled.columns if col.startswith('V')] + ['isFraud']
cols_card = [col for col in df_sampled.columns if col.startswith('card')] + ['isFraud']
cols_C = [col for col in df_sampled.columns if col.startswith('C')] + ['isFraud']
cols_D = [col for col in df_sampled.columns if col.startswith('D')] + ['isFraud']
cols_M = [col for col in df_sampled.columns if col.startswith('M')] + ['isFraud']

df_card = df_sampled[[col for col in df_sampled.columns if col.startswith('card')] + ['isFraud']]
df_C = df_sampled[[col for col in df_sampled.columns if col.startswith('C')] + ['isFraud']]
df_D = df_sampled[[col for col in df_sampled.columns if col.startswith('D')] + ['isFraud']]
df_M = df_sampled[[col for col in df_sampled.columns if col.startswith('M')] + ['isFraud']]
df_V_0to20 = df_sampled[cols_V[:20] + ['isFraud']]
df_V_21to40 = df_sampled[cols_V[20:40] + ['isFraud']]
df_V_41to60 = df_sampled[cols_V[40:60] + ['isFraud']]
df_V_61to80 = df_sampled[cols_V[60:80] + ['isFraud']]
df_V_81to100 = df_sampled[cols_V[80:100] + ['isFraud']]
df_V_101to120 = df_sampled[cols_V[100:120] + ['isFraud']]
df_V_121to140 = df_sampled[cols_V[120:140] + ['isFraud']]
df_V_141to160 = df_sampled[cols_V[140:160] + ['isFraud']]
df_V_161to180 = df_sampled[cols_V[160:] + ['isFraud']]

In [11]:
from utils import delete_multicollinearity

import warnings
warnings.filterwarnings('ignore')

col2keep = ['isFraud']

col2keep += delete_multicollinearity(df_card, 'isFraud', 5).columns.tolist()
col2keep += delete_multicollinearity(df_C, 'isFraud', 5).columns.tolist()
col2keep += delete_multicollinearity(df_D, 'isFraud', 5).columns.tolist()
col2keep += delete_multicollinearity(df_M, 'isFraud', 5).columns.tolist()

Dropped column card6_debit with VIF: 226.5 (3s)
Dropped column card4_nan with VIF: 88.5 (2s)
Dropped column card3 with VIF: 68.2 (2s)
Dropped column card5 with VIF: 23.7 (2s)
Dropped column card4_visa with VIF: 6.5 (1s)
Dropped column C12 with VIF: 4159.0 (4s)
Dropped column C11 with VIF: 1699.0 (3s)
Dropped column C8 with VIF: 1240.1 (2s)
Dropped column C4 with VIF: 552.5 (2s)
Dropped column C10 with VIF: 340.4 (1s)
Dropped column C6 with VIF: 264.4 (1s)
Dropped column C1 with VIF: 180.0 (1s)
Dropped column C2 with VIF: 79.2 (0s)
Dropped column C13 with VIF: 35.3 (0s)
Dropped column C9 with VIF: 8.0 (0s)
Dropped column D2 with VIF: 7.4 (1s)
Dropped column D15 with VIF: 5.6 (0s)
Dropped column M1_nan with VIF: inf (9s)
Dropped column M9_nan with VIF: inf (8s)
Dropped column M2_nan with VIF: inf (7s)
Dropped column M8_nan with VIF: 23104.7 (6s)
Dropped column M5_nan with VIF: 27.2 (4s)
Dropped column M2_T with VIF: 18.6 (4s)
Dropped column M3_nan with VIF: 14.5 (3s)
Dropped column M1_T 

In [12]:
col2keep_V = ['isFraud']

col2keep_V += delete_multicollinearity(df_V_0to20, 'isFraud', 5).columns.tolist()
col2keep_V += delete_multicollinearity(df_V_21to40, 'isFraud', 5).columns.tolist()
col2keep_V += delete_multicollinearity(df_V_41to60, 'isFraud', 5).columns.tolist()
col2keep_V += delete_multicollinearity(df_V_61to80, 'isFraud', 5).columns.tolist()
col2keep_V += delete_multicollinearity(df_V_81to100, 'isFraud', 5).columns.tolist()
col2keep_V += delete_multicollinearity(df_V_101to120, 'isFraud', 5).columns.tolist()
col2keep_V += delete_multicollinearity(df_V_121to140, 'isFraud', 5).columns.tolist()
col2keep_V += delete_multicollinearity(df_V_141to160, 'isFraud', 5).columns.tolist()
col2keep_V += delete_multicollinearity(df_V_161to180, 'isFraud', 5).columns.tolist()

Dropped column V1 with VIF: 5000.4 (11s)
Dropped column V8 with VIF: 348.6 (10s)
Dropped column V9 with VIF: 81.9 (8s)
Dropped column V6 with VIF: 72.4 (7s)
Dropped column V17 with VIF: 47.2 (6s)
Dropped column V2 with VIF: 45.7 (5s)
Dropped column V15 with VIF: 35.5 (4s)
Dropped column V19 with VIF: 33.2 (3s)
Dropped column V4 with VIF: 24.7 (2s)
Dropped column V14 with VIF: 21.5 (2s)
Dropped column V10 with VIF: 19.8 (1s)
Dropped column V13 with VIF: 16.0 (1s)
Dropped column V5 with VIF: 10.1 (1s)
Dropped column V7 with VIF: 7.0 (0s)
Dropped column V25 with VIF: 125.4 (11s)
Dropped column V31 with VIF: 88.1 (9s)
Dropped column V23 with VIF: 57.8 (8s)
Dropped column V21 with VIF: 21.5 (7s)
Dropped column V27 with VIF: 20.7 (6s)
Dropped column V29 with VIF: 20.3 (4s)
Dropped column V37 with VIF: 20.0 (4s)
Dropped column V39 with VIF: 17.2 (3s)
Dropped column V26 with VIF: 15.4 (2s)
Dropped column V34 with VIF: 14.6 (2s)
Dropped column V36 with VIF: 13.3 (1s)
Dropped column V32 with VIF

In [13]:
print(len(col2keep_V))

68


In [14]:
col2keep_V1 = ['isFraud']

df_V1_0to23 = df_sampled[col2keep_V[:23] + ['isFraud']]
df_V1_24to46 = df_sampled[col2keep_V[23:46] + ['isFraud']]
df_V1_46to68 = df_sampled[col2keep_V[46:] + ['isFraud']]

col2keep_V1 += delete_multicollinearity(df_V1_0to23, 'isFraud', 5).columns.tolist()
col2keep_V1 += delete_multicollinearity(df_V1_24to46, 'isFraud', 5).columns.tolist()
col2keep_V1 += delete_multicollinearity(df_V1_46to68, 'isFraud', 5).columns.tolist()

Dropped column V18 with VIF: 14.4 (14s)
Dropped column V11 with VIF: 13.2 (12s)
Dropped column V16 with VIF: 12.2 (11s)
Dropped column V30 with VIF: 10.8 (9s)
Dropped column V43 with VIF: 10.0 (8s)
Dropped column V28 with VIF: 9.5 (7s)
Dropped column V62 with VIF: 8.9 (6s)
Dropped column V49 with VIF: 8.1 (4s)
Dropped column V38 with VIF: 8.0 (4s)
Dropped column V52 with VIF: 5.9 (3s)
Dropped column V95 with VIF: 222992.6 (15s)
Dropped column V279 with VIF: 418.5 (14s)
Dropped column V118 with VIF: 285.5 (12s)
Dropped column V80 with VIF: 16.2 (11s)
Dropped column V121 with VIF: 12.3 (9s)
Dropped column V94 with VIF: 7.4 (8s)
Dropped column V87 with VIF: 7.1 (7s)
Dropped column V293 with VIF: 116.5 (13s)
Dropped column V290 with VIF: 8.5 (12s)
Dropped column V287 with VIF: 7.2 (11s)


In [15]:
print(len(col2keep_V1))

48


In [16]:
col2keep += delete_multicollinearity(df_sampled[col2keep_V1], 'isFraud', 5).columns.tolist()

Dropped column V101 with VIF: 104.4 (96s)
Dropped column V3 with VIF: 37.8 (90s)
Dropped column V305 with VIF: 17.5 (84s)
Dropped column V310 with VIF: 11.2 (81s)
Dropped column V68 with VIF: 10.6 (76s)
Dropped column V83 with VIF: 9.9 (71s)
Dropped column V91 with VIF: 9.4 (66s)
Dropped column V297 with VIF: 9.4 (62s)
Dropped column V99 with VIF: 9.2 (58s)
Dropped column V129 with VIF: 8.0 (55s)
Dropped column V81 with VIF: 8.0 (50s)
Dropped column V56 with VIF: 7.6 (47s)
Dropped column V74 with VIF: 6.9 (44s)
Dropped column V282 with VIF: 5.9 (44s)
Dropped column V75 with VIF: 5.7 (38s)
Dropped column V22 with VIF: 5.6 (42s)


In [17]:
print(len(col2keep))
print(col2keep)

60
['isFraud', 'card1', 'card2', 'card4_discover', 'card4_mastercard', 'card6_credit', 'card6_debit or credit', 'card6_nan', 'C3', 'C5', 'C7', 'C14', 'D1', 'D3', 'D4', 'D5', 'D10', 'D11', 'M3_T', 'M4_M1', 'M4_M2', 'M4_nan', 'M5_T', 'M6_T', 'M6_nan', 'M7_T', 'M7_nan', 'M8_T', 'M9_T', 'V12', 'V20', 'V33', 'V35', 'V40', 'V45', 'V53', 'V70', 'V78', 'V89', 'V98', 'V100', 'V104', 'V130', 'V135', 'V281', 'V283', 'V284', 'V285', 'V286', 'V288', 'V291', 'V296', 'V301', 'V303', 'V311', 'V312', 'V313', 'V314', 'V316', 'V319']


In [18]:
df_sampled = df_train_imputed.sample(frac=0.1, random_state=42)

col2keep_final = ['isFraud'] + delete_multicollinearity(df_sampled[col2keep], 'isFraud', 5).columns.tolist()

Dropped column V135 with VIF: 8.9 (14s)
Dropped column M4_nan with VIF: 8.7 (10s)
Dropped column M7_nan with VIF: 7.5 (12s)
Dropped column V12 with VIF: 5.5 (8s)
Dropped column card2 with VIF: 5.4 (8s)
Dropped column V33 with VIF: 5.3 (9s)


In [19]:
print(len(col2keep_final))
print(col2keep_final)

54
['isFraud', 'card1', 'card4_discover', 'card4_mastercard', 'card6_credit', 'card6_debit or credit', 'card6_nan', 'C3', 'C5', 'C7', 'C14', 'D1', 'D3', 'D4', 'D5', 'D10', 'D11', 'M3_T', 'M4_M1', 'M4_M2', 'M5_T', 'M6_T', 'M6_nan', 'M7_T', 'M8_T', 'M9_T', 'V20', 'V35', 'V40', 'V45', 'V53', 'V70', 'V78', 'V89', 'V98', 'V100', 'V104', 'V130', 'V281', 'V283', 'V284', 'V285', 'V286', 'V288', 'V291', 'V296', 'V301', 'V303', 'V311', 'V312', 'V313', 'V314', 'V316', 'V319']


In [20]:
df_train_final = df_train_imputed[col2keep_final]

In [21]:
from collections import Counter
from imblearn.over_sampling import ADASYN

X_train, y_train = df_train_imputed.drop(columns=['isFraud']), df_train_imputed['isFraud']

ada = ADASYN(random_state=42)
X_train_res, y_train_res = ada.fit_resample(X_train, y_train)

df_train_res = pd.concat([X_train_res, y_train_res], axis=1)

print('Original train dataset shape {}'.format(Counter(y_train)))
print('Resampled train dataset shape {}'.format(Counter(y_train_res)))

Original train dataset shape Counter({0.0: 455895, 1.0: 16537})
Resampled train dataset shape Counter({0.0: 455895, 1.0: 455120})


In [22]:
df_train_res.to_csv('../data/data_preprocessed/data_fraud_train3.csv', index=False)
df_dev[col2keep_final].to_csv('../data/data_preprocessed/data_fraud_dev3.csv', index=False)
df_test[col2keep_final].to_csv('../data/data_preprocessed/data_fraud_test3.csv', index=False)