In [2]:
# Standard Imports
import pandas as pd
import numpy as np
import os
from datetime import datetime

# Model evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, classification_report, confusion_matrix, roc_curve, precision_recall_curve, fbeta_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Sampling for Imbalanced Data
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Feature Scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Explanation Tools
#import shap
#from lime.lime_tabular import LimeTabularExplainer

In [3]:
# Location of the transactions CSV. The FRAUD_DATA_PATH environment variable
# overrides the default so the notebook can be re-run on machines other than
# the one it was written on; when it is unset, the original path is used
# unchanged.
DATA_PATH = os.environ.get(
    "FRAUD_DATA_PATH", "C:/Users/janec/Downloads/updated_dataset.csv"
)

# Load the full dataset into memory; every later cell reads from `df`.
df = pd.read_csv(DATA_PATH)

# Bare last expression -> rich (truncated) preview of the first and last rows.
df

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud,VPN used,usual IP Address,known device,time to last transaction
0,57.877857,0.311140,1.945940,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.454267
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.134098
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.136783
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,4.328188
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.309916
...,...,...,...,...,...,...,...,...,...,...,...,...
999995,2.207101,0.112651,1.626798,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.652585
999996,19.872726,2.683904,2.778303,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.849675
999997,2.914857,1.472687,0.218075,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,4.168035
999998,4.258729,0.242023,0.475822,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.880779


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric
# column -- a quick check of value ranges and of the spread in the features.
df.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud,VPN used,usual IP Address,known device,time to last transaction
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,26.628792,5.036519,1.824182,0.881536,0.350399,0.100608,0.650552,0.087403,0.35,0.325165,0.325656,3.501953
std,65.390784,25.843093,2.799589,0.323157,0.477095,0.300809,0.476796,0.282425,0.47697,0.468437,0.46862,2.020649
min,0.004874,0.000118,0.004399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8e-06
25%,3.878008,0.296671,0.475673,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.754444
50%,9.96776,0.99865,0.997717,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.501074
75%,25.743985,3.355748,2.09637,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,5.253067
max,10632.723672,11851.104565,267.802942,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.999996


In [5]:
# Column names, non-null counts and dtypes -- used to check for missing values
# and unexpected column types before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 12 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   distance_from_home              1000000 non-null  float64
 1   distance_from_last_transaction  1000000 non-null  float64
 2   ratio_to_median_purchase_price  1000000 non-null  float64
 3   repeat_retailer                 1000000 non-null  float64
 4   used_chip                       1000000 non-null  float64
 5   used_pin_number                 1000000 non-null  float64
 6   online_order                    1000000 non-null  float64
 7   fraud                           1000000 non-null  float64
 8   VPN used                        1000000 non-null  float64
 9   usual IP Address                1000000 non-null  float64
 10  known device                    1000000 non-null  float64
 11  time to last transaction        1000000 non-null  float64
dtypes: float64(12)

In [6]:
# Class balance of the target column: number of rows per `fraud` value.
# The recorded output shows roughly 8.7% of rows with fraud == 1.0, i.e. the
# two classes are far from evenly represented.
df["fraud"].value_counts()

fraud
0.0    912597
1.0     87403
Name: count, dtype: int64