In [9]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTEENN

## Import Data

In [10]:
df = pd.read_csv('fraud_data.csv', index_col= 0)
df.head()

Unnamed: 0,amt,lat,long,city_pop,merch_lat,merch_long,is_fraud,age,merchant_count,category_food_dining,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
0,2.86,33.9659,-80.9355,333497,33.986391,-81.200714,0,52,2,0,...,0,0,0,0,0,0,0,0,0,0
1,29.84,40.3207,-110.436,302,39.450498,-109.960431,0,30,1,0,...,0,0,0,1,0,0,0,0,0,0
2,41.28,40.6729,-73.5365,34496,40.49581,-74.196111,0,50,3,0,...,0,0,0,0,0,0,0,0,0,0
3,60.05,28.5697,-80.8191,54767,28.812398,-80.883061,0,33,1,0,...,0,0,0,0,0,0,0,0,0,0
4,3.19,44.2529,-85.017,1126,44.959148,-85.884734,0,65,1,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Create independent value
x = df.drop(['is_fraud'], axis=1)
x

Unnamed: 0,amt,lat,long,city_pop,merch_lat,merch_long,age,merchant_count,category_food_dining,category_gas_transport,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
0,2.86,33.9659,-80.9355,333497,33.986391,-81.200714,52,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,29.84,40.3207,-110.4360,302,39.450498,-109.960431,30,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,41.28,40.6729,-73.5365,34496,40.495810,-74.196111,50,3,0,0,...,0,0,0,0,0,0,0,0,0,0
3,60.05,28.5697,-80.8191,54767,28.812398,-80.883061,33,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3.19,44.2529,-85.0170,1126,44.959148,-85.884734,65,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555714,43.77,40.4931,-91.8912,519,39.946837,-91.333331,54,2,0,0,...,0,0,0,0,0,0,0,0,0,0
555715,111.84,29.0393,-95.4401,28739,29.661049,-96.186633,21,1,0,0,...,0,0,1,0,0,0,0,0,0,0
555716,86.88,46.1966,-118.9017,3684,46.658340,-119.715054,39,1,0,0,...,0,0,0,0,0,0,1,0,0,0
555717,7.99,44.6255,-116.4493,129,44.470525,-117.080888,55,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Create Dependent Value
y = df['is_fraud']
y

0         0
1         0
2         0
3         0
4         0
         ..
555714    0
555715    0
555716    0
555717    0
555718    0
Name: is_fraud, Length: 555719, dtype: int64

## Train Test Split

In [13]:
# Train Test Split
x_train, x_test, y_train, y_test = train_test_split(x,y, train_size=.8, random_state=20)

In [14]:
# Initial Model
model_dt = DecisionTreeClassifier(criterion = 'gini', random_state = 100)

In [15]:
# Fit Model
model_dt.fit(x_train, y_train)

In [16]:
# Create Predictions
y_pred = model_dt.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [17]:
# Check Score
model_dt.score(x_test, y_test)

0.997219822932412

In [18]:
# Check Report
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    110708
           1       0.64      0.67      0.65       436

    accuracy                           1.00    111144
   macro avg       0.82      0.83      0.83    111144
weighted avg       1.00      1.00      1.00    111144



Lower precision due to imbalanced dataset

we must use smoteen

## Smoteen

In [19]:
# Resample the Data
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [20]:
# 80 / 20 Train Test Split
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [21]:
# Train Validation Test Split 70/10/20
xr_train,xr_val,yr_train,yr_val=train_test_split(xr_train, yr_train,train_size=0.875)

In [22]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [23]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.909762141881094
              precision    recall  f1-score   support

           0       0.93      0.89      0.91    106646
           1       0.89      0.93      0.91    109239

    accuracy                           0.91    215885
   macro avg       0.91      0.91      0.91    215885
weighted avg       0.91      0.91      0.91    215885



Accuracy, precision, recall, and f1-score are all good but it can be better if we optimize perameters using validation set

### Cross Validation to optimize hyperperameters

In [24]:
# Create possible values for hyperperameters
max_depth_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
leaf_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [25]:
# Create variables to store best values
best_score = 0
best_params = {'max_depth': None, 'leaf': None}

In [26]:
# Nested for loop to test combinations of possible hyperperameters to find the best values
for max_depth in max_depth_values:
    for leaf in leaf_values:
        model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=max_depth, min_samples_leaf=leaf)
        model_dt_smote.fit(xr_train,yr_train)
        model_score_r = model_dt_smote.score(xr_val, yr_val)

        if model_score_r > best_score:
            best_score = model_score_r
            best_params['max_depth'] = max_depth
            best_params['leaf'] = leaf

In [27]:
# See best values and best score
best_score, best_params

(0.94045005234244, {'max_depth': 10, 'leaf': 2})

### Final Model Using Optimized Hyperpermaters and SMOTEENN

In [32]:
# Apply best hyperpermaters to model
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=10, min_samples_leaf=2)

In [33]:
# Fit model and final steps to see accuracy and report of test data
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.9424647381707854
              precision    recall  f1-score   support

           0       0.97      0.92      0.94    106646
           1       0.92      0.97      0.94    109239

    accuracy                           0.94    215885
   macro avg       0.94      0.94      0.94    215885
weighted avg       0.94      0.94      0.94    215885



In [30]:
# Print confusion matrix
print(metrics.confusion_matrix(yr_test, yr_predict))

[[ 97592   9054]
 [  3368 105871]]


Optimizing the hyperperameters made the accuracy, precision, recall, and f1-score better.

## Pickle

In [31]:
# Pickle the model
import pickle
filename = 'model.sav'
pickle.dump(model_dt_smote, open(filename, 'wb'))