In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
pd.options.display.max_columns = None

In [4]:
data = pd.read_csv("../datasets/fraud_data.csv")

In [5]:
data.head(5)

Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud
0,04-01-2019 00:58,"""Stokes, Christiansen and Sipes""",grocery_net,14.37,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a3806e984cec6ac0096d8184c64ad3a1,65.654142,-164.722603,1
1,04-01-2019 15:06,Predovic Inc,shopping_net,966.11,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a59185fe1b9ccf21323f581d7477573f,65.468863,-165.473127,1
2,04-01-2019 22:37,Wisozk and Sons,misc_pos,49.61,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,86ba3a888b42cd3925881fa34177b4e0,65.347667,-165.914542,1
3,04-01-2019 23:06,Murray-Smitham,grocery_pos,295.26,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,3a068fe1d856f0ecedbed33e4b5f4496,64.445035,-166.080207,1
4,04-01-2019 23:59,Friesen Lt,health_fitness,18.17,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,891cdd1191028759dc20dc224347a0ff,65.447094,-165.446843,1


In [6]:
data.dtypes

trans_date_trans_time     object
merchant                  object
category                  object
amt                      float64
city                      object
state                     object
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
merch_lat                float64
merch_long               float64
is_fraud                  object
dtype: object

In [7]:
#No Missing values
data.isnull().sum()

trans_date_trans_time    0
merchant                 0
category                 0
amt                      0
city                     0
state                    0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [8]:
#Assuming that, lat and long are the same of merch_lat and merch_long I'll drop it (Simple inspection, similar values)
data.drop(['lat','long'],axis=1,inplace=True)

In [9]:
#Columns to datetime:
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'], format='%d-%m-%Y %H:%M')
data['dob'] = pd.to_datetime(data['dob'], format='%d-%m-%Y')

In [10]:
#Target Column to integer, but first a simple check:
print(np.unique(data[['is_fraud']].values))

#There are some values differents of 0 or 1 so I need to remove them
data = data.loc[data['is_fraud'].isin(["0","1"])]

print(np.unique(data[['is_fraud']].values))

data['is_fraud'] = data['is_fraud'].astype(int)

['0' '0"2019-01-01 00:00:44"' '1' '1"2020-12-24 16:56:24"']
['0' '1']


In [11]:
data.dtypes

trans_date_trans_time    datetime64[ns]
merchant                         object
category                         object
amt                             float64
city                             object
state                            object
city_pop                          int64
job                              object
dob                      datetime64[ns]
trans_num                        object
merch_lat                       float64
merch_long                      float64
is_fraud                          int64
dtype: object

In [12]:
#Feature Engineering
from datetime import date

# In order to avoid data types issues:
today = pd.to_datetime(date.today(), format='%d-%m-%Y')

# New release of numpy doesn't support 'Y' and 'M' so we need to do a little work to arrived to year:
data['owner_age'] = (today - data['dob']) / np.timedelta64(1, 'W')
data['owner_age'] = (data['owner_age'] / 52).astype(int) #Here I´m supposing that, every year has 52 weeks.
data.drop('dob', axis=1, inplace=True)

In [13]:
#Extracting all the information from the transaction date and then drop it:
data['year'] = data['trans_date_trans_time'].dt.year
data['month'] = data['trans_date_trans_time'].dt.month
data['day'] = data['trans_date_trans_time'].dt.day

data['hour'] = data['trans_date_trans_time'].dt.hour
data['min'] = data['trans_date_trans_time'].dt.minute
data['sec'] = data['trans_date_trans_time'].dt.second

data.drop('trans_date_trans_time',axis=1,inplace=True)

In [14]:
#I´ll drop 'merchant' and 'trans_num' because I think there are not useful
data.drop(['merchant','trans_num'],axis=1,inplace=True)

#Also I´ll drop 'city' and 'state' because I think there are redundant data (we already have lat and long)
data.drop(['city','state'],axis=1,inplace=True)

In [15]:
#Encoding
#In this case, I´ll try to avoid use label encoding becuase Machine learning algorithms 
# may misinterpret the integer labels as having mathematical significance
np.unique(data['category'].values) #13 categories

data = pd.get_dummies(data, columns=['category'], dtype='int') #get_dummies ommits one category and by default return boolean.


In [28]:
from sklearn.model_selection import train_test_split

X = data.loc[:, data.columns != 'is_fraud']
y = data['is_fraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=42)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((11555, 26), (11555,), (2889, 26), (2889,))

In [50]:
#For 'job' I want to try something different becuase has a lot of categories:
#print(data['job'].nunique()) #163 categories

from category_encoders.cat_boost import CatBoostEncoder

CBE = CatBoostEncoder()

X_train['job'] = CBE.fit_transform(X_train['job'], y_train)

X_test['job'] = CBE.transform(X_test['job'])




Unnamed: 0,amt,city_pop,job,merch_lat,merch_long,owner_age,year,month,day,hour,min,sec,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
4418,84.93,35705,0.126093,33.562114,-116.745451,68,2020,12,28,2,18,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
907,20.0,5662,0.126093,35.191041,-108.255173,35,2019,12,12,22,11,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2860,3.94,759,0.126093,46.703974,-121.232877,68,2020,12,26,9,25,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
5837,4.69,4878,0.126093,19.43131,-155.021034,57,2020,12,29,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2716,182.15,545147,0.126093,38.398077,-95.400727,36,2020,12,26,3,35,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [53]:
from sklearn.preprocessing import StandardScaler

StdSc = StandardScaler()

X_train = StdSc.fit_transform(X_train)
X_test = StdSc.transform(X_test)


array([[-1.67748727e-01, -2.43119573e-01,  2.83510991e-01, ...,
        -3.30360455e-01, -3.16318070e-01, -1.67133054e-01],
       [-4.50209774e-01, -3.47303110e-01,  2.83510991e-01, ...,
        -3.30360455e-01, -3.16318070e-01, -1.67133054e-01],
       [-5.20074620e-01, -3.64305802e-01,  2.83510991e-01, ...,
        -3.30360455e-01, -3.16318070e-01, -1.67133054e-01],
       ...,
       [ 2.67456722e+00, -9.13568973e-04,  2.83510991e-01, ...,
        -3.30360455e-01,  3.16137488e+00, -1.67133054e-01],
       [-5.14375806e-01, -3.60872666e-01,  2.83510991e-01, ...,
        -3.30360455e-01, -3.16318070e-01, -1.67133054e-01],
       [-2.46227047e-01, -3.58726089e-01,  2.83510991e-01, ...,
        -3.30360455e-01, -3.16318070e-01, -1.67133054e-01]])

In [65]:
# from sklearn.linear_model import LogisticRegression
# from sklearn import metrics

# clf = LogisticRegression(random_state=0).fit(X_train, y_train)
# preds = clf.predict(X_test)

# '''
# AUC, or the Area Under the Receiver Operating Characteristic curve measures how well a binary classifier 
# distinguishes between positive and negative classes. Traditionally, you would plot the ROC curve, 
# and the AUC measures the area under the curve. Higher AUC means better performance, and vice-versa.
# '''

# #ROC and AUC metric:
# fpr, tpr, thresholds = metrics.roc_curve(y_test,preds)
# print(np.round(metrics.auc(fpr, tpr),4)) # 0.8029

0.8029


In [None]:
# from sklearn.naive_bayes import GaussianNB

# nb = GaussianNB()

# nb.fit(X_train, y_train)
# preds = nb.predict(X_test)

# #ROC and AUC metric:
# fpr, tpr, thresholds = metrics.roc_curve(y_test,preds)
# print(np.round(metrics.auc(fpr, tpr),4)) # 0.7569

0.7569


In [66]:
# from sklearn.neighbors import KNeighborsClassifier

# knn = KNeighborsClassifier()

# knn.fit(X_train, y_train)
# preds = knn.predict(X_test)

# #ROC and AUC metric:
# fpr, tpr, thresholds = metrics.roc_curve(y_test,preds)
# print(np.round(metrics.auc(fpr, tpr),4)) # 0.8837


0.8837


In [67]:
# from sklearn.svm import SVC

# svc = SVC()

# svc.fit(X_train, y_train)
# preds = svc.predict(X_test)

# #ROC and AUC metric:
# fpr, tpr, thresholds = metrics.roc_curve(y_test,preds)
# print(np.round(metrics.auc(fpr, tpr),4)) # 0.9379

0.9379


In [68]:
# from sklearn.ensemble import RandomForestClassifier

# rf = RandomForestClassifier()

# rf.fit(X_train, y_train)
# preds = rf.predict(X_test)

# #ROC and AUC metric:
# fpr, tpr, thresholds = metrics.roc_curve(y_test,preds)
# print(np.round(metrics.auc(fpr, tpr),4)) # 0.9996

0.9996


In [71]:
# # Just for fun!
# from sklearn.ensemble import GradientBoostingClassifier

# gb = GradientBoostingClassifier()

# gb.fit(X_train, y_train)
# preds = gb.predict(X_test)

# #ROC and AUC metric:
# fpr, tpr, thresholds = metrics.roc_curve(y_test,preds)
# print(np.round(metrics.auc(fpr, tpr),4)) # 0.9944


0.9944


In [74]:
# Assuming that, we consider 0.85 or above a good performance in test, we will try to optimize the performance of the best algoritms

#NOTE: This part are use of CPU intensely, so if you prefer you can modify n_jobs in RandomizedSearchCV to a possitive number.
from sklearn.model_selection import RandomizedSearchCV

#Around one minute running on premises.

# params = {
#     'C': [0.8, 0.85, 0.90, 0.95, 1.0, 1.05, 1.15, 1.20], #Default: 1.0
#     'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], #Default 'rbf'
# }

# rs = RandomizedSearchCV(
#     estimator = SVC(),
#     param_distributions = params,
#     n_iter = 10,
#     scoring = 'roc_auc',
#     n_jobs = -1,
#     random_state = 42
# )

# rs.fit(X_train, y_train)
# preds = rs.predict(X_test)

# #ROC and AUC metric:
# fpr, tpr, thresholds = metrics.roc_curve(y_test,preds)
# print(np.round(metrics.auc(fpr, tpr),4)) # 0.9456 vs 0.9379 (Model with default hyperparameters)

0.9456


In [75]:
# params = {
#     'n_estimators': [80, 90, 100, 110, 120], # Default= 100
#     'criterion': ['gini', 'entropy', 'log_loss'], #Default 'gini'
#     'min_samples_leaf': [1, 2, 3] #Default 1
# }

# rs = RandomizedSearchCV(
#     estimator = RandomForestClassifier(),
#     param_distributions = params,
#     n_iter = 10,
#     scoring = 'roc_auc',
#     n_jobs = -1,
#     random_state = 42
# )

# rs.fit(X_train, y_train)
# preds = rs.predict(X_test)

# #ROC and AUC metric:
# fpr, tpr, thresholds = metrics.roc_curve(y_test,preds)
# print(np.round(metrics.auc(fpr, tpr),4)) # 0.9996 (Same as the model with default hyperparameters)

0.9996


In [77]:
#More than 2 minutes running on premises.
# params = {
#     'learning_rate': [0.05, 0.01, 0.015, 0.02], #Default 0.1
#     'n_estimators': [80, 90, 100, 110, 120], # Default= 100
#     'min_samples_leaf': [1, 2, 3], #Default 1
#     'max_depth': [3, 5, 7, 9] #Default 3
# }

# rs = RandomizedSearchCV(
#     estimator = GradientBoostingClassifier(),
#     param_distributions = params,
#     n_iter = 10,
#     scoring = 'roc_auc',
#     n_jobs = -1,
#     random_state = 42
# )

# rs.fit(X_train, y_train)
# preds = rs.predict(X_test)

# #ROC and AUC metric:
# fpr, tpr, thresholds = metrics.roc_curve(y_test,preds)
# print(np.round(metrics.auc(fpr, tpr),4)) # 1.00 vs 0.9944(Model with default hyperparameters)

1.0
