In [2]:
#import relevant libaries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
import datetime as dt

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.utils import resample
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split



In [3]:
os.chdir("/Users/michael/Documents/Aston/FYP/Dataset")
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
#combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
#export to csv
combined_csv.to_csv( "combined_csv.csv", index=False, encoding='utf-8-sig')

In [6]:
#load dataset and show
df = pd.read_csv("/Users/michael/Documents/Aston/FYP/Dataset/combined_csv.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [7]:
df.dtypes

Unnamed: 0                 int64
trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [8]:
from sklearn.preprocessing import LabelEncoder

#The following column deals with data preprocessing involving:
#   -Type conversions
#   -Removal of unnecessary data
#   -Reformatting of data to more model-friendly format

#create a LabelEncoder which matches strings to a integer label in order to convert non-numeric features to numerical format for the models
enc = LabelEncoder()

#drop unnecessary columns
dfn = df.drop(['Unnamed: 0','first','last','street','state','trans_num'],axis=1)

#convert following columns using LabelEncoder

enc.fit(dfn['gender'])
dfn['gender'] = enc.transform(dfn['gender'])

enc.fit(dfn['city'])
dfn['city'] = enc.transform(dfn['city'])

enc.fit(dfn['job'])
dfn['job'] = enc.transform(dfn['job'])


#The following functionality aims to seperate the datetime column
#so that it can be processed as multiple features
#   -This can offer greater performance
#   -Allow the identification of trends with regards to specific days, time of day, etc

#define several empty lists for following functionality
dfn['age']=dt.date.today().year-pd.to_datetime(df['dob']).dt.year
dfn['hour']=pd.to_datetime(df['trans_date_trans_time']).dt.hour
dfn['day']=pd.to_datetime(df['trans_date_trans_time']).dt.dayofweek
dfn['month']=pd.to_datetime(df['trans_date_trans_time']).dt.month

dfn=dfn[['gender','job','city','amt','zip','lat','long','city_pop','merch_lat','merch_long','age','hour','day','month','is_fraud']]

print(dfn.dtypes)
dfn.head()

gender          int64
job             int64
city            int64
amt           float64
zip             int64
lat           float64
long          float64
city_pop        int64
merch_lat     float64
merch_long    float64
age             int64
hour            int64
day             int64
month           int64
is_fraud        int64
dtype: object


Unnamed: 0,gender,job,city,amt,zip,lat,long,city_pop,merch_lat,merch_long,age,hour,day,month,is_fraud
0,0,372,532,4.97,28654,36.0788,-81.1781,3495,36.011293,-82.048315,35,0,1,1,0
1,0,431,619,107.23,99160,48.8878,-118.2105,149,49.159047,-118.186462,45,0,1,1,0
2,1,308,474,220.11,83252,42.1808,-112.262,4154,43.150704,-112.154481,61,0,1,1,0
3,1,330,84,45.0,59632,46.2306,-112.1138,1939,47.034331,-112.561071,56,0,1,1,0
4,1,116,217,41.96,24433,38.4207,-79.4629,99,38.674999,-78.632459,37,0,1,1,0


In [9]:
y=dfn['is_fraud'].values
X=dfn.drop("is_fraud", axis='columns').values

In [11]:
#split the data into features and labels

#labels (y) are the what we are trying to predict and as such, is the 'is_fraud' column
y=dfn['is_fraud'].values

#format data and for features (X) remove 'is_fraud' column since this is what we are trying to predict
X=dfn.drop(['is_fraud'], axis='columns').values

print(dfn['is_fraud'].value_counts())

0    3685486
1      19302
Name: is_fraud, dtype: int64


In [12]:
#split data for training and testing
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [13]:
#Call function on results of Logistic Regression
Y_predL = LogisticRegression(max_iter=1000, class_weight='balanced').fit(X_train, Y_train).predict(X_test)
print('Classification report:\n', classification_report(Y_test, Y_predL))

Classification report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97   1216168
           1       0.08      0.75      0.14      6413

    accuracy                           0.95   1222581
   macro avg       0.54      0.85      0.56   1222581
weighted avg       0.99      0.95      0.97   1222581



In [14]:
#Call function on results of Gaussian Naive Bayes Algorithms
Y_predG = GaussianNB().fit(X_train, Y_train).predict(X_test)
print('Classification report:\n', classification_report(Y_test, Y_predG))

Classification report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00   1216168
           1       0.30      0.47      0.37      6413

    accuracy                           0.99   1222581
   macro avg       0.65      0.73      0.68   1222581
weighted avg       0.99      0.99      0.99   1222581



In [15]:
# Separate majority and minority classes
df_majority = dfn[dfn['is_fraud'] == 0]
df_minority = dfn[dfn['is_fraud'] == 1]
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,      # sample with replacement
                                 n_samples=1842743)   # to match majority class
                              
 
#Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
print(df_upsampled['is_fraud'].value_counts())

y1 = df_upsampled['is_fraud'].values
X1=df_upsampled.drop(['is_fraud'], axis=1).values

0    3685486
1    1842743
Name: is_fraud, dtype: int64


In [16]:
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, y1, test_size = 0.33, random_state = 42)

In [17]:
Y1_predL = LogisticRegression(max_iter=10000).fit(X1_train, Y1_train).predict(X1_test)
print('Classification report:\n', classification_report(Y1_test, Y1_predL))

Classification report:
               precision    recall  f1-score   support

           0       0.87      0.98      0.92   1216213
           1       0.94      0.70      0.80    608103

    accuracy                           0.88   1824316
   macro avg       0.90      0.84      0.86   1824316
weighted avg       0.89      0.88      0.88   1824316



In [18]:
Y1_predG = GaussianNB().fit(X1_train, Y1_train).predict(X1_test)
print('Classification report:\n', classification_report(Y1_test, Y1_predG))

Classification report:
               precision    recall  f1-score   support

           0       0.80      0.98      0.88   1216213
           1       0.94      0.52      0.67    608103

    accuracy                           0.83   1824316
   macro avg       0.87      0.75      0.78   1824316
weighted avg       0.85      0.83      0.81   1824316



In [19]:
# Separate majority and minority classes
df_majority = dfn[dfn['is_fraud'] == 0]
df_minority = dfn[dfn['is_fraud'] == 1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,      # sample with replacement
                                 n_samples=92246)   # to match majority class

#Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
print(df_upsampled['is_fraud'].value_counts())

y2 = df_upsampled['is_fraud']
X2=df_upsampled.drop(['is_fraud'], axis=1)

0    3685486
1      92246
Name: is_fraud, dtype: int64


In [20]:
X2_train, X2_test, Y2_train, Y2_test = train_test_split(X2, y2, test_size = 0.2, random_state = 42)

In [21]:
Y2_predL = LogisticRegression(max_iter=10000).fit(X2_train, Y2_train).predict(X2_test)
print('Classification report:\n', classification_report(Y2_test, Y2_predL))

Classification report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99    737302
           1       0.69      0.28      0.40     18245

    accuracy                           0.98    755547
   macro avg       0.84      0.64      0.69    755547
weighted avg       0.98      0.98      0.98    755547



In [22]:
Y2_predG = GaussianNB().fit(X2_train, Y2_train).predict(X2_test)
print('Classification report:\n', classification_report(Y2_test, Y2_predG))

Classification report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99    737302
           1       0.61      0.48      0.54     18245

    accuracy                           0.98    755547
   macro avg       0.80      0.74      0.76    755547
weighted avg       0.98      0.98      0.98    755547



In [23]:
# Perform random over-sampling on dataset

ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)
X3_train, X3_test, Y3_train, Y3_test = train_test_split(X_resampled, y_resampled, test_size = 0.3, random_state = 42)

In [24]:
# train and predict features using logistic regression model on over-sampled dataset

Y3_predL = LogisticRegression(max_iter=10000).fit(X3_train, Y3_train).predict(X3_test)
print('Classification report:\n', classification_report(Y3_test, Y3_predL))

Classification report:
               precision    recall  f1-score   support

           0       0.80      0.95      0.87   1105312
           1       0.94      0.76      0.84   1105980

    accuracy                           0.85   2211292
   macro avg       0.87      0.85      0.85   2211292
weighted avg       0.87      0.85      0.85   2211292



In [25]:
# Perform random under-sampling on dataset

rus = RandomUnderSampler(random_state=0)
X2_resampled, y2_resampled = rus.fit_resample(X, y)
X4_train, X4_test, Y4_train, Y4_test = train_test_split(X2_resampled, y2_resampled, test_size = 0.3, random_state = 42)

In [26]:
# train and predict features using logistic regression model on under-sampled dataset

Y4_predL = LogisticRegression(max_iter=10000).fit(X4_train, Y4_train).predict(X4_test)
print('Classification report:\n', classification_report(Y4_test, Y4_predL))

Classification report:
               precision    recall  f1-score   support

           0       0.80      0.95      0.87      5791
           1       0.94      0.76      0.84      5791

    accuracy                           0.86     11582
   macro avg       0.87      0.86      0.86     11582
weighted avg       0.87      0.86      0.86     11582

