In [1]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

sns.set(rc={"figure.figsize": (15, 8)})
df = pd.read_csv("./datasets/fraudTrain.csv")
df.isna().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [2]:
df.drop_duplicates(inplace=True)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [2]:
import datetime as dt

df["age"] = dt.date.today().year - pd.to_datetime(df["dob"]).dt.year
df["hour"] = pd.to_datetime(df["trans_date_trans_time"]).dt.hour
df["day"] = pd.to_datetime(df["trans_date_trans_time"]).dt.dayofweek
df["month"] = pd.to_datetime(df["trans_date_trans_time"]).dt.month

# subset the training data to include only the features that we need
train = df[
    [
        "category",
        "amt",
        "zip",
        "lat",
        "long",
        "city_pop",
        "merch_lat",
        "merch_long",
        "age",
        "hour",
        "day",
        "month",
        "is_fraud",
    ]
]
# convert category to dummy variables
train = pd.get_dummies(train, drop_first=True)
y_train = train["is_fraud"].values
X_train = train.drop("is_fraud", axis="columns").values

In [3]:
# let's do the same to the testing dataset
test = pd.read_csv("./datasets/fraudTest.csv")
test["age"] = dt.date.today().year - pd.to_datetime(test["dob"]).dt.year
test["hour"] = pd.to_datetime(test["trans_date_trans_time"]).dt.hour
test["day"] = pd.to_datetime(test["trans_date_trans_time"]).dt.dayofweek
test["month"] = pd.to_datetime(test["trans_date_trans_time"]).dt.month
test = test[
    [
        "category",
        "amt",
        "zip",
        "lat",
        "long",
        "city_pop",
        "merch_lat",
        "merch_long",
        "age",
        "hour",
        "day",
        "month",
        "is_fraud",
    ]
]
# convert category to dummy variables
test = pd.get_dummies(test, drop_first=True)
y_test = test["is_fraud"].values
X_test = test.drop("is_fraud", axis="columns").values

In [6]:
# logistic regression
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

method = SMOTE()
X_resampled, y_resampled = method.fit_resample(X_train, y_train)
model = LogisticRegression()
model.fit(X_resampled, y_resampled)
predicted = model.predict(X_test)
print("Classification report:\n", classification_report(y_test, predicted))
conf_mat = confusion_matrix(y_true=y_test, y_pred=predicted)
print("Confusion matrix:\n", conf_mat)
print("Share of Non-Fraud in Test Data:", 1 - round(y_test.sum() / len(y_test), 4))

Classification report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97    553574
           1       0.05      0.75      0.10      2145

    accuracy                           0.95    555719
   macro avg       0.53      0.85      0.54    555719
weighted avg       1.00      0.95      0.97    555719

Confusion matrix:
 [[524660  28914]
 [   534   1611]]
Share of Non-Fraud in Test Data: 0.9961


In [8]:
import pickle

with open("model_regression.pkl", "wb") as file:
    pickle.dump(model, file)

In [7]:
# random forest
from sklearn.ensemble import RandomForestClassifier

model2 = RandomForestClassifier(random_state=5)
model2.fit(X_resampled, y_resampled)
predicted = model2.predict(X_test)
print("Classification report:\n", classification_report(y_test, predicted))
conf_mat = confusion_matrix(y_true=y_test, y_pred=predicted)
print("Confusion matrix:\n", conf_mat)
print("Share of Non-Fraud in Test Data:", 1 - round(y_test.sum() / len(y_test), 4))

Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.89      0.72      0.80      2145

    accuracy                           1.00    555719
   macro avg       0.94      0.86      0.90    555719
weighted avg       1.00      1.00      1.00    555719

Confusion matrix:
 [[553381    193]
 [   593   1552]]
Share of Non-Fraud in Test Data: 0.9961


In [9]:
with open("model_randforest.pkl", "wb") as file:
    pickle.dump(model2, file)

In [42]:
examples = []
examples.append(X_test[3])
examples.append(X_test[553])
examples.append(X_test[1767])
examples.append(X_test[2225])
examples.append(X_test[517274])
print(examples)


[array([60.05, 32780, 28.5697, -80.8191, 54767, 28.812398, -80.883061, 37,
       12, 6, 6, False, False, False, False, False, False, False, False,
       True, False, False, False, False], dtype=object), array([105.1, 38922, 33.9215, -89.6782, 3451, 34.826923, -89.560916, 40,
       15, 6, 6, False, False, False, False, False, False, True, False,
       False, False, False, False, False], dtype=object), array([780.52, 53803, 42.5545, -90.3508, 1306, 42.461127000000005,
       -91.147148, 66, 22, 6, 6, False, False, False, False, False, False,
       False, True, False, False, False, False, False], dtype=object), array([4.32, 46702, 40.8618, -85.6067, 2304, 40.99301, -85.346478, 69, 1,
       0, 6, False, False, False, False, False, False, False, False,
       False, False, False, True, False], dtype=object), array([868.09, 38668, 34.6323, -89.8855, 14462, 34.091227, -90.390612, 65,
       22, 1, 12, False, False, False, False, False, False, False, False,
       False, False, False, Tr

In [43]:
import pickle as pkl

with open("testset.pkl", "wb") as file:
    pkl.dump(examples, file)
