In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE


In [None]:
# Load the training data from data/xente/training.csv, indexed by TransactionId.
# TransactionStartTime is parsed as a datetime, the four listed columns are read
# as pandas categoricals, and CountryCode / CurrencyCode are dropped.
df = pd.read_csv(
    "data/xente/training.csv",
    index_col="TransactionId",
    parse_dates=["TransactionStartTime"],
    dtype=dict.fromkeys(
        ["ProductId", "ProductCategory", "ChannelId", "PricingStrategy"],
        "category",
    ),
)
df = df.drop(columns=["CountryCode", "CurrencyCode"])

In [None]:

## Building baseline model using Naive Bayes with Value, ChannelId and time

# NOTE: kept in this cell so the cell stays runnable on its own; it would
# ideally live in the imports cell at the top of the notebook.
from sklearn.naive_bayes import GaussianNB

# Time of day as the number of seconds since midnight, because Naive Bayes
# needs a numeric feature and cannot deal with a time object.
start = df["TransactionStartTime"].dt
time = start.second + start.minute * 60 + start.hour * 3600

# Feature matrix. The explicit .copy() makes X an independent frame, so the
# column assignments below do not trigger SettingWithCopyWarning and never
# depend on whether pandas handed back a view of df.
X = df[["Value", "ChannelId"]].copy()
# Encode ChannelId by stripping the literal "ChannelId_" prefix and casting the
# remainder to int (an integer code, NOT one-hot / dummy encoding).
# Assumes every value looks like "ChannelId_<number>" - astype raises otherwise.
X["ChannelId"] = X["ChannelId"].str.replace("ChannelId_", "", regex=False).astype("int")
X["time"] = time
y = df["FraudResult"]

# Stratified split so both parts keep the class ratio of y.
# Open question: is the default test set size (test_size=0.25) good for such
# an imbalanced dataset?
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Oversample with SMOTE on the training split only; the test split is left
# untouched so the evaluation below runs on the original class distribution.
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

# Initiate and fit the model on the resampled training data
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

# Predict on the held-out test split
y_pred_base = nb_clf.predict(X_test)

print(classification_report(y_test, y_pred_base))

# TODO: Scaling and transformation where necessary
# TODO: prettify output
# TODO: Write this into a function
# TODO (tomorrow): tweak metric (inconvenience score?)


In [None]:
# Confusion matrix of the baseline predictions on the held-out test split
# (rows: true labels, columns: predicted labels).
confusion_matrix(y_true=y_test, y_pred=y_pred_base)
