In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score
RSEED=42


In [2]:
# Load data
df = pd.read_csv('data/xente/training.csv', parse_dates=['TransactionStartTime'], 
index_col='TransactionId').drop(['CountryCode', 'CurrencyCode'], axis=1)

# create time of day column in number of seconds since midnight
df["time_of_day"] = df["TransactionStartTime"].dt.second + df["TransactionStartTime"].dt.minute * 60 + df["TransactionStartTime"].dt.hour * 3600

# select columns for base model
base_df = df[["Value", "time_of_day", "FraudResult"]]

In [3]:

# Creating separate lists for numerical and categorical predictors 
cat_features = list(base_df.columns[base_df.dtypes==object])
num_features = list(base_df.columns[base_df.dtypes!=object])
num_features.remove('FraudResult')

# Define predictors and target variable
X = base_df[["Value", "time_of_day"]]
y = base_df['FraudResult']

# Split into train and test set 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RSEED, stratify=y)

# implement smote
sm = SMOTE(random_state = 42)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [4]:

# Pipline for numerical features
# Initiating Pipeline and calling one step after another
# each step is built as a list of (key, value)
# key is the name of the processing step
# value is an estimator object (processing step)
num_pipeline = Pipeline([
    ('imputer_num', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

# Pipeline for categorical features 
cat_pipeline = Pipeline([
    ('imputer_cat', SimpleImputer(strategy='constant', fill_value='missing')),
    ('1hot', OneHotEncoder(handle_unknown='ignore'))
])

# Complete pipeline for numerical and categorical features
# 'ColumnTransformer' applies transformers (num_pipeline/ cat_pipeline)
# to specific columns of an array or DataFrame (num_features/cat_features)
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features)#,
    #('cat', cat_pipeline, cat_features)
])

# Building a full pipeline with the preprocessor and a Naive Bayes Classifier
pipe_logreg = Pipeline([
    ('preprocessor', preprocessor),
    ('logreg', LogisticRegression(max_iter=1000))
])

In [5]:
# Making predictions on the training set using cross validation as well as calculating the probabilities
# cross_val_predict expects an estimator (model), X, y and nr of cv-splits (cv)
y_train_predicted = cross_val_predict(pipe_logreg, X_train, y_train, cv=5)

In [6]:
# Calculating the accuracy for the LogisticRegression Classifier 
print('Cross validation scores:')
print('-------------------------')
print("Accuracy: {:.2f}".format(accuracy_score(y_train, y_train_predicted)))
print("Recall: {:.2f}".format(recall_score(y_train, y_train_predicted)))
print("F1-score: {:.2f}".format(f1_score(y_train, y_train_predicted)))

Cross validation scores:
-------------------------
Accuracy: 0.98
Recall: 0.97
F1-score: 0.98


In [7]:
confusion_matrix(y_train, y_train_predicted)

array([[75983,   392],
       [ 2017, 74358]])

In [8]:

y_test_predicted = cross_val_predict(pipe_logreg, X_test, y_test, cv=5)

# Calculating the accuracy for the LogisticRegression Classifier 
print('Cross validation scores:')
print('-------------------------')
print("Accuracy: {:.2f}".format(accuracy_score(y_test, y_test_predicted)))
print("Recall: {:.2f}".format(recall_score(y_test, y_test_predicted)))
print("F1-score: {:.2f}".format(f1_score(y_test, y_test_predicted)))

Cross validation scores:
-------------------------
Accuracy: 1.00
Recall: 0.33
F1-score: 0.43


In [9]:
confusion_matrix(y_test, y_test_predicted)

array([[19086,     8],
       [   26,    13]])