In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

%load_ext autoreload
%autoreload 2

from cda_prep import *
from cda_util import *
from cda_models import *

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Cyber Data Analytics - Lab 1
Cheatsheet: https://gist.github.com/agalea91/545e2337b94d965be788f7db18b1f497

**Data description**
 - `issuercountrycode`: country where the card was issued
 - `txvariantcode`: the card type that was used (subbrand of visa or master card)
 - `bin`: card issuer identifier
 - `amount` / `currencycode`: transaction amount in minor units of the given currency (so an amount of 100 in EUR means 100 euro cents, i.e. 1 EUR)
 - `shoppercountrycode`: IP address country
 - `shopperinteraction`: Ecommerce if it was an online transaction, ContAuth if it was a (monthly) subscription
 - `simple_journal`: Payment status. Authorised = “transaction approved and no fraud reported”, Refused = “transaction was declined, can be fraud, but can also be insufficient funds, etc”, Chargeback = “transaction was approved, but turned out to be fraud”
 - `bookingdate`: only relevant for Chargebacks. Time stamp when the chargeback was reported. During simulation you may only use this knowledge after this date. So for example if on an email address a transaction ended in a chargeback, you can only block that email address after the booking date of the chargeback.
 - `cardverificationresponsesupplied`: did the shopper provide their 3-digit CVC/CVV2 code?
 - `cvcresponsecode`: Validation result of the CVC/CVV2 code: 0 = Unknown, 1 = Match, 2 = No Match, 3-6 = Not checked
 - `creationdate`: Date of transaction
 - `accountcode`: merchant’s webshop
 - `mail_id`: Email address
 - `ip_id`: IP address
 - `card_id`: Card number

**Variables to handle carefully:**

`simple_journal` /
`bookingdate` /
`creationdate`

### 1. Get data

In [None]:
# Load the student-case CSV. get_data hands back two frames; they are bound to
# `df` and `df_raw` in that order.
student_case_csv = 'data/data_for_student_case.csv'
df, df_raw = get_data(student_case_csv)

In [None]:
# Preview the first three rows of each frame, each under its printed caption.
for caption, frame in (("Raw dataset:", df_raw), ("Processed dataset:", df)):
    print(caption)
    display(frame.head(3))

### 2. Explore

In [None]:
def _three_decimals(value):
    """Format a number with exactly three digits after the decimal point."""
    return "%.3f" % value


# Statistics produced by get_class_balance for the processed frame.
df_stats = get_class_balance(df)

# The float formatter is scoped to this block, so other tables keep the
# default pandas float rendering.
with pd.option_context('display.float_format', _three_decimals):
    display(df_stats)

### 3. SMOTE Analysis

In [None]:
# Split the processed frame into features (X) and labels (y).
X, y = split_labels(df)

# Drop the columns that are excluded from the classifier input.
# X is rebound to the frame returned by drop() instead of using inplace=True:
# the object returned by split_labels is left untouched, so anything it may
# share with `df` is not modified behind the reader's back.
X = X.drop(columns=['amount_dollar', 'bookingdate', 'creationdate'])

In [None]:
# Peek at the first five rows of the feature frame.
X.head(n=5)

In [None]:
# Each hyperparameter is named once so the value given to the estimator and the
# value reported in the results table cannot drift apart.
N_NEIGHBORS = 5
N_ESTIMATORS = 100
# Seed for the random forest so its scores are repeatable across kernel
# restarts. NOTE(review): any randomness inside the cross-validation / SMOTE
# helpers is not controlled from here - confirm whether they need a seed too.
SEED = 42

# (classifier, parameter value shown in the 'Param' column) pairs.
clf_smote_test = [
    (KNeighborsClassifier(n_neighbors=N_NEIGHBORS), N_NEIGHBORS),
    (LogisticRegression(solver='lbfgs'), ""),
    (RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=SEED), N_ESTIMATORS),
]

In [None]:
RESULT_COLUMNS = ['Classifier', 'Param', 'SMOTE?', 'Mean AUC', 'Std AUC',
                  'Precision', 'Recall', 'F1-score', 'Confmat']

# Fifth positional argument of roc_cross_val.
# NOTE(review): presumably the number of cross-validation folds - confirm.
N_SPLITS = 10

# (preparation function, label string passed to roc_cross_val, 'SMOTE?' flag).
# Order matters: for every classifier the run without SMOTE comes first.
smote_variants = [
    (prepare_null, "w/o SMOTE", False),
    (prepare_smote_analysis, "w/ SMOTE", True),
]

# Collect one record per (classifier, variant) run and build the frame once,
# instead of enlarging a DataFrame row by row with a hand-maintained counter.
rows = []
for clf, param in clf_smote_test:
    for prepare, label, uses_smote in smote_variants:
        confmat, precision, recall, f1, mean_auc, std_auc = roc_cross_val(
            X, y, clf, prepare, N_SPLITS, label
        )
        rows.append({
            'Classifier': type(clf).__name__,
            'Param': param,
            'SMOTE?': uses_smote,
            'Mean AUC': mean_auc,
            'Std AUC': std_auc,
            'Precision': precision,
            'Recall': recall,
            'F1-score': f1,
            'Confmat': confmat,
        })

# Passing `columns` fixes the column order and keeps the schema even when no
# classifier was evaluated.
results = pd.DataFrame(rows, columns=RESULT_COLUMNS)
    

In [None]:
# Show the summary table: one row per (classifier, with/without SMOTE) run.
results

In [None]:
# Render the confusion matrix of every run, in the row order of `results`.
# A plain loop is used because display() is called purely for its side effect;
# routing it through DataFrame.apply also produced a Series of None as the
# cell's output.
for confmat in results['Confmat']:
    display(confmat)