In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

%load_ext autoreload
%autoreload 2
from cda_util import *

# Cyber Data Analytics - Lab 1
Cheatsheet: https://gist.github.com/agalea91/545e2337b94d965be788f7db18b1f497

**Data description**
 - `issuercountrycode`: country where the card was issued
 - `txvariantcode`: the card type that was used (subbrand of visa or master card)
 - `bin`: card issuer identifier
 - `amount`/`currencycode`: transaction amount in minor units of the given currency (so an `amount` of 100 with `currencycode` EUR means 100 euro cents, i.e. 1 EUR)
 - `shoppercountrycode`: IP address country
 - `shopperinteraction`: Ecommerce if it was an online transaction, ContAuth if it was a (monthly) subscription
 - `simple_journal`: Payment status. Authorised = “transaction approved and no fraud reported”, Refused = “transaction was declined, can be fraud, but can also be insufficient funds, etc”, Chargeback = “transaction was approved, but turned out to be fraud”
 - `bookingdate`: only relevant for Chargebacks. Time stamp when the chargeback was reported. During simulation you may only use this knowledge after this date. So for example if on an email address a transaction ended in a chargeback, you can only block that email address after the booking date of the chargeback.
 - `cardverificationcodesupplied`: did the shopper provide their 3-digit CVC/CVV2 code?
 - `cvcresponsecode`: Validation result of the CVC/CVV2 code: 0 = Unknown, 1=Match, 2=No Match, 3-6=Not checked
 - `creationdate`: Date of transaction
 - `accountcode`: merchant’s webshop
 - `mail_id`: Email address
 - `ip_id`: IP address
 - `card_id`: Card number

**Variables to handle carefully:**

`simple_journal` /
`bookingdate` /
`creationdate`

## 1. Prep data

In [3]:
# Read the student-case transactions through the cda_util loader.
df = load_dataset('data/data_for_student_case.csv')
# Keep an independent snapshot of the freshly loaded frame; later cells
# overwrite columns of `df`, while `df_raw` stays as loaded.
df_raw = df.copy(deep=True)
# Preview the first two rows as the cell output.
df.head(n=2)

# Feature columns, for reference:
# ['issuercountrycode', 'txvariantcode', 'bin', 'amount', 'currencycode', 'shoppercountrycode', 'shopperinteraction', 'cardverificationcodesupplied', 'cvcresponsecode', 'accountcode', 'mail_id', 'ip_id', 'card_id']

Unnamed: 0_level_0,issuercountrycode,txvariantcode,bin,amount,currencycode,shoppercountrycode,shopperinteraction,cardverificationcodesupplied,cvcresponsecode,accountcode,mail_id,ip_id,card_id,labels
txid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,MX,mccredit,530056.0,64800.0,MXN,MX,Ecommerce,True,0,MexicoAccount,email68370,ip111778,card184798,1.0
2,MX,mccredit,547046.0,44900.0,MXN,MX,Ecommerce,True,0,MexicoAccount,email101299,ip78749,card151595,1.0


In [8]:
# Encode categorical variables as integer labels.
# N.B.!!! Use categorical --> ordinal ONLY for decision trees, else Minhash
# (the integer codes impose an arbitrary order that distance-based and
# linear models would wrongly treat as meaningful).
CATEGORICAL_COLUMNS = [
    # low-cardinality transaction attributes
    'issuercountrycode',
    'txvariantcode',
    'currencycode',
    'shoppercountrycode',
    'shopperinteraction',
    'accountcode',
    # high-cardinality identifiers
    'mail_id',
    'ip_id',
    'card_id',
]

# Always encode from the untouched `df_raw` snapshot rather than from `df`
# itself: on a fresh run both hold the same values, but re-running this cell
# then never feeds already-encoded values back into the encoder.
for column in CATEGORICAL_COLUMNS:
    df[column] = label_encode(df_raw[column])

# df['cardverificationcodesupplied'], cardverificationcodesupplied_map = cat_to_ord(df['cardverificationcodesupplied'])

In [9]:
# TODO: currency converter

NameError: name 'c' is not defined

# Explore

In [5]:
# Per-class summary of the data set (the rendered table has `count` and `pct`
# columns for Legitimate / Fraud / Total).
df_stats = get_class_balance(df)

# Show floats with three decimals for this one display only; the pandas
# option is restored as soon as the `with` block exits.
with pd.option_context('display.float_format', "{:.3f}".format):
    display(df_stats)

Unnamed: 0,count,pct
,,
Legitimate,236691.0,99.854
Fraud,345.0,0.146
Total,237036.0,100.0


# 2. Visualize

In [6]:
# Class imbalance

# Time series

# High risk days?

# Monetary value

# Correlation heatmap

# t-SNE for visualization?

# 3. Train model

In [7]:
# Separate the feature matrix from the target labels.
X_all, Y_all = split_labels(df)

# Columns deliberately left out of the feature matrix for this first model.
# Going by the column listing printed when the data was loaded, this leaves
# `bin`, `mail_id`, `ip_id` and `card_id` -- TODO confirm against X_all.columns.
EXCLUDED_FEATURES = [
    'issuercountrycode',
    'txvariantcode',
    'amount',
    'currencycode',
    'shoppercountrycode',
    'shopperinteraction',
    'cardverificationcodesupplied',
    'cvcresponsecode',
    'accountcode',
]

# Reassign instead of using inplace=True: the object returned by
# split_labels() may share data with `df`, and an in-place drop on it can
# trigger SettingWithCopyWarning or leak the change into other variables.
X_all = X_all.drop(columns=EXCLUDED_FEATURES)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing. Stratifying on the labels keeps the
# class ratio the same in both partitions, and the fixed seed makes the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_all,
    Y_all,
    test_size=0.33,
    stratify=Y_all,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler, Normalizer, scale
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split, KFold, StratifiedShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression


# Logistic regression tuned with a cross-validated grid search.
from sklearn.pipeline import Pipeline

# Candidate values for the inverse regularisation strength C: 1e-3 ... 1e2.
# The `logitreg__` prefix routes the parameter to the pipeline step of that name.
logitreg_parameters = {'logitreg__C': np.power(10.0, np.arange(-3, 3))}

# warm_start is intentionally not set: GridSearchCV clones the estimator for
# every candidate/fold, so there is never a previous solution to start from.
logitreg = LogisticRegression(solver = 'lbfgs', verbose = 3)

# Standardise the features before the classifier. The inputs are raw label
# codes with very different ranges, which makes lbfgs converge poorly.
# Keeping the scaler inside the pipeline means it is re-fitted on the
# training part of each CV fold only, so no held-out data leaks into it.
logit_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logitreg', logitreg),
])

# ROC AUC is used for model selection because the classes are heavily
# imbalanced and plain accuracy would be dominated by the majority class.
logModel = GridSearchCV(logit_pipeline, param_grid = logitreg_parameters, scoring = 'roc_auc')
logModel.fit(X_train, Y_train)

# Predictions of the best (refitted) pipeline on the held-out test set.
predictions = logModel.predict(X_test)

In [39]:
from sklearn.neighbors import KNeighborsClassifier

# 1-nearest-neighbour baseline; fit() returns the estimator itself, so the
# construction and the training can be chained.
neigh = KNeighborsClassifier(n_neighbors=1).fit(X_train, Y_train)

# Note: this rebinds `predictions`, replacing any earlier model's output.
predictions = neigh.predict(X_test)

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# # Instantiate model with 1000 decision trees
# rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)

# # Train the model on training data
# rf.fit(X_train, Y_train);

# predictions = rf.predict(X_test)

In [40]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes; evaluates
# whichever model most recently assigned `predictions`.
confusion_matrix(y_true=Y_test, y_pred=predictions)

array([[78014,    94],
       [   97,    17]], dtype=int64)

# 4. Classification