In [22]:
########### Black Box #########

In [2]:
#Import Libraries
import os
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score



In [3]:
##Prepocessing stage
#Remove refused transactions from dataset
cwd = os.getcwd()
dataFrame = pd.read_csv(cwd+"\data_for_student_case.csv")
dataFrame = dataFrame.loc[dataFrame['simple_journal'] != 'Refused']
dataFrame.info()
#convert booking date and creation date to DataTime
dates=['bookingdate', 'creationdate']
for col in dates:
    dataFrame[col] = pd.to_datetime(dataFrame.bookingdate, format='%Y-%m-%d %H:%M:%S', errors='coerce')
#Handle the following attributes as categorical
categoricalAtrr=['issuercountrycode', 'txvariantcode', 'currencycode', 'shoppercountrycode', 'shopperinteraction',
               'cardverificationcodesupplied', 'cvcresponsecode', 'accountcode']
for col in categoricalAtrr:
    dataFrame[col] = dataFrame[col].astype('category')
def CurerncyConverter(currencyColumn):
    coversion_dict = {'SEK':0.0858,'MXN':0.038011,'AUD':0.556,'NZD':0.514,'GBP':1}
    currency = currencyColumn['currencycode']
    amount = currencyColumn['amount']
    return round(amount * coversion_dict[currency] / 100)
dataFrame['convertedAmount'] = dataFrame.apply(lambda x: CurerncyConverter(x),axis=1)
##Preprocessing stage finished

<class 'pandas.core.frame.DataFrame'>
Int64Index: 237036 entries, 0 to 290381
Data columns (total 17 columns):
txid                            237036 non-null int64
bookingdate                     237036 non-null object
issuercountrycode               237032 non-null object
txvariantcode                   237036 non-null object
bin                             237036 non-null float64
amount                          237036 non-null float64
currencycode                    237036 non-null object
shoppercountrycode              236694 non-null object
shopperinteraction              237036 non-null object
simple_journal                  237036 non-null object
cardverificationcodesupplied    223842 non-null object
cvcresponsecode                 237036 non-null int64
creationdate                    237036 non-null object
accountcode                     237036 non-null object
mail_id                         237036 non-null object
ip_id                           237036 non-null object
card_id  

In [32]:
# Choose the most discriminative features
dataFrameSubset = dataFrame[['issuercountrycode', 'txvariantcode', 'convertedAmount', 'currencycode', 'shoppercountrycode',
                  'shopperinteraction', 'cardverificationcodesupplied', 'cvcresponsecode', 'accountcode',
                   'simple_journal']]

In [33]:
#ad-hoc endoding of features
dataFrameSubset.loc[dataFrameSubset.simple_journal == 'Chargeback', 'simple_journal'] = 1
dataFrameSubset.loc[dataFrameSubset.simple_journal == 'Settled', 'simple_journal'] = 0
dataFrameSubset['simple_journal'] = dataFrameSubset['simple_journal'].astype('int')
labels = dataFrameSubset.simple_journal
features = dataFrameSubset.drop('simple_journal', axis=1)
features = pd.get_dummies(features)
features.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


<class 'pandas.core.frame.DataFrame'>
Int64Index: 237036 entries, 0 to 290381
Columns: 292 entries, convertedAmount to accountcode_UKAccount
dtypes: int64(1), uint8(291)
memory usage: 69.4 MB


In [34]:
#Perform 10-fold cross validation and calculate evaluation metrics for both SMOTED and UNSMOTED case
def cross_validation_10_fold(classifier, features, labels):

    
    k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

    
    TP_all = []
    FP_all = []
    TN_all = []
    FN_all = []
    AUC_all = []
    accuracy=[]
    f1_measure=[]
    Recall=[]
    precision=[]

    for train_index, test_index in k_fold.split(features):
        

        
        features_train, features_test = features.iloc[train_index], features.iloc[test_index]
        labels_train, labels_test = labels.iloc[train_index], labels.iloc[test_index]

        
        oversampling = SMOTE(ratio=float(0.18), random_state=42)
        features_oversampling, labels_oversampling = oversampling.fit_sample(features_train, labels_train)

        
        #classifier.fit(features_train, labels_train) , Comment out to evaluare for UNSMOTED case
        classifier.fit(features_oversampling, labels_oversampling)
        
        
        
        

        
        labels_predicted = classifier.predict(features_test)
        table_of_confusion = confusion_matrix(labels_test, labels_predicted, labels=[1, 0])
        
        acc= accuracy_score(labels_predicted,labels_test)*100
        f1= f1_score(labels_predicted,labels_test)*100
        rec= recall_score(labels_predicted,labels_test)*100
        prec=precision_score(labels_predicted,labels_test)*100
        

        TP = table_of_confusion[0][0]
        FP = table_of_confusion[1][0]
        TN = table_of_confusion[1][1]
        FN = table_of_confusion[0][1]

        labels_predicted_probability = classifier.predict_proba(features_test)[:, 1]
        AUC = roc_auc_score(labels_test, labels_predicted_probability)

        
        TP_all.append(TP)
        FP_all.append(FP)
        TN_all.append(TN)
        FN_all.append(FN)
        AUC_all.append(AUC)
        accuracy.append(acc)
        f1_measure.append(f1)
        Recall.append(rec)
        precision.append(prec)

    
    TP_all = np.array(TP_all)
    FP_all = np.array(FP_all)
    TN_all = np.array(TN_all)
    FN_all = np.array(FN_all)
    AUC_all = np.array(AUC_all)
    accuracy=np.array(accuracy)
    f1_measure=np.array(f1_measure)
    Recall=np.array(Recall)
    precision=np.array(precision)
    print("Accuracy: {}".format(np.mean(accuracy)))
    print("Recall: {}".format(np.mean(Recall)))
    print("Precision: {}".format(np.mean(precision)))
    print("F-measure: {}".format(np.sum(f1_measure)))
    print("AUC: {}".format(np.sum(AUC)))
    print("True positives: {}".format(np.sum(TP_all)))
    print("False positives: {}".format(np.sum(FP_all)))
    print("True negatives: {}".format(np.sum(TN_all)))
    print("False negatives: {}".format(np.sum(FN_all)))

    

In [35]:
classifier =RandomForestClassifier(max_depth=4, criterion="gini", random_state=0)

In [36]:
cross_validation_10_fold(classifier, features, labels)



Accuracy: 96.11577978732738
Recall: 1.7074869220861242
Precision: 43.928427036791824
F-measure: 32.8307133473252
AUC: 0.9385153978452382
True positives: 154
False positives: 9016
True negatives: 227675
False negatives: 191
