In [1]:
import pandas as pd
import numpy as np
import datetime
import time
import matplotlib.pyplot as plt
from sklearn import neighbors
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from operator import itemgetter
from itertools import groupby
import seaborn as sns
from sklearn.model_selection import cross_val_score


In [None]:
# Default figure size for every plot in the notebook, in inches
# (11.7 x 8.27 -- presumably A4 landscape).
sns.set(
    rc={
        "figure.figsize": (11.7, 8.27),
    }
)

In [None]:
# Load the raw transactions and convert both date columns to datetimes
data = pd.read_csv("data_for_student_case.csv")
for date_column in ('bookingdate', 'creationdate'):
    data[date_column] = pd.to_datetime(data[date_column])

In [None]:
# Preview the first five rows of the raw data
# (data['simple_journal'].unique() lists the distinct journal labels)
data.head(5)

In [None]:
# Split off the refused transactions (kept separately for possible later use)
# and label the remaining ones.
is_refused = data['simple_journal'] == "Refused"
refused_data = data.loc[is_refused, :]

# Take an explicit copy: adding columns to a .loc slice of `data` triggers
# SettingWithCopyWarning and leaves it ambiguous whether `data` is modified.
dataset = data.loc[~is_refused, :].copy()

# 1/0 indicator columns derived from the journal label
dataset['bool_fraud'] = (dataset['simple_journal'] == "Chargeback").astype(int)
dataset['bool_valid'] = (dataset['simple_journal'] == "Settled").astype(int)

In [None]:
# Preprocessing for the ML algorithms: work on a copy of `dataset` and strip
# the textual marker from each identifier column.
finalset = dataset.copy()
for id_column, marker in (('mail_id', 'email'), ('ip_id', 'ip'), ('card_id', 'card')):
    finalset[id_column] = finalset[id_column].str.replace(marker, '')


In [None]:
# Group on dates: build each GroupBy once and reuse it for mean / sum / count.
# NOTE(review): freq='M' is kept for compatibility with the pandas version this
# notebook was written for; newer pandas may prefer 'ME' -- confirm on upgrade.
creation_daily = finalset.groupby(pd.Grouper(key='creationdate', freq='D'))
creation_monthly = finalset.groupby(pd.Grouper(key='creationdate', freq='M'))
booking_daily = finalset.groupby(pd.Grouper(key='bookingdate', freq='D'))
booking_monthly = finalset.groupby(pd.Grouper(key='bookingdate', freq='M'))

# numeric_only=True: the frame also holds string and datetime columns, which
# cannot be averaged/summed meaningfully (recent pandas raises TypeError on
# them, older versions concatenate the strings).
creation_dailygroup_mean = creation_daily.mean(numeric_only=True)
creation_monthlygroup_mean = creation_monthly.mean(numeric_only=True)
booking_dailygroup_mean = booking_daily.mean(numeric_only=True)
booking_monthlygroup_mean = booking_monthly.mean(numeric_only=True)

creation_dailygroup_sum = creation_daily.sum(numeric_only=True)
creation_monthlygroup_sum = creation_monthly.sum(numeric_only=True)
booking_dailygroup_sum = booking_daily.sum(numeric_only=True)
booking_monthlygroup_sum = booking_monthly.sum(numeric_only=True)

# count() gives the number of non-null entries per column, so it stays
# applicable to every column.
creation_dailygroup_count = creation_daily.count()
creation_monthlygroup_count = creation_monthly.count()
booking_dailygroup_count = booking_daily.count()
booking_monthlygroup_count = booking_monthly.count()

# Group on the identifier columns (card, ip, mail) and on the account code
card_id_sum = finalset.groupby('card_id').sum(numeric_only=True)
ip_id_sum = finalset.groupby('ip_id').sum(numeric_only=True)
mail_id_sum = finalset.groupby('mail_id').sum(numeric_only=True)

accountcode_sum = finalset.groupby('accountcode').sum(numeric_only=True)


In [None]:
# Summed bool_fraud per account code
sns.barplot(
    data=accountcode_sum,
    x=accountcode_sum.index,
    y="bool_fraud",
)

In [None]:
# The 25 ip ids with the highest summed bool_fraud, in descending order
sorted_ips = ip_id_sum.sort_values(by="bool_fraud", ascending=False).iloc[:25]
# sort=False keeps the ranking order on the x axis
sns.lineplot(x=sorted_ips.index, y="bool_fraud", data=sorted_ips, sort=False)


In [None]:
# The 25 card ids with the highest summed bool_fraud, in descending order
sorted_cards = card_id_sum.sort_values(by="bool_fraud", ascending=False).iloc[:25]
# sort=False keeps the ranking order on the x axis
sns.lineplot(x=sorted_cards.index, y="bool_fraud", data=sorted_cards, sort=False)

In [None]:

# The 25 mail ids with the highest summed bool_fraud, in descending order.
# NOTE(review): the variable is still called `sorted_cards` although it now
# holds the per-mail aggregates; it overwrites the per-card ranking.
sorted_cards = mail_id_sum.sort_values(by="bool_fraud", ascending=False).iloc[:25]
# sort=False keeps the ranking order on the x axis
sns.lineplot(x=sorted_cards.index, y="bool_fraud", data=sorted_cards, sort=False)

In [None]:
# Heatmap of chargeback counts: shopper country (rows) x shopper interaction (columns)
heatmap_data = pd.pivot_table(
    finalset,
    values="simple_journal",
    index="shoppercountrycode",
    columns="shopperinteraction",
    aggfunc=lambda journal: (journal == "Chargeback").sum(),
)
# Combinations that never occur come out as NaN; plot them as 0
ax = sns.heatmap(heatmap_data.fillna(0))



In [None]:
# Heatmap of chargeback counts: transaction variant (rows) x shopper interaction (columns)
heatmap_data = pd.pivot_table(
    finalset,
    values="simple_journal",
    index="txvariantcode",
    columns="shopperinteraction",
    aggfunc=lambda journal: (journal == "Chargeback").sum(),
)
# Combinations that never occur come out as NaN; plot them as 0
ax = sns.heatmap(heatmap_data.fillna(0))

In [None]:
# Display the column labels of the preprocessed frame
finalset.columns

In [None]:
# Heatmap of chargeback counts: card id (rows) x shopper interaction (columns)
# NOTE(review): card_id presumably has very many distinct values, which would
# make this pivot slow and the heatmap hard to read -- confirm it is useful.
heatmap_data = pd.pivot_table(
    finalset,
    values="simple_journal",
    index="card_id",
    columns="shopperinteraction",
    aggfunc=lambda journal: (journal == "Chargeback").sum(),
)
# Combinations that never occur come out as NaN; plot them as 0
ax = sns.heatmap(heatmap_data.fillna(0))

In [None]:
# # Heatmap stuff
# heatmap_data = pd.pivot_table(finalset, "simple_journal", "ip_id","shopperinteraction", aggfunc=lambda x: sum(x == "Chargeback"))
# ax = sns.heatmap(heatmap_data)

In [None]:
# # Heatmap stuff
# heatmap_data = pd.pivot_table(finalset, "simple_journal", "accountcode","shopperinteraction", aggfunc=lambda x: sum(x == "Chargeback"))
# ax = sns.heatmap(heatmap_data)

In [None]:
# Line plot of the summed bool_fraud per creation day
ax = sns.lineplot(x=creation_dailygroup_sum.index, y="bool_fraud", data=creation_dailygroup_sum)
# Only rotate the tick labels that the date axis generates itself. Passing one
# label per data row to set_xticklabels() mislabels the axis, because a line
# plot has just a handful of automatically placed ticks, not one per row.
ax.tick_params(axis="x", labelrotation=90)

In [None]:
# Bar plot of the summed bool_fraud per creation month
month_labels = [stamp.date() for stamp in creation_monthlygroup_sum.index]
ax = sns.barplot(
    data=creation_monthlygroup_sum,
    x=creation_monthlygroup_sum.index,
    y="bool_fraud",
)
# One label per row of the monthly frame, rotated to stay readable
ax.set_xticklabels(labels=month_labels, rotation=90)

In [None]:
# Display the index of the monthly aggregate (the groups produced by the creationdate Grouper)
creation_monthlygroup_sum.index

In [None]:
# issuercountry
# txvariantcode
# currencycode
# shoppercountry
# interaction
# verification
# accountcode



In [None]:
# Display the column labels again before choosing the features to encode
finalset.columns

In [None]:
# Categorical columns that are turned into one-hot indicator columns
targets_for_onehot = ['issuercountrycode', 'txvariantcode','currencycode', 'shoppercountrycode', 'shopperinteraction', 'cardverificationcodesupplied', 'cvcresponsecode']

# Encode all categorical columns in one call. Passing `columns=` forces every
# listed column to be encoded (also non-string ones) and prefixes each indicator
# with its source column, so values shared between columns (e.g. the same code
# in issuercountrycode and shoppercountrycode) no longer produce duplicate
# column labels. Column order and values match a per-column encoding.
onehot_features = pd.get_dummies(finalset[targets_for_onehot], columns=targets_for_onehot)

# Creation time as a POSIX timestamp (float seconds)
creation_timestamp = finalset['creationdate'].apply(lambda x: x.timestamp())

# Assemble the feature frame with a single concat instead of growing it
# piece by piece, then replace missing values by 0.
new_df = pd.concat(
    [
        onehot_features,
        creation_timestamp,
        finalset[['mail_id','ip_id','card_id','bin','amount']],
    ],
    axis=1,
).fillna(0)

In [None]:
def print_scores(y_predict, y_true):
    """Print the confusion-matrix counts for binary (0/1) labels.

    Parameters
    ----------
    y_predict : array-like
        Predicted labels.
    y_true : array-like
        Ground-truth labels, same shape as ``y_predict``.

    Prints four lines (``TP``, ``FP``, ``FN``, ``TN``). Entries whose value is
    neither 0 nor 1 are not counted in any cell.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Convert to arrays so the comparison is positional; indexing a pandas
    # Series with y_true[i] would do a label lookup instead.
    predicted = np.asarray(y_predict)
    actual = np.asarray(y_true)
    if predicted.shape != actual.shape:
        raise ValueError(
            'y_predict and y_true must have the same shape, got '
            + str(predicted.shape) + ' and ' + str(actual.shape)
        )

    predicted_pos, predicted_neg = predicted == 1, predicted == 0
    actual_pos, actual_neg = actual == 1, actual == 0

    TP = int(np.count_nonzero(actual_pos & predicted_pos))
    FP = int(np.count_nonzero(actual_neg & predicted_pos))
    FN = int(np.count_nonzero(actual_pos & predicted_neg))
    TN = int(np.count_nonzero(actual_neg & predicted_neg))

    print('TP: '+ str(TP))
    print('FP: '+ str(FP))
    print('FN: '+ str(FN))
    print('TN: '+ str(TN))


In [None]:

# Feature matrix and target vector shared by all classifiers below
x = new_df.values
x[x=="NA"] = 0  # literal "NA" strings are replaced by 0
# Convert to float once here rather than leaving an object array for every
# estimator to convert on each fit/predict call.
x = x.astype(float)
y = finalset['bool_fraud'].values

# Hold out 10% for testing. stratify=y keeps the class ratio identical in both
# parts and random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, stratify=y, random_state=42
)
clf = neighbors.KNeighborsClassifier(algorithm = 'kd_tree')
clf.fit(x_train, y_train)
y_predict = clf.predict(x_test)

# Per-sample class probabilities; column 1 belongs to the positive class (label 1)
predict_proba = clf.predict_proba(x_test)

# The precision/recall curve needs continuous scores: with hard 0/1
# predictions it degenerates to a single operating point.
precision, recall, thresholds = precision_recall_curve(y_test, predict_proba[:, 1])

In [None]:
from sklearn import svm

## Run the svm

# clf = svm.SVC(kernel='linear', C=1).fit(x_train, y_train)
# clf.score(x_test, y_test) 

In [None]:
from sklearn import tree

## Run the decision tree
# Errors on class 1 are weighted 10000 times heavier than errors on class 0
fraud_weights = {0: 1, 1: 10000}
clf = tree.DecisionTreeClassifier(class_weight=fraud_weights)
clf.fit(x_train, y_train)
y_predict = clf.predict(x_test)
print_scores(y_predict, y_test)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

## Run the AdaBoost class
# Weak learner: a depth-1 tree with class 1 weighted 100:1
weak_learner = tree.DecisionTreeClassifier(class_weight={0:1, 1:100}, max_depth=1)
# The estimator is passed positionally (as the bagging cell does): the keyword
# was renamed from `base_estimator` to `estimator` in newer scikit-learn, so
# the positional form works with both old and new releases.
clf = AdaBoostClassifier(weak_learner, n_estimators=100).fit(x_train, y_train)
y_predict = clf.predict(x_test)
print_scores(y_predict, y_test)

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Run the bagging: an ensemble of k-NN models, each fitted on half of the
# samples and half of the features
clf = BaggingClassifier(
    KNeighborsClassifier(),
    max_samples=0.5,
    max_features=0.5,
)
clf.fit(x_train, y_train)
y_predict = clf.predict(x_test)
print_scores(y_predict, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Run the random forest with 100 trees
clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train, y_train)
y_predict = clf.predict(x_test)
print_scores(y_predict, y_test)

In [None]:
# Number of test samples labelled 0 (total minus the sum of the 0/1 labels)
len(y_test) - np.sum(y_test)