In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, export_graphviz

import time
import datetime

def load_ccfraud(path = "dataset/fraudTrain.csv"):
    """Load a credit-card fraud CSV and split it into feature and target views.

    Args:
        path: Location of the CSV file. Defaults to the training set, so
            existing zero-argument calls behave exactly as before.

    Returns:
        A 5-tuple ``(ds, data_cols, data, target_col, target)``:
            ds         -- the full DataFrame as read by ``pd.read_csv``
            data_cols  -- list of every column name except the target column
            data       -- list holding one Series per entry of ``data_cols``
            target_col -- name of the target column (``"is_fraud"``)
            target     -- the target column as a Series

    Raises:
        FileNotFoundError: if ``path`` does not exist (raised by pandas).
        KeyError: if the file has no ``is_fraud`` column.
    """
    import os  # local import: only used to derive the file name for the log line

    target_col = "is_fraud"

    ds = pd.read_csv(path)
    print(f'File "{os.path.basename(path)}" loaded...')

    # Every column except the target is treated as a candidate feature.
    data_cols = [col for col in ds.columns if col != target_col]

    return ds, data_cols, [ds[col] for col in data_cols], target_col, ds[target_col]

# Load the training CSV once; the functions and cells below read these
# module-level names (dataframe, data_cols, target_col, target) directly.
dataframe, data_cols, data, target_col, target = load_ccfraud()



# Maps an encoded feature name to the LabelEncoder fitted on it; filled by
# labelEncode() and reused later to transform the test dataframe.
labelEncoderTransformTypes = {}

# constants
const_transac_date_format = "%Y-%m-%d %H:%M:%S"
# const_dob_format = "%Y-%m-%d"
def getTransactionDateUnix(string):
    """Convert a ``YYYY-MM-DD HH:MM:SS`` timestamp string to Unix seconds.

    The string carries no time-zone information. It is interpreted as UTC so
    the resulting feature value is identical on every machine and is not
    distorted around daylight-saving transitions (``time.mktime`` would
    interpret it in the kernel's local time zone).

    Args:
        string: Timestamp text matching ``const_transac_date_format``.

    Returns:
        Seconds since the Unix epoch as a float.

    Raises:
        ValueError: if ``string`` does not match the expected format.
    """
    parsed = datetime.datetime.strptime(string, const_transac_date_format)
    return parsed.replace(tzinfo = datetime.timezone.utc).timestamp()

def getDobFormat(string):
    """Turn a dash-separated date string into a single integer.

    All ``-`` characters are removed and the remaining text is parsed with
    ``int``; for example ``"1988-03-09"`` becomes ``19880309``.
    """
    digits = "".join(string.split("-"))
    return int(digits)





def labelEncode(feature: str, dataframe, map):
    """Replace one column of ``dataframe`` with its label-encoded values.

    Args:
        feature: Name of the column to encode.
        dataframe: Frame that is modified in place; ``dataframe[feature]`` is
            overwritten with the encoder's output.
        map: Dict that receives the fitted encoder under the key ``feature``
            so the same mapping can be applied to other data later.
    """
    column = dataframe[feature]
    encoder = LabelEncoder().fit(column)  # fit() returns the encoder itself
    dataframe[feature] = encoder.transform(column)
    map[feature] = encoder

def removeFeature(feature: str):
    """Remove a feature from the training dataframe and from the feature list.

    Operates on the module-level ``dataframe`` and ``data_cols``.

    Args:
        feature: Name of the column to remove.

    Raises:
        KeyError: if ``feature`` is not a column of ``dataframe``.
        ValueError: if ``feature`` is not present in ``data_cols``.
    """
    # DataFrame.drop returns a new frame by default. The previous code threw
    # that result away, so the column silently stayed in `dataframe`.
    # inplace=True mutates the shared frame, which keeps every existing
    # reference to it valid.
    dataframe.drop(columns = [feature], inplace = True)
    data_cols.remove(feature)


# Categorical columns are replaced by integer codes; each fitted encoder is
# stored in labelEncoderTransformTypes under the column name.
for feature in ("merchant", "category", "gender", "state"):
    labelEncode(feature, dataframe, labelEncoderTransformTypes)

# Columns that are taken out of data_cols, the feature list used for training.
for feature in ("first", "last", "street", "city", "trans_num", "job"):
    removeFeature(feature)

# Date-like strings become plain numbers.
dataframe["trans_date_trans_time"] = dataframe["trans_date_trans_time"].map(getTransactionDateUnix)
dataframe["dob"] = dataframe["dob"].map(getDobFormat)

# Show the dtype of every remaining feature, names padded to 50 characters.
for column in data_cols:
    print(column.ljust(50), "\t", dataframe[column].dtype)

File "fraudTrain.csv" loaded...
#                                                  	 int64
trans_date_trans_time                              	 float64
cc_num                                             	 int64
merchant                                           	 int32
category                                           	 int32
amt                                                	 float64
gender                                             	 int32
state                                              	 int32
zip                                                	 int64
lat                                                	 float64
long                                               	 float64
city_pop                                           	 int64
dob                                                	 int64
unix_time                                          	 int64
merch_lat                                          	 float64
merch_long                                         	 float64


In [3]:
def train(criterion, splitter, max_depth):
    """Fit a DecisionTreeClassifier on the module-level training data.

    The model is trained on ``dataframe[data_cols]`` against ``target`` with
    ``random_state`` fixed to 0, and a summary of the parameters is printed.

    Args:
        criterion: Split-quality criterion name passed to the classifier.
        splitter: Split strategy name passed to the classifier.
        max_depth: Maximum tree depth, or None.

    Returns:
        The fitted classifier.
    """
    settings = {
        "criterion": criterion,
        "splitter": splitter,
        "max_depth": max_depth,
        "random_state": 0,
    }
    model = DecisionTreeClassifier(**settings)
    model.fit(dataframe[data_cols], target)

    summary = "   ".join((
        "criterion:" + criterion,
        "splitter:" + splitter,
        "max_depth:" + str(max_depth),
    ))
    print("Trained with params:\n" + summary)

    return model

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

dataframe_test = pd.read_csv("dataset/fraudTest.csv")

# Apply the encoders that were fitted on the training frame so the integer
# codes of the categorical columns match between the two frames.
for feature, encoder in labelEncoderTransformTypes.items():
    dataframe_test[feature] = encoder.transform(dataframe_test[feature])

# Same numeric conversion of the date-like columns as for the training frame.
dataframe_test["trans_date_trans_time"] = dataframe_test["trans_date_trans_time"].map(getTransactionDateUnix)
dataframe_test["dob"] = dataframe_test["dob"].map(getDobFormat)

print("Loaded and transformed test dataframe...")

Loaded and transformed test dataframe...


In [4]:
# Performance test

def formatNumber(float):
    """Scale a number by 100 and round it to three decimal places.

    The parameter name shadows the builtin ``float``; it is kept because it is
    part of the function's signature.
    """
    scaled = float * 100
    return round(scaled, 3)

def performanceTest(classifier):
    """Print accuracy, precision, recall and F1 of ``classifier`` on the test set.

    Predictions are made on ``dataframe_test[data_cols]`` and compared with
    ``dataframe_test[target_col]``. Precision, recall and F1 use macro
    averaging; every score is passed through ``formatNumber`` before printing.

    Args:
        classifier: Object exposing ``predict(X)``.
    """
    expected = dataframe_test[target_col]
    predicted = classifier.predict(dataframe_test[list(data_cols)])

    macro = {"average": "macro"}
    report = (
        ("\nAccuracy score: ", accuracy_score, {}),
        ("Precision score: ", precision_score, macro),
        ("Recall score: ", recall_score, macro),
        ("F1 score: ", f1_score, macro),
    )
    for label, metric, options in report:
        print(label, formatNumber(metric(expected, predicted, **options)))

In [65]:
# Train a tree (log_loss criterion, best splitter, max depth 9) and print its
# scores on the test dataframe.
performanceTest(train("log_loss", "best", 9))

Trained with params:
criterion:log_loss   splitter:best   max_depth:9

Accuracy score:  99.781
Precision score:  86.525
Recall score:  84.031
F1 score:  85.233


In [6]:
# export tree
import graphviz

# `tree` is only assigned by a cell further down the notebook, so a
# top-to-bottom run (Restart & Run All) would otherwise raise NameError here.
# Train it on demand, with the same parameters that cell uses, when it does
# not exist yet.
if "tree" not in globals():
    tree = train("gini", "best", None)

# Render the fitted tree to dtree_render.svg and open it in the default viewer.
dot_data = export_graphviz(tree, out_file = None, filled = True, rounded = False)
g = graphviz.Source(dot_data)
g.format = 'svg'
g.render('dtree_render', view=True)

'dtree_render.svg'

In [5]:
# Gini tree with no depth limit. The graphviz export cell placed above this
# one reads `tree`.
tree = train("gini", "best", None)

Trained with params:
criterion:gini   splitter:best   max_depth:None
