In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, export_graphviz

import time
import datetime

def load_ccfraud(path = "dataset/fraudTrain.csv"):
    """Read the fraud training CSV and pick out the modelling columns.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to "dataset/fraudTrain.csv",
        the file this notebook has always read.

    Returns
    -------
    tuple
        ``(ds, data_cols, data, target_col, target)``:

        * ``ds`` - the full DataFrame as read from ``path``
        * ``data_cols`` - list of the feature column names
        * ``data`` - list with one Series per entry of ``data_cols``
        * ``target_col`` - name of the label column
        * ``target`` - the label Series, ``ds[target_col]``

    Raises
    ------
    KeyError
        If the file lacks any feature column or the target column.
    """
    import os  # local import: only used to derive the file name for the log line

    ds = pd.read_csv(path)
    print("File \"" + os.path.basename(path) + "\" loaded...")

    data_cols = ["amt", "category", "dob", "gender"]
    target_col = "is_fraud"

    # Fail with one clear message instead of a bare KeyError on the first
    # missing column encountered further down.
    missing = [col for col in data_cols + [target_col] if col not in ds.columns]
    if missing:
        raise KeyError("missing expected column(s): " + ", ".join(missing))

    return ds, data_cols, [ds[col] for col in data_cols], target_col, ds[target_col]

# Load the training data once; the names unpacked here are module-level state
# that the cells below read directly.
(
    dataframe,
    data_cols,
    data,
    target_col,
    target,
) = load_ccfraud()

# Column name -> fitted encoder, filled in by labelEncode().
labelEncoderTransformTypes = dict()



def getDobFormat(string):
    """Turn a dash-separated date string into a single integer.

    Every "-" is removed and the remaining characters are parsed with
    ``int``, e.g. "1958-09-10" becomes 19580910.
    """
    pieces = string.split("-")
    return int("".join(pieces))

def labelEncode(feature: str, dataframe, map):
    """Label-encode one column in place and remember the fitted encoder.

    feature   -- name of the column to encode
    dataframe -- container whose ``feature`` column is overwritten with the
                 encoder's output
    map       -- dict-like registry; the fitted encoder is stored under
                 ``feature`` so the same mapping can be reused later
    """
    column = dataframe[feature]
    encoder = LabelEncoder().fit(column)  # fit() hands back the encoder itself

    dataframe[feature] = encoder.transform(column)
    map[feature] = encoder


# Swap the two text columns for integer codes; each fitted encoder is kept in
# labelEncoderTransformTypes so the identical mapping can be applied elsewhere.
for categorical in ("category", "gender"):
    labelEncode(categorical, dataframe, labelEncoderTransformTypes)

# Dates of birth become plain integers (dashes stripped by getDobFormat).
dataframe["dob"] = list(map(getDobFormat, dataframe["dob"]))

# Report the dtype of every feature column, names padded to 50 characters.
for name in data_cols:
    print(name.ljust(50), "\t", dataframe[name].dtype)

File "fraudTrain.csv" loaded...
amt                                                	 float64
category                                           	 int32
dob                                                	 int64
gender                                             	 int32


In [2]:
def train(criterion, splitter, max_depth):
    """Fit a decision tree on the notebook's training data and return it.

    The three arguments are forwarded unchanged to DecisionTreeClassifier;
    ``random_state`` is pinned to 0. Training reads the module-level
    ``dataframe``, ``data_cols`` and ``target``. After fitting, the chosen
    settings are printed.
    """
    clf = DecisionTreeClassifier(
        criterion = criterion,
        splitter = splitter,
        max_depth = max_depth,
        random_state = 0,
    )
    clf.fit(dataframe[data_cols], target)

    settings = "   ".join((
        "criterion:" + criterion,
        "splitter:" + splitter,
        "max_depth:" + str(max_depth),
    ))
    print("Trained with params:\n" + settings)

    return clf

In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

dataframe_test = pd.read_csv("dataset/fraudTest.csv")

# Re-apply the encoders fitted on the training data so both frames share the
# same integer codes.
for feature, encoder in labelEncoderTransformTypes.items():
    dataframe_test[feature] = encoder.transform(dataframe_test[feature])

# Same integer date-of-birth conversion as the training frame.
dataframe_test["dob"] = list(map(getDobFormat, dataframe_test["dob"]))

print("Loaded and transformed test dataframe...")

Loaded and transformed test dataframe...


In [3]:
# Performance test

def formatNumber(float):
    """Scale a value by 100 and round the result to three decimal places."""
    scaled = float * 100
    return round(scaled, 3)

def performanceTest():
    """Score the pickled tree against the test frame and print the metrics.

    The classifier comes from loadTree(); features and labels are read from
    the module-level ``dataframe_test`` using ``data_cols`` and ``target_col``.
    Accuracy plus macro-averaged precision, recall and F1 are printed, each
    passed through formatNumber().
    """
    classifier = loadTree()

    actual = dataframe_test[target_col]
    predicted = classifier.predict(dataframe_test[list(data_cols)])

    print("\nAccuracy score: ", formatNumber(accuracy_score(actual, predicted)))

    macro_metrics = (
        ("Precision score: ", precision_score),
        ("Recall score: ", recall_score),
        ("F1 score: ", f1_score),
    )
    for label, metric in macro_metrics:
        print(label, formatNumber(metric(actual, predicted, average = "macro")))

In [10]:
tree = train("entropy", "best", 9)

# Decode the integer codes used in the sample row below back to their labels.
for encoded_feature, code in (("category", 8), ("gender", 1)):
    print(labelEncoderTransformTypes[encoded_feature].inverse_transform([code]))

# Single hand-written row, values in data_cols order.
sample_row = pd.DataFrame([[ 780.52, 8, 19580910, 1 ]], columns = data_cols)
print(tree.predict(sample_row))

Trained with params:
criterion:gini   splitter:best   max_depth:None
['misc_net']
['M']
[0]


In [11]:
import pickle as pkl

def saveTree(clf, lencoders):
    """Pickle the classifier and the label encoders into the working directory.

    ``clf`` is written to "tree.pkl" and ``lencoders`` to
    "label_encoders.pkl", both with the highest pickle protocol available.
    """
    artifacts = (
        ("tree.pkl", clf),
        ("label_encoders.pkl", lencoders),
    )
    for filename, payload in artifacts:
        with open(filename, "wb") as handle:
            pkl.dump(payload, handle, protocol = pkl.HIGHEST_PROTOCOL)
        
def loadTree():
    """Return the object pickled in "tree.pkl" in the working directory.

    Only the tree file is read; "label_encoders.pkl" is not loaded here.
    """
    # NOTE: unpickling can execute arbitrary code - only load files you trust.
    with open("tree.pkl", "rb") as handle:
        restored = pkl.load(handle)
    return restored

In [12]:
# Train a fresh tree and persist it together with the fitted label encoders.
saveTree(
    clf = train(criterion = "entropy", splitter = "best", max_depth = 9),
    lencoders = labelEncoderTransformTypes,
)

Trained with params:
criterion:entropy   splitter:best   max_depth:9


In [12]:
# export tree
import graphviz

# Produce DOT text in memory (out_file = None) rather than writing a .dot file.
dot_data = export_graphviz(
    tree,
    out_file = None,
    filled = True,
    rounded = False,
)

# Render as SVG; view = True also opens the result in the default viewer.
g = graphviz.Source(dot_data, format = 'svg')
g.render(filename = 'dtree_render', view = True)

'dtree_render.svg'

In [13]:
# IPython help lookup: shows the documentation for g.render.
# Interactive-only aid; this is IPython syntax, not plain Python.
?g.render

In [7]:
# Look up the integer code the fitted category encoder assigns to "misc_net".
category_encoder = labelEncoderTransformTypes["category"]
print(category_encoder.transform(["misc_net"]))

[8]
