In [None]:
import numpy as np
import pandas as pd
import pydotplus
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from IPython.display import Image

In [None]:
# read data and feature name from excel file
def read_data(path = None):
    """Load a dataset from an Excel file and prepare it for modelling.

    The 'nameid' column is dropped and every value of the 'revenue' column
    is replaced by the index of the 10000-wide bin it falls into.

    Parameters
    ----------
    path : str or path-like
        Location of the Excel file; forwarded to ``pd.read_excel``.

    Returns
    -------
    dataset : numpy.ndarray
        The values of the prepared frame (``df.values``), one row per sample.
    features : numpy.ndarray
        The column names, in the same order as the columns of ``dataset``.

    Raises
    ------
    ValueError
        If ``path`` is None (the default), since there is nothing to read.
    """
    if path is None:
        raise ValueError("read_data() needs the path of an Excel file, got None")

    df = pd.read_excel(path)

    # delete column 'nameid'
    df = df.drop(columns = ['nameid'])

    # discretize feature 'revenue' into the bins [0, 10000], (10000, 20000], ...
    # include_lowest = True keeps a revenue of exactly 0 inside the first bin;
    # without it that value lies outside every interval and becomes NaN.
    # Values below 0 or above the last edge still map to NaN.
    threshold = [0,10000,20000,30000,40000,50000]
    df['revenue'] = pd.cut(df['revenue'], threshold, labels = False, include_lowest = True)

    # fetch dataset and names of each feature
    dataset = df.values
    features = df.columns.values

    return dataset, features

In [None]:
# load the training split, then rebuild a DataFrame so the columns carry their names again
train_dataset, train_features = read_data(path = r"train.xls")
train_data = pd.DataFrame(data = train_dataset, columns = train_features)

print(train_data)

In [None]:
# load the test split, then rebuild a DataFrame so the columns carry their names again
test_dataset, test_features = read_data(path = r"test.xls")
test_data = pd.DataFrame(data = test_dataset, columns = test_features)

print(test_data)

In [None]:
# split every frame into its feature matrix (all columns but the last)
# and its label vector (the last column)
X_train, Y_train = np.array(train_data.iloc[:, :-1]), np.array(train_data.iloc[:, -1])

X_test, Y_test = np.array(test_data.iloc[:, :-1]), np.array(test_data.iloc[:, -1])

In [None]:
# Model selection: choose the tree depth with the best mean F1 score under
# 8-fold cross validation on the training set.
#
# best_F1_score starts below every attainable F1 (F1 >= 0), so the first depth
# tried always becomes the baseline. Starting at 0 left best_depth at 0 when
# every depth scored 0, and max_depth = 0 is not a valid tree depth.
best_F1_score = -1.0    # use F1 score to evaluate the model
best_depth = 0          # depth of dtree where we got the best performance

# test max depth of the decision tree from 2 to 10
for depth in range(2, 11):
    # fixed random_state: scikit-learn permutes the features at every split,
    # so ties between equally good splits would otherwise be resolved
    # differently from run to run and could change the selected depth
    dt_model = DecisionTreeClassifier(max_depth = depth, random_state = 0)

    # 8-fold cross validation: divide train set into train and valid (7:1)
    F1_scores = []
    kf = KFold(n_splits = 8, shuffle = False)
    for train_index, valid_index in kf.split(X_train):
        KX_train, KX_valid = X_train[train_index], X_train[valid_index]
        KY_train, KY_valid = Y_train[train_index], Y_train[valid_index]

        # train the model (fit() discards whatever was learnt on the previous fold)
        dt_model.fit(KX_train, KY_train)

        # result of the prediction
        result = dt_model.predict(KX_valid)

        # confusion-matrix counts with label 1 as the positive class
        actual_pos = KY_valid == 1
        predicted_pos = result == 1
        TP = int(np.sum(actual_pos & predicted_pos))
        FN = int(np.sum(actual_pos & ~predicted_pos))
        FP = int(np.sum(~actual_pos & predicted_pos))
        TN = int(np.sum(~actual_pos & ~predicted_pos))

        # calculate the F1 score and store it; a fold with no actual and no
        # predicted positives has a zero denominator and is scored as 0.0
        # instead of raising ZeroDivisionError
        denominator = 2 * TP + FP + FN
        F1_score = 2 * TP / denominator if denominator else 0.0
        F1_scores.append(F1_score)

    # use the mean of F1 scores as the final F1 score of this depth parameter
    curr_F1_score = np.mean(F1_scores)

    print("depth:", depth, " F1:", curr_F1_score)

    # if this depth turns out to be a better parameter, update best depth
    # (strict '>' keeps the shallowest depth among equal scores)
    if curr_F1_score > best_F1_score:
        best_depth = depth
        best_F1_score = curr_F1_score

In [None]:
# Final evaluation: refit on the whole training set with the selected depth
# and report precision, recall and F1 on the held-out test set.
print("best depth:", best_depth)

# use the best parameter to train the model
# fixed random_state: scikit-learn permutes the features at every split, so
# without a seed the fitted tree (and the metrics below) can differ between runs
dt_model = DecisionTreeClassifier(max_depth = best_depth, random_state = 0)
dt_model.fit(X_train, Y_train)

# use test set to test the performance of the model
result = dt_model.predict(X_test)

# confusion-matrix counts with label 1 as the positive class
actual_pos = Y_test == 1
predicted_pos = result == 1
TP = int(np.sum(actual_pos & predicted_pos))
FN = int(np.sum(actual_pos & ~predicted_pos))
FP = int(np.sum(~actual_pos & predicted_pos))
TN = int(np.sum(~actual_pos & ~predicted_pos))

# calculate Precision Recall and F1-score
# each ratio is reported as 0.0 when its denominator is 0 (no predicted
# positives for precision, no actual positives for recall, neither for F1)
# instead of raising ZeroDivisionError
P = TP / (TP + FP) if (TP + FP) else 0.0
R = TP / (TP + FN) if (TP + FN) else 0.0
F1 = 2 * TP / (2 * TP + FP + FN) if (2 * TP + FP + FN) else 0.0

print("Precision: ", P)
print("Recall: ", R)
print("F1-score: ", F1)

In [None]:
# draw the fitted decision tree
# names shown on the split nodes, listed in the column order of the feature matrix
features = [
    'profession',
    'education',
    'house_loan',
    'car_loan',
    'married',
    'child',
    'revenue',
]
# names shown for the predicted classes
labels = ['no loan', 'loan']

# export the tree as DOT text and strip every newline from it straight away
# (the original note says this avoids shading on the rendered image)
dot_data = export_graphviz(
    dt_model,
    out_file=None,
    class_names=labels,
    feature_names=features,
    filled=True,
    rounded=True,
    special_characters=True,
).replace('\n', '')

graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())