## Imports

In [324]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

## Loading The Dataset

In [311]:
feature_names = ['Price', 'Maintenance Costs', 'Number of Doors', 'Capacity', 'Trunk Size', 'Security', 'Label']
df = pd.read_csv('car.csv', header=None, names=feature_names)

## Preprocessing

In [312]:
replaces = {
    'Price': {'low': 1, 'med': 2, 'high': 3, 'vhigh': 4},
    'Maintenance Costs': {'low': 1, 'med': 2, 'high': 3, 'vhigh': 4},
    'Number of Doors': {'2': 2, '3': 3, '4': 4, '5more': 5},
    'Capacity': {'2': 2, '4': 4, 'more': 5},
    'Trunk Size': {'small': 1, 'med': 2, 'big': 3},
    'Security': {'low': 1, 'med': 2, 'high': 3},
}

In [313]:
df = df.replace(replaces)

## Train-Test Split

In [315]:
train, test = train_test_split(df, test_size=0.2)
x_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1:]
x_test = test.iloc[:, :-1]
y_test = test.iloc[:, -1:]

## Creating Decision Tree with Different Depths

In [337]:
depths = [5, 7, 10]

accuracy = lambda preds, labels: sum([1 for e in np.equal(preds, labels) if e == True]) / len(preds)

for d in depths:
    
    dtree = tree.DecisionTreeClassifier(max_depth=d)
    dtree = dtree.fit(x_train, y_train)
    train_preds = dtree.predict(x_train)
    test_preds = dtree.predict(x_test)
    
    print(f'Depth= {d:2}: Train Accuracy= {accuracy(train_preds, y_train.to_numpy()[:, 0]):.2f}',
         f'Test Accuracy= {accuracy(test_preds, y_test.to_numpy()[:, 0]):.2f}')


Depth=  5: Train Accuracy= 0.88 Test Accuracy= 0.86
Depth=  7: Train Accuracy= 0.94 Test Accuracy= 0.92
Depth= 10: Train Accuracy= 0.99 Test Accuracy= 0.99


## Confusion Matrix, F-Score and ROC for Depth= 5

In [341]:
dtree = tree.DecisionTreeClassifier(max_depth=6)
dtree = dtree.fit(x_train, y_train)
test_preds = dtree.predict(x_test)

In [342]:
confusion_matrix(y_test.to_numpy()[:, 0], test_preds)

array([[ 62,   9,   0,   2],
       [  0,  15,   0,   0],
       [ 10,   2, 236,   0],
       [  1,   0,   0,   9]])

In [340]:
f1_score(y_test.to_numpy()[:, 0], test_preds, average='micro')

0.9855491329479769

In [None]:
## TODO: ROC (https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html)

## Saving Decision Tree in PDF for Depth= 5

In [319]:
import graphviz
dot_data = tree.export_graphviz(
    dtree, out_file=None,
    feature_names=class_names[: -1],  
    class_names=y_train['Label'].unique(),  
    filled=True, rounded=True,  
    special_characters=True
) 
graph = graphviz.Source(dot_data) 
graph.render()

'Source.gv.pdf'