# Decision Tree Classifier (ID3 Algorithm)

**Reading the Dataset**

In [1]:
import pandas as pd

dataframe = pd.read_csv('car.data.txt',header = None, names = ['Buying','Maintenance','Doors','Persons','Lugboot','Safety','Class'])
dataframe.head()

Unnamed: 0,Buying,Maintenance,Doors,Persons,Lugboot,Safety,Class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


**Functions to Calculate Entropy and Information Gain**

In [2]:
import numpy as np

def entropy(df,label):
    values,counts = np.unique(df[label],return_counts=True)
    return sum((-counts[i]/np.sum(counts))*np.log2(counts[i]/np.sum(counts)) for i in range(len(counts)))

def information_gain(df,feature,label):
    e = entropy(df,label)
    values,counts = np.unique(df[feature],return_counts=True)
    return e-sum((counts[i]/sum(counts))*entropy(df.where(df[feature]==values[i]).dropna(),label) for i in range(len(counts)))

**Function to Create Decision Tree**

In [3]:
def decision_tree(dataset,features,label,parent=None):
    dat = np.unique(dataset[label],return_counts=True)
    unq = np.unique(dataset[label])
    if len(unq)==1:        #If there's only one value in the target label then return that value
        return unq[0]
    elif len(dataset) == 0:
        return unq[np.argmax(dat[1])]
    elif len(features) == 0: #If the feature set becomes empty i.e. all the features have been utilised
        return parent
    else:
        parent = unq[np.argmax(dat[1])]
        IG = [information_gain(dataset,feature,label) for feature in features]
        split_feature = features[np.argmax(IG)]
        DT = {'Split Feature':split_feature}
        for value in np.unique(dataset[split_feature]):
            new_features = [i for i in features if i != split_feature]
            new_dataset = dataset.where(dataset[split_feature]==value).dropna()
            new_tree = decision_tree(new_dataset,new_features,label,parent)
            DT[value] = new_tree
    return DT

**Function to Make Prediction**

In [4]:
import random

def predict(X_test,tree):
    layer = tree
    while type(layer) == dict:
        a = layer['Split Feature']
        b = X_test[a]
        if b in layer:
            layer = layer[b]
        else:
            p = layer.items()
            p = [m[1] for m in p if m[0] != 'Split Feature']
            layer = random.choice(p)
    return layer

**Function to Generate Decision Rules**

In [5]:
def gen_rules(decisiontree,rules,lst = [],split_feature = None):
    split_feature = decisiontree['Split Feature']
    for x in decisiontree:
        new_list = list(lst)
        if x!= 'Split Feature':
            if type(decisiontree[x]) == dict:
                new_list.append((split_feature, x))
                gen_rules(decisiontree[x],rules,new_list,split_feature)
            else:
                new_list.append((split_feature,x))
                if decisiontree[x] not in rules:
                    rules[decisiontree[x]] = set()
                rules[decisiontree[x]].add(tuple(new_list))

**Function to print Decision Rules**

In [6]:
def print_decisionRules(rules):
    for y in rules:
        j = 0
        for x in rules[y]:
            print('(',end='')
            for i in range(len(x)):
                print(f'{x[i][0]} = {x[i][1]}',end='')
                if i < len(x)-1:
                    print(' ∧ ',end='')
            if j < len(rules[y])-1:
                print(') ∨ ',end='')
            else:
                print(')',end='')
            j = j+1
        print(f' => {y}\n')

**Split the Dataset into Training and Testing Data**

In [7]:
from sklearn.model_selection import train_test_split

X_train,X_test = train_test_split(dataframe,test_size=0.2) #80% Training Data and 20% Testing Data

**Create the Decision Tree (ID3 Tree)**

In [8]:
features = list(dataframe.columns)
features.remove('Class')
decisiontree = decision_tree(X_train,features,'Class')

**Generate Decision Rules**

In [9]:
rules = {}
gen_rules(decisiontree,rules)

**Export the Tree to File**

In [10]:
import sys,json

current_stdout = sys.stdout
with open('Output_Tree.txt','w') as tree:
    sys.stdout = tree
    print(json.dumps(decisiontree,indent = 4))
    sys.stdout = current_stdout

***TREE***

In [11]:
print("GENERATED DECISION TREE\n\nA tree node has split feature which is its name and then sub trees for different values of that feature\n")
print(json.dumps(decisiontree,indent = 4))

GENERATED DECISION TREE

A tree node has split feature which is its name and then sub trees for different values of that feature

{
    "Split Feature": "Safety",
    "high": {
        "Split Feature": "Persons",
        "2": "unacc",
        "4": {
            "Split Feature": "Buying",
            "high": {
                "Split Feature": "Maintenance",
                "high": "acc",
                "low": "acc",
                "med": "acc",
                "vhigh": "unacc"
            },
            "low": {
                "Split Feature": "Maintenance",
                "high": {
                    "Split Feature": "Lugboot",
                    "big": "vgood",
                    "med": {
                        "Split Feature": "Doors",
                        "2": "acc",
                        "4": "vgood",
                        "5more": "vgood"
                    },
                    "small": "acc"
                },
                "low": {
                    "Split 

***DECISION RULES***

In [12]:
print_decisionRules(rules)

(Safety = med ∧ Persons = 4 ∧ Buying = low ∧ Maintenance = vhigh ∧ Lugboot = small) ∨ (Safety = high ∧ Persons = more ∧ Buying = high ∧ Maintenance = vhigh) ∨ (Safety = high ∧ Persons = more ∧ Buying = low ∧ Maintenance = vhigh ∧ Lugboot = small ∧ Doors = 2) ∨ (Safety = med ∧ Persons = more ∧ Lugboot = med ∧ Buying = med ∧ Maintenance = high ∧ Doors = 2) ∨ (Safety = med ∧ Persons = 4 ∧ Buying = med ∧ Maintenance = high ∧ Lugboot = med ∧ Doors = 2) ∨ (Safety = med ∧ Persons = more ∧ Lugboot = med ∧ Buying = low ∧ Maintenance = vhigh ∧ Doors = 2) ∨ (Safety = med ∧ Persons = more ∧ Lugboot = small ∧ Buying = med ∧ Maintenance = low ∧ Doors = 2) ∨ (Safety = med ∧ Persons = 4 ∧ Buying = med ∧ Maintenance = vhigh ∧ Lugboot = small) ∨ (Safety = med ∧ Persons = 4 ∧ Buying = high ∧ Lugboot = med ∧ Doors = 3) ∨ (Safety = med ∧ Persons = more ∧ Lugboot = small ∧ Buying = high) ∨ (Safety = high ∧ Persons = more ∧ Buying = high ∧ Maintenance = med ∧ Doors = 2 ∧ Lugboot = small) ∨ (Safety = high ∧ P

(Safety = med ∧ Persons = more ∧ Lugboot = big ∧ Maintenance = med ∧ Buying = low) ∨ (Safety = high ∧ Persons = 4 ∧ Buying = low ∧ Maintenance = low ∧ Lugboot = med ∧ Doors = 3) ∨ (Safety = med ∧ Persons = more ∧ Lugboot = med ∧ Buying = low ∧ Maintenance = low ∧ Doors = 3) ∨ (Safety = high ∧ Persons = 4 ∧ Buying = med ∧ Maintenance = low ∧ Lugboot = small) ∨ (Safety = med ∧ Persons = more ∧ Lugboot = med ∧ Buying = med ∧ Maintenance = low ∧ Doors = 4) ∨ (Safety = high ∧ Persons = more ∧ Buying = low ∧ Maintenance = med ∧ Lugboot = small ∧ Doors = 4) ∨ (Safety = med ∧ Persons = more ∧ Lugboot = big ∧ Maintenance = low ∧ Buying = low) ∨ (Safety = high ∧ Persons = 4 ∧ Buying = med ∧ Maintenance = low ∧ Lugboot = med ∧ Doors = 3) ∨ (Safety = high ∧ Persons = more ∧ Buying = med ∧ Maintenance = low ∧ Lugboot = small ∧ Doors = 3) ∨ (Safety = med ∧ Persons = more ∧ Lugboot = med ∧ Buying = med ∧ Maintenance = low ∧ Doors = 5more) ∨ (Safety = med ∧ Persons = 4 ∧ Buying = low ∧ Maintenance = m

***TRAINING SET ACCURACY***

In [13]:
from sklearn.metrics import accuracy_score

y_train_pred = []
for instance in X_train.iterrows():
    y = predict(instance[1],decisiontree)
    y_train_pred.append(y)
print("Training set Accuracy score =",accuracy_score(X_train['Class'],y_train_pred)*100)

Training set Accuracy score = 100.0


***TEST SET ACCURACY***

In [14]:
y_pred = []
i = 1
for instance in X_test.iterrows():
    y = predict(instance[1],decisiontree)
    y_pred.append(y)
print("Testing set Accuracy score =",accuracy_score(X_test['Class'],y_pred)*100)

Testing set Accuracy score = 89.88439306358381
