In [None]:
import pandas as pd
import numpy as np
from pprint import pprint

In [None]:
# Load the training data from the local drive via the Colab upload dialog.
import io

from google.colab import files

uploaded = files.upload()

# The upload dict is keyed by the stored file name. If the expected name is not
# there (e.g. the file was stored under a different/suffixed name), fall back to
# the single uploaded file instead of failing with a bare KeyError.
csv_name = 'car_evaluation.csv'
if csv_name not in uploaded:
    if len(uploaded) != 1:
        raise FileNotFoundError(
            "Expected an upload named 'car_evaluation.csv'; got: %s" % sorted(uploaded)
        )
    csv_name = next(iter(uploaded))

car_data = pd.read_csv(io.BytesIO(uploaded[csv_name]))

# Peek at a few rows (label-based slice, both ends inclusive).
car_data.loc[253:255, :]

In [None]:
# Randomly hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
train_sample = car_data.sample(frac=0.75, random_state=99)
held_out_mask = ~car_data.index.isin(train_sample.index)

# Renumber both partitions 0..n-1 by building new frames rather than mutating
# the sampled/sliced frames in place.
traindata = train_sample.reset_index(drop=True)
testdata = car_data.loc[held_out_mask, :].reset_index(drop=True)

# Positional indices of each partition (kept for any later cell that uses them).
trainidx = np.arange(0, traindata.shape[0])
testidx = np.arange(0, testdata.shape[0])

print(traindata.shape,testdata.shape)

In [None]:
# Split the training frame into the feature matrix X (columns "price" through
# "safety", inclusive) and the ground-truth vector Y (the "profitable" column),
# row-aligned with X.
feature_slice = slice("price", "safety")
X = traindata.loc[:, feature_slice]
Y = traindata.loc[:, "profitable"]

x_preview = X.head(3)
y_preview = Y.head(3)
print(x_preview, "\n", y_preview, X.shape)

In [None]:
# Re-attach the labels to the features (index-aligned left join) so the tree
# builder receives one frame with the class label as its last column.
tdata = X.join(Y, how="left")
tdata.iloc[:2]

In [None]:
#Compute entropy
def entropy(fdata):
    """Return the Shannon entropy (in bits) of the values in ``fdata``.

    Parameters
    ----------
    fdata : array-like
        Observed class labels (anything ``np.unique`` accepts).

    Returns
    -------
    float
        ``-sum(p_i * log2(p_i))`` over the distinct values of ``fdata``.
        Empty input has no uncertainty and yields ``0.0``.
    """
    _, counts = np.unique(fdata, return_counts=True)
    if counts.size == 0:
        return 0.0
    probabilities = counts / counts.sum()
    # Sum the contribution of every class, not just the last one. ``counts``
    # never contains zeros, so log2 is always finite here.
    bits = -np.sum(probabilities * np.log2(probabilities))
    # ``+ 0.0`` normalises the -0.0 produced for a single-class input.
    return float(bits) + 0.0

In [None]:
##Info Gain

def InfoGain(X,Y,selected_feature):
    """Information gain obtained by splitting on ``selected_feature``.

    Parameters
    ----------
    X : pandas.DataFrame
        Examples; must contain the column ``selected_feature``.
    Y : array-like
        Class labels, one per row of ``X`` (same order).
    selected_feature : str
        Name of the column of ``X`` to split on.

    Returns
    -------
    float
        ``H(Y) - sum_v P(feature == v) * H(Y | feature == v)``.
    """
    feature_values = np.asarray(X[selected_feature])
    labels = np.asarray(Y)
    if labels.ndim != 1 or labels.shape[0] != feature_values.shape[0]:
        # Y is not row-aligned with X. Fall back to the label column carried
        # inside X, which is where the labels were previously always read from.
        labels = np.asarray(X['profitable'])

    total_entropy = entropy(labels)

    vals, counts = np.unique(feature_values, return_counts=True)
    weights = counts / counts.sum()

    # Accumulate the weighted entropy over every feature value; keeping only
    # the last value's term would make the gain meaningless.
    weighted_entropy = sum(
        weight * entropy(labels[feature_values == val])
        for val, weight in zip(vals, weights)
    )

    # Formula for information gain.
    return total_entropy - weighted_entropy

In [None]:
def _majority_class(labels):
    """Return the most frequent value in ``labels`` (ties: smallest value wins)."""
    classes, counts = np.unique(labels, return_counts=True)
    return classes[np.argmax(counts)]


def ID3(tdata,sdata,features,class_label="profitable", parent_node_class=None, min_samples=5):
    """Recursively build an ID3 decision tree as a nested dictionary.

    Parameters
    ----------
    tdata : pandas.DataFrame
        The complete training set; only used as a fallback when ``sdata`` is empty.
    sdata : pandas.DataFrame
        The examples that reached the current node.
    features : sequence of str
        Column names still available for splitting.
    class_label : str
        Name of the column holding the class labels.
    parent_node_class : object
        Majority class of the parent node; returned when no features remain.
    min_samples : int
        Nodes with at most this many examples become leaves (pre-pruning).

    Returns
    -------
    dict or scalar
        ``{feature: {value: subtree_or_label, ...}}`` for an internal node,
        or a class label for a leaf.
    """
    # An empty partition carries no information: use the global majority class.
    # This must be checked first, otherwise the purity test below would index
    # into an empty array.
    if len(sdata) == 0:
        return _majority_class(tdata[class_label])

    # If all class_label values are the same, return that value.
    node_classes = np.unique(sdata[class_label])
    if len(node_classes) == 1:
        return node_classes[0]

    # Too few examples to split further: predict this node's own majority class.
    if len(sdata) <= min_samples:
        return _majority_class(sdata[class_label])

    # If the feature space is empty, terminate recursion.
    features = list(features)
    if not features:
        if parent_node_class is not None:
            return parent_node_class
        return _majority_class(sdata[class_label])

    # Majority class of this node, handed down to the children.
    parent_node_class = _majority_class(sdata[class_label])

    # Score every candidate feature (not just the last one) and keep the one
    # with the largest information gain. Arguments follow InfoGain's signature:
    # (examples, labels, feature name).
    gains = [InfoGain(sdata, sdata[class_label], feature) for feature in features]
    best_feature = features[int(np.argmax(gains))]

    # Create the tree structure as a nested dictionary.
    tree = {best_feature:{}}

    # Remove the feature with the best info gain.
    remaining_features = [name for name in features if name != best_feature]

    # Form subtrees below this node by calling ID3 once per observed value.
    for value in np.unique(sdata[best_feature]):
        # Boolean-mask selection keeps the column dtypes intact; rows with
        # missing values are still discarded, as before.
        sub_data = sdata[sdata[best_feature] == value].dropna()
        tree[best_feature][value] = ID3(
            tdata, sub_data, remaining_features, class_label, parent_node_class, min_samples
        )
    return tree

In [None]:
# Show the column labels of the training frame (features first, class label last).
tdata.columns[0:]

In [None]:
# Every column except the last (the class label) is a candidate split feature.
feature_names = tdata.columns[:-1]

# Grow the tree on the full training frame and pretty-print the nested dict.
tree = ID3(tdata, tdata, feature_names)
pprint(tree)

In [None]:
#Predict the result
def predict(query,tree,default=1):
    """Classify one example by walking a nested-dict decision tree.

    Parameters
    ----------
    query : dict
        Maps feature name -> feature value for the example.
    tree : dict
        Tree of the form ``{feature: {value: subtree_or_label, ...}}``.
    default : object
        Returned when the tree has no branch for the example's value, or when
        none of the query's features appears at the current node.

    Returns
    -------
    object
        The predicted class label, or ``default``.
    """
    for key in query:
        if key not in tree:
            continue
        try:
            result = tree[key][query[key]]
        except KeyError:
            # Feature value never seen on this branch during training.
            return default
        if isinstance(result, dict):
            # Keep the caller's default while descending into the subtree.
            return predict(query, result, default)
        return result
    # No feature of the query matches this node.
    return default

In [None]:
def test(data,tree, class_label="profitable", default=1.0):
   """Print and return the prediction accuracy (in percent) of ``tree`` on ``data``.

   Parameters
   ----------
   data : pandas.DataFrame
       Examples including the ground-truth column ``class_label``.
   tree : dict
       Decision tree in the nested-dict form consumed by ``predict``.
   class_label : str
       Name of the ground-truth column.
   default : object
       Prediction used when the tree has no branch for an example.

   Returns
   -------
   float
       Percentage of rows whose prediction equals the ground truth
       (``nan`` when ``data`` is empty).
   """
   # Drop the label by name so the queries contain features only, wherever the
   # label column happens to sit.
   queries = data.drop(columns=[class_label]).to_dict(orient="records")

   # Build all predictions at once and give them the same index as ``data`` so
   # the comparison below is row-aligned even without a 0..n-1 index.
   predicted = pd.Series(
       [predict(query, tree, default) for query in queries],
       index=data.index,
       dtype=object,
   )

   # Calculation of accuracy.
   n_rows = len(data)
   n_correct = int((predicted == data[class_label]).sum())
   accuracy = (n_correct / n_rows) * 100 if n_rows else float("nan")
   print("The Prediction accuracy is:",accuracy,'%')
   return accuracy

In [None]:
# Evaluate the trained tree on the held-out rows and report the accuracy.
test(data=testdata, tree=tree)

#Form the decision tree class by embedding the train and test methods and evaluate the classifier by forming confusion matrix

#Use scikit-learn library to form the decision tree classifier and evaluate its performance