# ID3

In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
import random

In [2]:
# Load the raw table from CSV.
# NOTE(review): this is an absolute path on one specific machine; the notebook
# cannot be re-run elsewhere until it points at a local copy of dataset.csv.
dataset = pd.read_csv('C:\\Users\\mdere\\source\\repos\\IT-master\\Python\\Code\\Example_4_Tree\\dataset.csv')
# Remove the 'Name' column so it is not part of the frame used in the cells below.
dataset=dataset.drop('Name',axis=1)

In [3]:
print(dataset)

        TextEditor ProgrammingLanguage   Drink       OS
0              vim              Python     tea  windows
1          jupyter              Kotlin  coffee  windows
2          jupyter              Python     tea      mac
3          jupyter              Python     tea      mac
4          jupyter              Python  coffee      mac
5          jupyter              Python  coffee      mac
6          jupyter              Python  coffee  windows
7              vim              Python     tea     unix
8          jupyter              Python  coffee  windows
9   Android Studio              Kotlin     tea  windows
10         jupyter              Python     tea  windows
11         pycharm              Python  coffee  windows
12          matlab              matlab  coffee  windows
13         pycharm              Python     tea     unix
14         pycharm                Java     tea  windows
15       vs studio                 c++     tea  windows
16  vs studio code                  c#  coffee  

In [4]:
def entropy(target_col):
    """Return the Shannon entropy, in bits, of the values in ``target_col``.

    Parameters
    ----------
    target_col : array-like
        Sequence of labels; anything ``np.unique`` accepts.

    Returns
    -------
    numpy.float64
        ``sum(-p * log2(p))`` over the relative frequency ``p`` of each
        distinct value. An empty input yields ``0.0``.
    """
    _, occurrences = np.unique(target_col, return_counts=True)
    # Relative frequency of every distinct label.
    probabilities = occurrences / np.sum(occurrences)
    return np.sum(-probabilities * np.log2(probabilities))

In [5]:
entropy(dataset['ProgrammingLanguage'])

2.0814963295286755

In [6]:
def InfoGain(data,split_attribute_name,target_name="class"):
    """Return the information gain of splitting ``data`` on one attribute.

    The gain is the entropy of the target column minus the weighted average
    entropy of the target within each distinct value of the split attribute.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both the split attribute and the target column.
    split_attribute_name : str
        Column whose distinct values define the partitions.
    target_name : str, default "class"
        Column whose entropy is measured.

    Returns
    -------
    numpy.float64
        ``entropy(target) - sum(weight_v * entropy(target | attribute == v))``.
    """
    total_entropy = entropy(data[target_name])

    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    weights = counts / np.sum(counts)

    # Select each partition with a boolean mask on the target column only.
    # ``data.where(mask).dropna()`` would also discard matching rows that have a
    # missing value in any unrelated column, so the per-branch entropy would be
    # computed on fewer rows than the branch weight accounts for.
    weighted_entropy = np.sum([
        weight * entropy(data.loc[data[split_attribute_name] == val, target_name])
        for val, weight in zip(vals, weights)
    ])

    return total_entropy - weighted_entropy

In [7]:
InfoGain(dataset, 'ProgrammingLanguage', 'OS')

0.4110008528178223

In [8]:
def _majority_class(labels):
    """Return the most frequent value in ``labels`` (first in sorted order on ties)."""
    values, counts = np.unique(labels, return_counts=True)
    return values[np.argmax(counts)]


def ID3(data, originaldata, features, target_attribute_name="class", parent_node_class = None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows that reach the current node.
    originaldata : pandas.DataFrame
        The full training table; its majority class is the fallback label when
        ``data`` is empty.
    features : sequence of str
        Candidate column names that may still be used for splitting.
    target_attribute_name : str, default "class"
        Name of the label column.
    parent_node_class : optional
        Majority class of the parent node; returned when no features remain.

    Returns
    -------
    dict or label
        ``{feature: {value: subtree_or_label, ...}}`` for an internal node, or
        a single label for a leaf.
    """
    # The empty check must come first: an empty frame has zero distinct labels,
    # so checking purity first would try to index an empty array.
    if len(data) == 0:
        return _majority_class(originaldata[target_attribute_name])

    classes = np.unique(data[target_attribute_name])
    if len(classes) == 1:
        return classes[0]

    if len(features) == 0:
        return parent_node_class

    parent_node_class = _majority_class(data[target_attribute_name])

    # Split on the feature with the highest information gain.
    gains = [InfoGain(data, feature, target_attribute_name) for feature in features]
    best_feature = features[np.argmax(gains)]

    tree = {best_feature: {}}
    remaining_features = [f for f in features if f != best_feature]

    for value in np.unique(data[best_feature]):
        # Boolean indexing keeps every matching row, including rows that have
        # missing values in other columns.
        sub_data = data[data[best_feature] == value]

        # Pass ``originaldata`` down rather than a notebook-level global, so the
        # fallback class always comes from the table the caller supplied.
        tree[best_feature][value] = ID3(
            sub_data,
            originaldata,
            remaining_features,
            target_attribute_name,
            parent_node_class,
        )

    return tree

In [9]:
def train_test_split(dataset, train_weight, random_state=None):
    """Shuffle ``dataset`` and split it into training and testing frames.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table to split; it is not modified.
    train_weight : float
        Fraction of rows assigned to the training set. The row count is
        ``int(len(dataset) * train_weight)``, i.e. rounded down.
    random_state : int or numpy random generator, optional
        Seed forwarded to ``DataFrame.sample`` so the shuffle can be
        reproduced. ``None`` (the default) gives a different shuffle each call.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(training_data, testing_data)``, each re-indexed from 0.
    """
    weight = int(len(dataset) * train_weight)
    # frac=1 returns every row in random order.
    shuffled = dataset.sample(frac=1, random_state=random_state)
    training_data = shuffled.iloc[:weight].reset_index(drop=True)
    testing_data = shuffled.iloc[weight:].reset_index(drop=True)
    return training_data, testing_data
# Shuffle the rows and keep the first 70% (rounded down) for training; the rest
# becomes the test set. No seed is passed, so the split changes on every run.
training_data, testing_data = train_test_split(dataset, 0.7)

In [10]:
print(training_data)

        TextEditor ProgrammingLanguage   Drink       OS
0        vs studio                 c++     tea     unix
1   vs studio code                 c++  coffee  windows
2              vim              Python     tea     unix
3        vs studio                 c++     tea  windows
4          jupyter              Python     tea      mac
5          jupyter              Python     tea  windows
6   Android Studio              Kotlin     tea  windows
7          jupyter              Kotlin  coffee  windows
8          jupyter              Python  coffee  windows
9          pycharm                   c     tea  windows
10          matlab              matlab  coffee  windows
11         pycharm              Python     tea     unix
12         jupyter              Python  coffee      mac
13             vim              Python     tea  windows


In [11]:
print(testing_data)

       TextEditor ProgrammingLanguage   Drink       OS
0         pycharm              Python  coffee  windows
1         jupyter              Python  coffee      mac
2         pycharm                Java     tea  windows
3         jupyter              Python  coffee  windows
4  vs studio code                  c#  coffee     unix
5         jupyter              Python     tea      mac


In [12]:
training_data.columns[:-1]

Index(['TextEditor', 'ProgrammingLanguage', 'Drink'], dtype='object')

In [15]:
# Grow the tree on the training split: every column except the last is a
# candidate feature and 'OS' is the target to predict.
tree = ID3(training_data, training_data, training_data.columns[:-1], 'OS')

In [16]:
pprint(tree)

{'TextEditor': {'Android Studio': 'windows',
                'jupyter': {'ProgrammingLanguage': {'Kotlin': 'windows',
                                                    'Python': {'Drink': {'coffee': 'mac',
                                                                         'tea': 'mac'}}}},
                'matlab': 'windows',
                'pycharm': {'ProgrammingLanguage': {'Python': 'unix',
                                                    'c': 'windows'}},
                'vim': {'ProgrammingLanguage': {'Python': {'Drink': {'tea': 'unix'}}}},
                'vs studio': {'ProgrammingLanguage': {'c++': {'Drink': {'tea': 'unix'}}}},
                'vs studio code': 'windows'}}


In [17]:
def predict(query, tree, default = 1):
    """Classify one record by walking a nested-dictionary decision tree.

    Parameters
    ----------
    query : dict
        Maps feature name to the record's value for that feature.
    tree : dict
        Tree of the form ``{feature: {value: subtree_or_label, ...}}``.
    default : optional
        Returned whenever the walk cannot continue: the record's value has no
        branch in the tree, or none of the record's features appears at the
        current node.

    Returns
    -------
    The label stored at the leaf that was reached, or ``default``.
    """
    for key in query:
        if key not in tree:
            continue
        try:
            result = tree[key][query[key]]
        except (KeyError, TypeError):
            # The value was never seen on this branch during training (or it
            # cannot be used as a dictionary key).
            return default
        if isinstance(result, dict):
            # Forward ``default`` so a miss deeper in the tree returns the
            # caller's fallback rather than this function's own default.
            return predict(query, result, default)
        return result
    return default

In [18]:
def test(data, tree, target_attribute_name):
    """Print the percentage of rows in ``data`` that ``tree`` classifies correctly.

    Parameters
    ----------
    data : pandas.DataFrame
        Labelled rows to evaluate; must contain ``target_attribute_name``.
    tree : dict
        Decision tree in the nested-dictionary form consumed by ``predict``.
    target_attribute_name : str
        Name of the label column. It is excluded from the queries by name, so
        it does not have to be the last column.

    Returns
    -------
    None
        The accuracy is printed, not returned.
    """
    queries = data.drop(columns=[target_attribute_name]).to_dict(orient = "records")
    # 1.0 is the fallback for records the tree cannot classify; it will not
    # equal any string label, so such records count as wrong.
    predictions = [predict(query, tree, 1.0) for query in queries]
    actual = data[target_attribute_name].tolist()
    # Compare position by position so the result does not depend on the index
    # labels of ``data``.
    hits = np.sum([guess == truth for guess, truth in zip(predictions, actual)])
    print('The prediction accuracy is: ',(hits/len(data))*100,'%')

In [20]:
test(testing_data, tree, 'OS')

The prediction accuracy is:  33.33333333333333 %
