<a href="https://colab.research.google.com/github/MXMxRazer/Machine-Learning-Models/blob/main/'Decision%20Tree'/'Iterative%20Dichotomiser%203%20(ID3)'/Implementation/Python/ID3.ipynbID3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
# Load the weather table from a CSV file in the current working directory.
dataset = pd.read_csv('./weather.csv')
# Name of the target column used by the entropy and tree-building functions below.
label = 'Play'
# Distinct values that occur in the target column.
class_list = dataset[label].unique()

In [None]:
def calc_total_entrophy(train_data, label, class_list):
  """Return the base-2 Shannon entropy of the label column of `train_data`.

  Args:
    train_data: DataFrame holding the rows to measure.
    label: Name of the target column in `train_data`.
    class_list: Iterable of class values to account for.

  Returns:
    The sum over `class_list` of -p * log2(p), where p is the share of rows
    in `train_data` whose label equals that class. Returns 0 when
    `train_data` has no rows.
  """
  # Count the rows that were actually passed in rather than reading the
  # notebook-level `dataset`; callers may hand over a subset of the data.
  total_row = train_data.shape[0]
  total_entrophy = 0
  if total_row == 0:
    return total_entrophy

  for c in class_list:
    total_class_count = train_data[train_data[label] == c].shape[0]
    # A class that is absent from these rows contributes nothing. Skipping it
    # avoids evaluating 0 * log2(0), which yields NaN.
    if total_class_count == 0:
      continue
    class_probability = total_class_count / total_row
    total_entrophy -= class_probability * np.log2(class_probability)

  return total_entrophy


In [None]:
# Entropy of the target column measured over the whole dataset.
print(f"Total Entrophy: {calc_total_entrophy(dataset, label, class_list)}")

Total Entrophy: 0.9402859586706311


In [None]:
def calc_entrophy(feature_data, label, class_list):
  """Return the base-2 entropy of the label column within `feature_data`.

  Args:
    feature_data: DataFrame containing the rows to measure.
    label: Name of the target column.
    class_list: Iterable of class values to account for.

  Returns:
    Sum of -p * log2(p) over the classes that occur in `feature_data`;
    classes with no rows are skipped, so an empty frame yields 0.
  """
  n_rows = feature_data.shape[0]
  labels = feature_data[label]
  result = 0

  for cls in class_list:
    hits = (labels == cls).sum()
    if hits == 0:
      # Absent classes add nothing and would otherwise hit log2(0).
      continue
    share = hits / n_rows
    result -= share * np.log2(share)

  return result


In [None]:
# Entropy of the target column restricted to the rows where Outlook is 'Sunny'.
print(f"Entrophy: {calc_entrophy(dataset[dataset['Outlook'] == 'Sunny'], label, class_list)}")

Entrophy: 0.9709505944546686


In [None]:
def calc_info_gain(feature_name, train_data, label, class_list):
  """Return the information gain obtained by splitting on `feature_name`.

  Args:
    feature_name: Column of `train_data` to split on.
    train_data: DataFrame with the rows under consideration.
    label: Name of the target column.
    class_list: Iterable of class values.

  Returns:
    The entropy of `train_data` minus the size-weighted sum of the entropies
    of the subsets produced by each distinct value of `feature_name`.
  """
  n_rows = train_data.shape[0]
  column = train_data[feature_name]

  def weighted_entrophy(value):
    # Entropy of one branch, scaled by the share of rows that fall into it.
    subset = train_data[column == value]
    return (subset.shape[0] / n_rows) * calc_entrophy(subset, label, class_list)

  remainder = sum(weighted_entrophy(value) for value in column.unique())
  return calc_total_entrophy(train_data, label, class_list) - remainder


In [None]:
def most_informative_feature(train_data, label, class_list):
  """Return the feature column with the highest information gain.

  Args:
    train_data: DataFrame with feature columns plus the `label` column.
    label: Name of the target column (excluded from the candidates).
    class_list: Iterable of class values.

  Returns:
    The name of the first column whose gain is strictly greater than every
    gain seen before it, or None if no column has a gain above -1.
  """
  best_feature, best_gain = None, -1

  for candidate in train_data.columns.drop(label):
    gain = calc_info_gain(candidate, train_data, label, class_list)
    # Strict comparison: on a tie the earlier column is kept.
    if gain > best_gain:
      best_feature, best_gain = candidate, gain

  return best_feature

In [None]:
# Number of rows per distinct 'Outlook' value, without sorting by frequency.
feature_dict = dataset['Outlook'].value_counts(sort=False)

# Print each Outlook value together with its row count.
for feature, count in feature_dict.items():
  print(f"Feature: {feature} Count: {count}")

Feature: Sunny Count: 5
Feature: Overcast Count: 4
Feature: Rain Count: 5


In [None]:
def generate_sub_tree(feature_name, train_data, label, class_list):
  """Build one tree level for `feature_name` and drop the rows it resolves.

  Args:
    feature_name: Column to branch on.
    train_data: DataFrame with the rows reaching this node.
    label: Name of the target column.
    class_list: Iterable of class values.

  Returns:
    A tuple `(remaining, tree)`. `tree` maps each value of `feature_name` to
    a class when every row with that value carries that class, and to '?'
    otherwise. `remaining` is `train_data` without the rows of the values
    that were mapped to a class.
  """
  tree = {}
  value_counts = train_data[feature_name].value_counts(sort=False)

  for value, n_rows in value_counts.items():
    subset_labels = train_data.loc[train_data[feature_name] == value, label]

    # Classes that account for every row of this branch.
    pure_classes = [c for c in class_list if (subset_labels == c).sum() == n_rows]

    if pure_classes:
      # If several classes qualify, the last one in class_list is kept.
      tree[value] = pure_classes[-1]
      train_data = train_data[train_data[feature_name] != value]
    else:
      tree[value] = '?'

  return train_data, tree

In [None]:
# Try generate_sub_tree on the Sunny-only rows, branching on 'Outlook'.
# Note: this binds the notebook-level names `train_data` and `tree`.
train_data, tree = generate_sub_tree('Outlook', dataset[dataset['Outlook'] == 'Sunny'], label, class_list)
# Show the rows that remain and the generated level of the tree.
print(f"Train Data: {train_data} \n Tree: {tree}")

Train Data:    Outlook Temperature Humidity    Wind Play
0    Sunny         Hot     High    Weak   No
1    Sunny         Hot     High  Strong   No
7    Sunny        Mild     High    Weak   No
8    Sunny        Cool   Normal    Weak  Yes
10   Sunny        Mild   Normal  Strong  Yes 
 Tree: {'Sunny': '?'}


In [None]:
def create_tree(root, prev_feature_name, train_data, label, class_list):
  """Recursively grow the decision tree in place inside `root`.

  Args:
    root: Dict that receives the new node.
    prev_feature_name: Key of `root` to attach the new node under (the
      feature value of the branch being expanded), or None for the
      top-level call, in which case the node is stored directly in `root`.
    train_data: DataFrame with the rows that reach this node.
    label: Name of the target column.
    class_list: Iterable of class values.

  Returns:
    None. `root` is modified in place; nothing is added when `train_data`
    has no rows.
  """
  rows_before_split = train_data.shape[0]
  # Nothing to split on an empty subset. The guard has to cover the whole
  # body: everything below depends on the feature chosen from these rows.
  if rows_before_split == 0:
    return

  max_info_feature = most_informative_feature(train_data, label, class_list)
  if max_info_feature is None:
    # No feature could be selected (for example only the label column is
    # left), so close the branch with the most frequent class.
    if prev_feature_name is not None:
      root[prev_feature_name] = train_data[label].mode().iloc[0]
    return

  train_data, tree = generate_sub_tree(max_info_feature, train_data, label, class_list)

  # Attach the new level either under the branch being expanded or at the top.
  if prev_feature_name is not None:
    root[prev_feature_name] = {max_info_feature: tree}
  else:
    root[max_info_feature] = tree

  # Iterate over a snapshot: the recursive calls replace entries of `tree`.
  for node, branch in list(tree.items()):
    if branch != "?":
      continue
    feature_value_data = train_data[train_data[max_info_feature] == node]
    if feature_value_data.shape[0] == rows_before_split:
      # The split sent every row down this single branch, so recursing would
      # repeat the exact same call without end. Use the most frequent class.
      tree[node] = feature_value_data[label].mode().iloc[0]
      continue
    create_tree(tree, node, feature_value_data, label, class_list)

In [None]:
def id3(train_data, label):
    """Build an ID3 decision tree for `train_data` and return it as nested dicts.

    Args:
        train_data: DataFrame with feature columns plus the `label` column.
        label: Name of the target column.

    Returns:
        The dict that `create_tree` filled in, starting from an empty dict.
    """
    decision_tree = {}
    # The classes are the distinct values found in the target column.
    create_tree(decision_tree, None, train_data, label, train_data[label].unique())
    return decision_tree

In [None]:
# Build the tree for the full dataset with 'Play' as the target column.
tree = id3(dataset, 'Play')
# Serialise the nested dicts as indented JSON text for display.
tree_json = json.dumps(tree, indent=4)

In [None]:
# Show the learned tree.
print(tree_json)

{
    "Outlook": {
        "Sunny": {
            "Humidity": {
                "High": "No",
                "Normal": "Yes"
            }
        },
        "Overcast": "Yes",
        "Rain": {
            "Wind": {
                "Weak": "Yes",
                "Strong": "No"
            }
        }
    }
}
