<a href="https://colab.research.google.com/github/Hungtran-pro/codeCoursera/blob/main/Decision_Tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the play-tennis table from a hard-coded absolute path under /content/drive.
# NOTE(review): presumably this needs Google Drive mounted in Colab first; no mount cell is visible here - confirm.
df = pd.read_csv("/content/drive/MyDrive/AI projects/AI NAVER/play_tennis.csv")

In [None]:
# Show the loaded frame as the cell output so its columns and rows can be inspected.
df

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [None]:
def calc_entropy(feature_data, label, class_list):
  '''
  Entropy of the `label` column of `feature_data`.

  Entropy(S) = sum over classes of (-p_i * log2(p_i)), where p_i is the
  share of rows whose label equals class i.

  feature_data: DataFrame holding the rows to measure
  label:        name of the column that stores the class
  class_list:   iterable of class values to account for

  Classes with no matching row add nothing to the sum, so an empty frame
  yields 0.0.
  '''
  n_rows = feature_data.shape[0]
  label_column = feature_data[label]
  result = 0.0

  for cls in class_list:
    n_cls = int((label_column == cls).sum())
    if n_cls == 0:
      # log2(0) is undefined; an absent class simply contributes zero
      continue
    share = n_cls / n_rows
    result += - share * np.log2(share)

  return result

In [None]:
def calc_IG(feature_name, train_data, label, class_list):
    '''
    Information gain obtained by splitting `train_data` on `feature_name`.

    IG(S, A) = Entropy(S) - sum over values v of A of (|S_v| / |S|) * Entropy(S_v)

    feature_name: column to split on
    train_data:   DataFrame containing the feature and label columns
    label:        name of the class column
    class_list:   class values forwarded to calc_entropy
    '''
    n_rows = train_data.shape[0]
    weighted_entropy = 0.0

    for value in train_data[feature_name].unique():
        subset = train_data[train_data[feature_name] == value]
        subset_entropy = calc_entropy(subset, label, class_list)
        weight = subset.shape[0] / n_rows  # fraction of rows taking this value
        weighted_entropy += weight * subset_entropy

    return calc_entropy(train_data, label, class_list) - weighted_entropy

In [None]:
def find_most_IG_feature(train_data, label, class_list):
  '''
  Pick the column of `train_data` (other than `label`) with the highest
  information gain.

  Columns are scanned in frame order and only a strictly larger gain
  replaces the current winner, so the earliest column wins a tie.
  Returns None when the frame has no column besides `label`.
  '''
  winner = None
  winner_gain = -1

  # every column except the label is a candidate split
  for candidate in train_data.columns.drop(label):
    gain = calc_IG(candidate, train_data, label, class_list)
    if gain > winner_gain:
      winner, winner_gain = candidate, gain

  return winner

In [None]:
def generate_sub_tree(feature_name, train_data, label, class_list):
  '''
  Build one level of the tree for the column `feature_name`.

  Every distinct value of the column becomes a key of the returned dict:
    * if all rows holding that value share one class, the key maps to that
      class (a leaf) and those rows are removed from the returned data;
    * otherwise the key maps to the placeholder "?", meaning the branch
      still has to be expanded, and its rows are kept.

  feature_name: column to branch on
  train_data:   DataFrame with the feature and label columns (the caller's
                frame is not modified; a filtered frame is returned)
  label:        name of the class column
  class_list:   iterable of class values to test for purity

  Returns (tree, remaining_data).
  '''
  features_count_dict = train_data[feature_name].value_counts(sort=False)
  tree = dict()
  # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
  for feature, count in features_count_dict.items():
    if count == 0:
      # value_counts reports unobserved categories of a categorical column
      # with a count of 0; there are no rows to classify, so add no branch.
      continue
    feature_data = train_data[train_data[feature_name] == feature] #Rows holding this feature value
    assigned_to_node = False
    for c in class_list:
      total_c = feature_data[feature_data[label] == c].shape[0]

      if total_c == count:
        # Pure branch: every row with this value belongs to class c.
        tree[feature] = c
        train_data = train_data[train_data[feature_name] != feature] #Drop the rows that are now classified
        assigned_to_node = True
        break  # no other class can match once one class covers all rows
    if not assigned_to_node:
      tree[feature] = "?"
  return tree, train_data

In [None]:
def make_tree(root, parent_node, train_data, label, class_list):
  '''
  Recursively grow the decision tree in place inside `root`.

  root:        dict that receives the new sub tree
  parent_node: key of `root` to expand (a feature value), or None when the
               top of the tree is being built
  train_data:  DataFrame with the rows that reach this node
  label:       name of the class column
  class_list:  iterable of class values

  Nothing is returned; `root` is mutated. An empty `train_data` leaves
  `root` untouched.

  Each recursive call receives the branch rows without the column that was
  just used to split. That column is constant inside the branch, so it can
  never separate the rows further; leaving it in lets it be selected again
  on an information-gain tie, which reproduces the same impure subset and
  recurses without end. When no feature column is left and the rows still
  disagree on the class, the branch becomes a leaf holding the most
  frequent class.
  '''
  if train_data.shape[0] == 0:
    return

  best_feature = find_most_IG_feature(train_data, label, class_list)
  if best_feature is None:
    # No feature column left to split on: fall back to the majority class
    # (value_counts orders by frequency; idxmax takes the first maximum).
    if parent_node is not None:
      root[parent_node] = train_data[label].value_counts().idxmax()
    return

  tree, train_data = generate_sub_tree(best_feature, train_data, label, class_list)

  if parent_node is not None:
    # Replace the "?" placeholder with {feature: {value: ...}}
    root[parent_node] = {best_feature: tree}
  else:
    root[best_feature] = tree
  next_root = tree

  # Snapshot the items because the recursive calls rewrite next_root's values.
  for node, branch in list(next_root.items()):
    if branch == "?":
      feature_data = train_data[train_data[best_feature] == node]
      # The split column is exhausted for this branch; remove it so the
      # recursion depth is bounded by the number of feature columns.
      feature_data = feature_data.drop(columns=best_feature)
      make_tree(next_root, node, feature_data, label, class_list)

In [None]:
#Create a tree
def pre_build(df, label):
  '''
  Entry point: build and return the decision tree for `df`.

  df:    DataFrame containing the feature columns and the class column
  label: name of the class column

  The work is done on a copy of `df`, and the distinct values found in the
  label column are used as the class list. The tree is returned as a
  nested dict.
  '''
  working_copy = df.copy()
  # e.g. for "Play Tennis" the distinct labels are "Yes" and "No"
  distinct_classes = working_copy[label].unique()

  decision_tree = {}
  make_tree(decision_tree, None, working_copy, label, distinct_classes)
  return decision_tree

In [None]:
# Build the tree with "Play Tennis" as the class column; the returned nested dict is shown as the cell output.
pre_build(df, "Play Tennis")

{'Outlook': {'Overcast': 'Yes',
  'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},
  'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}