## Importing Libraries and dataset

In [1]:
import numpy as np
import pandas as pd

# Sample data: one row per day; 'Decision' is the class label and the other
# columns (apart from the 'Day' row number) are the candidate split attributes.
data = {'Day':[1,2,3,4,5,6,7,8,9,10,11,12,13,14],
        'Outlook':['Sunny','Sunny','Overcast','Rain','Rain','Rain','Overcast','Sunny','Sunny','Rain','Sunny','Overcast','Overcast','Rain'],
        'Temp.':['Hot','Hot','Hot','Mild','Cool','Cool','Cool','Mild','Cool','Mild','Mild','Mild','Hot','Mild'],
        'Humidity':['High','High','High','High','Normal','Normal','Normal','High','Normal','Normal','Normal','High','Normal','High'],
        'Wind':['Weak','Strong','Weak','Weak','Weak','Strong','Strong','Weak','Weak','Weak','Strong','Strong','Weak','Strong'],
        'Decision':['No','No','Yes','Yes','Yes','No','Yes','No','Yes','Yes','Yes','Yes','Yes','No']
       }

df = pd.DataFrame(data)

# Overwrite the CSV and include the header row so that re-running this cell
# always produces the same file. Appending without a header (the previous
# behaviour) added 14 duplicate, unlabeled rows on every execution.
df.to_csv('Sample_DT.csv',index=False)
df

Unnamed: 0,Day,Outlook,Temp.,Humidity,Wind,Decision
0,1,Sunny,Hot,High,Weak,No
1,2,Sunny,Hot,High,Strong,No
2,3,Overcast,Hot,High,Weak,Yes
3,4,Rain,Mild,High,Weak,Yes
4,5,Rain,Cool,Normal,Weak,Yes
5,6,Rain,Cool,Normal,Strong,No
6,7,Overcast,Cool,Normal,Strong,Yes
7,8,Sunny,Mild,High,Weak,No
8,9,Sunny,Cool,Normal,Weak,Yes
9,10,Rain,Mild,Normal,Weak,Yes


## Entropy

In [2]:
def cal_Entropy(dataframe,attr):
    """Return the Shannon entropy (base 2) of column `attr` in `dataframe`.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows to measure.
    attr : str
        Name of the column whose value distribution is measured.

    Returns
    -------
    float
        Sum of -p * log2(p) over the distinct values of the column, where p is
        the value's count divided by the number of rows. Returns 0 for a frame
        without rows.

    Notes
    -----
    `value_counts` leaves missing values out by default, while the divisor is
    the full row count, so rows with a missing `attr` lower every probability.
    """
    entropy = 0
    All_sum = dataframe.shape[0]

    for count in dataframe[attr].value_counts():
        # Categorical columns report unused categories with a count of 0;
        # 0 * log2(0) evaluates to NaN and would poison the whole sum, so the
        # term is skipped (its mathematical limit is 0).
        if count == 0:
            continue
        probability = count/All_sum
        entropy += -(probability)*(np.log2(probability))

    return entropy

In [3]:
def split(dataframe,attr):
    """Partition `dataframe` by the distinct values of column `attr`.

    Returns a list of sub-frames, one per distinct value, in the order the
    values first appear in the column. Each sub-frame keeps every column and
    the original row index.
    """
    column = dataframe[attr]
    return [dataframe[column == value] for value in column.unique()]

## Information Gain

In [4]:
def Info_gain(dataframe,start,attr):
    """Return the information gain on column `start` from splitting on `attr`.

    The gain is the entropy of `start` over the whole frame minus the
    size-weighted entropy of `start` inside each partition produced by
    `split(dataframe, attr)`.
    """
    total_rows = dataframe.shape[0]
    remaining = cal_Entropy(dataframe,start)

    # Subtract each partition's share one at a time, in partition order.
    for partition in split(dataframe,attr):
        weight = partition.shape[0]/total_rows
        remaining -= weight * (cal_Entropy(partition,start))

    return remaining

In [5]:
# Example: gain on the class column 'Decision' obtained by splitting the sample frame on 'Wind'.
print("Information Gain of Decision upon Wind : ",Info_gain(df,'Decision','Wind'))

Information Gain of Decision upon Wind :  0.048127030408269544


## Finding the Most Dominant Attribute

In [6]:
def best_feature(dataframe, target='Decision', ignore=('Day',)):
    """Pick the attribute with the highest information gain on `target`.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows to evaluate; must contain the `target` column.
    target : str, default 'Decision'
        Name of the class-label column.
    ignore : str or iterable of str, default ('Day',)
        Columns that are never considered as split candidates. 'Day' is a
        unique row number in the sample data, so it would trivially win every
        comparison. Names that are not present in the frame are skipped
        silently instead of raising.

    Returns
    -------
    tuple
        (attribute name, gain) of the best candidate. When no candidate has a
        gain strictly greater than 0 (including when there are no candidates),
        the sentinel (0, 0) is returned; DT_Nodes relies on that exact value
        to detect a leaf.

    Raises
    ------
    ValueError
        If `target` is not a column of `dataframe`.
    """
    if target not in dataframe.columns:
        raise ValueError("target column %r not found in dataframe" % (target,))

    # A bare string would otherwise be treated as a collection of characters.
    if isinstance(ignore, str):
        ignore = (ignore,)
    skipped = set(ignore)

    candidates = [column for column in dataframe.columns
                  if column != target and column not in skipped]

    max_info_gain = 0
    best_attribute = 0

    for attribute in candidates:
        gain = Info_gain(dataframe, target, attribute)

        # Strict comparison: on ties the earliest column wins, and a gain of
        # exactly 0 never replaces the (0, 0) sentinel.
        if gain > max_info_gain:
            max_info_gain = gain
            best_attribute = attribute

    return best_attribute, max_info_gain

In [7]:
def New_Data(dataset,best_attr):
    """Split `dataset` on `best_attr` and hand both back to the caller.

    Returns a tuple of (list of sub-frames from `split`, `best_attr`).
    """
    return split(dataset,best_attr), best_attr

## First, Second and Third Level DT Nodes

In [8]:

def _print_branches(frame, attr, level, max_level):
    """Print every branch of `frame` split on `attr`, recursing up to `max_level`.

    For each branch the split attribute is dropped and the best remaining
    attribute is reported. A branch without any positive-gain attribute is
    reported as a leaf together with its decision.
    """
    branches, attr = New_Data(frame, attr)

    for number, branch in enumerate(branches, start=1):
        print("Level - %d." % level, number, " - When ", attr, " is ", branch[attr].values[0], "\n")

        remaining = branch.drop(attr, axis=1)
        # Evaluate once and reuse; the gain computation walks the whole frame.
        best = best_feature(remaining)

        if best == (0, 0):
            print("   Best feature with gain value : None \n")
            # Read the label by column name and take the most frequent value.
            # Picking "second row, N-th column" by position breaks on one-row
            # leaves and whenever the number of columns changes.
            print("      Decision is : ", remaining['Decision'].mode().iloc[0], "\n")
        else:
            print("   Best feature with gain value :", best, "\n")
            if level < max_level:
                _print_branches(remaining, best[0], level + 1, max_level)


def DT_Nodes(df, depth = 0):
    """Print the top levels of the decision tree built from `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; expected to contain the columns `best_feature` uses
        ('Decision', and 'Day' if present).
    depth : int, default 0
        0 prints only the root node, 1 prints the root and the second level,
        and any other value prints three levels. Deeper levels are not
        expanded.

    Returns
    -------
    None
        The tree is written to standard output.

    Notes
    -----
    If no attribute has positive gain at the root, only the root line is
    printed and the function returns, because there is nothing to split on.
    """
    root = best_feature(df)

    # Level - 1
    print("Level - 1 (root node): ",root,"\n")

    if depth == 0 or root == (0, 0):
        return

    max_level = 2 if depth == 1 else 3
    _print_branches(df, root[0], 2, max_level)

In [9]:
# Print the tree for the sample frame down to its third level (depth values other than 0 and 1 select this path).
DT_Nodes(df, depth = 2)

Level - 1 (root node):  ('Outlook', 0.24674981977443933) 

Level - 2. 1  - When  Outlook  is  Sunny 

   Best feature with gain value : ('Humidity', 0.9709505944546686) 

Level - 3. 1  - When  Humidity  is  High 

   Best feature with gain value : None 

      Decision is :  No 

Level - 3. 2  - When  Humidity  is  Normal 

   Best feature with gain value : None 

      Decision is :  Yes 

Level - 2. 2  - When  Outlook  is  Overcast 

   Best feature with gain value : None 

      Decision is :  Yes 

Level - 2. 3  - When  Outlook  is  Rain 

   Best feature with gain value : ('Wind', 0.9709505944546686) 

Level - 3. 1  - When  Wind  is  Weak 

   Best feature with gain value : None 

      Decision is :  Yes 

Level - 3. 2  - When  Wind  is  Strong 

   Best feature with gain value : None 

      Decision is :  No 

