Write a program to demonstrate the working of the decision tree based ID3 algorithm. Use an appropriate data set for building the decision tree and apply this knowledge to classify a new sample.


# Import Data

In [19]:
import pandas as pd
from pandas import DataFrame 
df = pd.read_csv('/Users/julie/Desktop/computer_science/CS-Data-Science-Build-Week-1/data1.csv',delimiter=';',index_col = 0)
df.head()

Unnamed: 0_level_0,age,income,student,credit_rating,class:buys_computer
RID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,youth,high,no,fair,no
2,youth,high,no,excellent,no
3,middle_aged,high,no,fair,yes
4,senior,medium,no,fair,yes
5,senior,low,yes,fair,yes


In [20]:
df = df.rename(columns = {"class:buys_computer":"buy_computer"})

In [21]:
df.keys()[4]

'buy_computer'

# Entropy of the Training Data Set

In [22]:
#Function to calculate the entropy of probaility of observations
# -p*log2*p

def entropy(probs):  
    import math
    return sum( [-prob*math.log(prob, 2) for prob in probs] )

#Function to calulate the entropy of the given Data Sets/List with respect to target attributes
def entropy_of_list(a_list):  
    #print("A-list",a_list)
    from collections import Counter
    cnt = Counter(x for x in a_list)   # Counter calculates the propotion of class
   # print("\nClasses:",cnt)
    #print("No and Yes Classes:",a_list.name,cnt)
    num_instances = len(a_list)*1.0   # = 14
    print("\n Number of Instances of the Current Sub Class is {0}:".format(num_instances ))
    probs = [x / num_instances for x in cnt.values()]  # x means no of YES/NO
    print("\n Classes:",min(cnt),max(cnt))
    print(" \n Probabilities of Class {0} is {1}:".format(min(cnt),min(probs)))
    print(" \n Probabilities of Class {0} is {1}:".format(max(cnt),max(probs)))
    return entropy(probs) # Call Entropy :
    
# The initial entropy of the YES/NO attribute for our dataset.
print("\n  INPUT DATA SET FOR ENTROPY CALCULATION:\n", df['buy_computer'])

total_entropy = entropy_of_list(df['buy_computer'])

print("\n Total Entropy of buy_computer Data Set:",total_entropy)


  INPUT DATA SET FOR ENTROPY CALCULATION:
 RID
1      no
2      no
3     yes
4     yes
5     yes
6      no
7     yes
8      no
9     yes
10    yes
11    yes
12    yes
13    yes
14     no
Name: buy_computer, dtype: object

 Number of Instances of the Current Sub Class is 14.0:

 Classes: no yes
 
 Probabilities of Class no is 0.35714285714285715:
 
 Probabilities of Class yes is 0.6428571428571429:

 Total Entropy of buy_computer Data Set: 0.9402859586706309


# Information Gain of Attributes

In [10]:
def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    print("Information Gain Calculation of ",split_attribute_name)
    '''
    Takes a DataFrame of attributes, and quantifies the entropy of a target
    attribute after performing a split along the values of another attribute.
    '''
    # Split Data by Possible Vals of Attribute:
    df_split = df.groupby(split_attribute_name)
   # for name,group in df_split:
    #    print("Name:\n",name)
     #   print("Group:\n",group)
    
    # Calculate Entropy for Target Attribute, as well as
    # Proportion of Obs in Each Data-Split
    nobs = len(df.index) * 1.0
   # print("NOBS",nobs)
    df_agg_ent = df_split.agg({target_attribute_name : [entropy_of_list, lambda x: len(x)/nobs] })[target_attribute_name]
    #print([target_attribute_name])
    #print(" Entropy List ",entropy_of_list)
    #print("DFAGGENT",df_agg_ent)
    df_agg_ent.columns = ['Entropy', 'PropObservations']
    #if trace: # helps understand what fxn is doing:
     #   print(df_agg_ent)
    
    # Calculate Information Gain:
    new_entropy = sum( df_agg_ent['Entropy'] * df_agg_ent['PropObservations'] )
    old_entropy = entropy_of_list(df[target_attribute_name])
    return old_entropy - new_entropy


print('Info-gain for age is :'+str( information_gain(df, 'age', 'buy_computer')),"\n")
print('\n Info-gain for income is: ' + str( information_gain(df, 'income', 'buy_computer')),"\n")
print('\n Info-gain for student is:' + str( information_gain(df, 'student', 'buy_computer')),"\n")
print('\n Info-gain for credit_rating is:' + str( information_gain(df, 'credit_rating','buy_computer')),"\n")

Information Gain Calculation of  age

 Number of Instances of the Current Sub Class is 0.0:

 Number of Instances of the Current Sub Class is 4.0:

 Classes: yes yes
 
 Probabilities of Class yes is 1.0:
 
 Probabilities of Class yes is 1.0:

 Number of Instances of the Current Sub Class is 5.0:

 Classes: no yes
 
 Probabilities of Class no is 0.4:
 
 Probabilities of Class yes is 0.6:

 Number of Instances of the Current Sub Class is 5.0:

 Classes: no yes
 
 Probabilities of Class no is 0.4:
 
 Probabilities of Class yes is 0.6:

 Number of Instances of the Current Sub Class is 14.0:

 Classes: no yes
 
 Probabilities of Class no is 0.35714285714285715:
 
 Probabilities of Class yes is 0.6428571428571429:
Info-gain for age is :0.2467498197744391 

Information Gain Calculation of  income

 Number of Instances of the Current Sub Class is 0.0:

 Number of Instances of the Current Sub Class is 4.0:

 Classes: no yes
 
 Probabilities of Class no is 0.5:
 
 Probabilities of Class yes is 0

# ID3 Algorithm

In [11]:
def id3(df, target_attribute_name, attribute_names, default_class=None):
    
    ## Tally target attribute:
    from collections import Counter
    cnt = Counter(x for x in df[target_attribute_name])# class of YES /NO
    
    ## First check: Is this split of the dataset homogeneous?
    if len(cnt) == 1:
        return next(iter(cnt))  # next input data set, or raises StopIteration when EOF is hit.
    
    ## Second check: Is this split of the dataset empty?
    # if yes, return a default value
    elif df.empty or (not attribute_names):
        return default_class  # Return None for Empty Data Set
    
    ## Otherwise: This dataset is ready to be devied up!
    else:
        # Get Default Value for next recursive call of this function:
        default_class = max(cnt.keys()) #No of YES and NO Class
        # Compute the Information Gain of the attributes:
        gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names] #
        index_of_max = gainz.index(max(gainz)) # Index of Best Attribute
        # Choose Best Attribute to split on:
        best_attr = attribute_names[index_of_max]
        
        # Create an empty tree, to be populated in a moment
        tree = {best_attr:{}} # Iniiate the tree with best attribute as a node 
        remaining_attribute_names = [i for i in attribute_names if i != best_attr]
        
        # Split dataset
        # On each split, recursively call this algorithm.
        # populate the empty tree with subtrees, which
        # are the result of the recursive call
        for attr_val, data_subset in df.groupby(best_attr):
            subtree = id3(data_subset,
                        target_attribute_name,
                        remaining_attribute_names,
                        default_class)
            tree[best_attr][attr_val] = subtree
        return tree

# Predicting Attributes

In [13]:
# Get Predictor Names (all but 'class')
attribute_names = list(df.columns)
print("List of Attributes:", attribute_names) 
attribute_names.remove('buy_computer') #Remove the class attribute 
print("Predicting Attributes:", attribute_names)

List of Attributes: ['age', 'income', 'student', 'credit_rating', 'buy_computer']
Predicting Attributes: ['age', 'income', 'student', 'credit_rating']


# Tree Construction

In [14]:
# Run Algorithm:
from pprint import pprint
tree = id3(df,'buy_computer',attribute_names)
print("\n\nThe Resultant Decision Tree is :\n")
#print(tree)
pprint(tree)
attribute = next(iter(tree))
print("Best Attribute :\n",attribute)
print("Tree Keys:\n",tree[attribute].keys())

Information Gain Calculation of  age

 Number of Instances of the Current Sub Class is 0.0:

 Number of Instances of the Current Sub Class is 4.0:

 Classes: yes yes
 
 Probabilities of Class yes is 1.0:
 
 Probabilities of Class yes is 1.0:

 Number of Instances of the Current Sub Class is 5.0:

 Classes: no yes
 
 Probabilities of Class no is 0.4:
 
 Probabilities of Class yes is 0.6:

 Number of Instances of the Current Sub Class is 5.0:

 Classes: no yes
 
 Probabilities of Class no is 0.4:
 
 Probabilities of Class yes is 0.6:

 Number of Instances of the Current Sub Class is 14.0:

 Classes: no yes
 
 Probabilities of Class no is 0.35714285714285715:
 
 Probabilities of Class yes is 0.6428571428571429:
Information Gain Calculation of  income

 Number of Instances of the Current Sub Class is 0.0:

 Number of Instances of the Current Sub Class is 4.0:

 Classes: no yes
 
 Probabilities of Class no is 0.5:
 
 Probabilities of Class yes is 0.5:

 Number of Instances of the Current Su

# Classification Accuracy

In [15]:
def classify(instance, tree, default=None): # Instance of Play Tennis with Predicted 
    
    #print("Instance:",instance)
    attribute = next(iter(tree)) # Outlook/Humidity/Wind       
    print("Key:",tree.keys())  # [Outlook,Humidity,Wind ]
    print("Attribute:",attribute) # [Key /Attribute Both are same ]
   
    # print("Insance of Attribute :",instance[attribute],attribute)
    if instance[attribute] in tree[attribute].keys(): # Value of the attributs in  set of Tree keys  
        result = tree[attribute][instance[attribute]]
        print("Instance Attribute:",instance[attribute],"TreeKeys :",tree[attribute].keys())
        if isinstance(result, dict): # this is a tree, delve deeper
            return classify(instance, result)
        else:
            return result # this is a label
    else:
        return default

In [17]:
df['predicted'] = df.apply(classify, axis=1, args=(tree,'No') ) 
    # classify func allows for a default arg: when tree doesn't have answer for a particular
    # combitation of attribute-values, we can use 'no' as the default guess 

print(df['predicted'])

print('\n Accuracy is:\n' + str( sum(df['buy_computer']==df['predicted'] ) / (1.0*len(df.index)) ))


df[['buy_computer', 'predicted']]

Key: dict_keys(['age'])
Attribute: age
Instance Attribute: youth TreeKeys : dict_keys(['middle_aged', 'senior', 'youth'])
Key: dict_keys(['student'])
Attribute: student
Instance Attribute: no TreeKeys : dict_keys(['no', 'yes'])
Key: dict_keys(['age'])
Attribute: age
Instance Attribute: youth TreeKeys : dict_keys(['middle_aged', 'senior', 'youth'])
Key: dict_keys(['student'])
Attribute: student
Instance Attribute: no TreeKeys : dict_keys(['no', 'yes'])
Key: dict_keys(['age'])
Attribute: age
Instance Attribute: middle_aged TreeKeys : dict_keys(['middle_aged', 'senior', 'youth'])
Key: dict_keys(['age'])
Attribute: age
Instance Attribute: senior TreeKeys : dict_keys(['middle_aged', 'senior', 'youth'])
Key: dict_keys(['credit_rating'])
Attribute: credit_rating
Instance Attribute: fair TreeKeys : dict_keys(['excellent', 'fair'])
Key: dict_keys(['age'])
Attribute: age
Instance Attribute: senior TreeKeys : dict_keys(['middle_aged', 'senior', 'youth'])
Key: dict_keys(['credit_rating'])
Attribut

Unnamed: 0_level_0,buy_computer,predicted
RID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,no,no
2,no,no
3,yes,yes
4,yes,yes
5,yes,yes
6,no,no
7,yes,yes
8,no,no
9,yes,yes
10,yes,yes


# Classification Accuracy: Training/Testing Set

In [18]:
training_data = df.iloc[1:-4] # all but last four instances
test_data  = df.iloc[-4:] # just the last four
train_tree = id3(training_data, 'buy_computer', attribute_names)

test_data['predicted2'] = test_data.apply(                                # <---- test_data source
                                          classify, 
                                          axis=1, 
                                          args=(train_tree,'Yes') ) # <---- train_data tree


print ('\n\n Accuracy is : ' + str( sum(test_data['buy_computer']==test_data['predicted2'] ) / (1.0*len(test_data.index)) ))

Information Gain Calculation of  age

 Number of Instances of the Current Sub Class is 0.0:

 Number of Instances of the Current Sub Class is 2.0:

 Classes: yes yes
 
 Probabilities of Class yes is 1.0:
 
 Probabilities of Class yes is 1.0:

 Number of Instances of the Current Sub Class is 4.0:

 Classes: no yes
 
 Probabilities of Class no is 0.25:
 
 Probabilities of Class yes is 0.75:

 Number of Instances of the Current Sub Class is 3.0:

 Classes: no yes
 
 Probabilities of Class no is 0.3333333333333333:
 
 Probabilities of Class yes is 0.6666666666666666:

 Number of Instances of the Current Sub Class is 9.0:

 Classes: no yes
 
 Probabilities of Class no is 0.3333333333333333:
 
 Probabilities of Class yes is 0.6666666666666666:
Information Gain Calculation of  income

 Number of Instances of the Current Sub Class is 0.0:

 Number of Instances of the Current Sub Class is 2.0:

 Classes: no yes
 
 Probabilities of Class no is 0.5:
 
 Probabilities of Class yes is 0.5:

 Number 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# End！