# Decision Trees - Notebook

In [1]:
import math

Simply read the dataset into a list of list:

In [2]:
with open('playgolf.csv') as f:
    data = f.readlines()
data = [x.strip() for x in data]
data2 = []
for line in data:
    line = [word.strip() for word in line.split(',')]
    data2.append(line)
data = data2
header = ['Outlook', 'Temp', 'Humidity', 'Windy', 'Play Golf']
data

[['Rainy', 'Hot', 'High', 'False', 'No'],
 ['Rainy', 'Hot', 'High', 'True', 'No'],
 ['Overcast', 'Hot', 'High', 'False', 'Yes'],
 ['Sunny', 'Mild', 'High', 'False', 'Yes'],
 ['Sunny', 'Cool', 'Normal', 'False', 'Yes'],
 ['Sunny', 'Cool', 'High', 'True', 'No'],
 ['Overcast', 'Cool', 'Normal', 'True', 'Yes'],
 ['Rainy', 'Mild', 'High', 'False', 'No'],
 ['Rainy', 'Cool', 'Normal', 'False', 'Yes'],
 ['Sunny', 'Mild', 'Normal', 'False', 'Yes'],
 ['Rainy', 'Mild', 'Normal', 'True', 'Yes'],
 ['Overcast', 'Mild', 'High', 'True', 'Yes'],
 ['Overcast', 'Hot', 'Normal', 'False', 'Yes'],
 ['Sunny', 'Mild', 'High', 'True', 'No']]

### Entropy 

The below function uses the entropy formual too calculate the the entropy of given attribute

In [3]:
def entropy(attributeIndex, classifierIndex, inputData, targetAttr):
    
    #Dictionary that geathers all the different answers for the classifier
    answers = {}
    totalAnswerCount = 0
    
    #Go thorugh the data add new answers and how many times they appear to the dictionary
    for row in inputData:
        if row[attributeIndex] == targetAttr:
            totalAnswerCount += 1
            if row[classifierIndex] in answers:
                answers[row[classifierIndex]] += 1
            else:
                answers[row[classifierIndex]] = 1
    
    result = 0
    
    #Compute the entropy for the answers in the dictionary
    for key, answer in answers.items():
        result += -((answer/totalAnswerCount)*math.log((answer/totalAnswerCount), 2))
    return result
    
attributeIndex = header.index('Outlook')
classifierIndex = header.index('Play Golf')
entropy(attributeIndex, classifierIndex, data, "Sunny")

0.9709505944546686

## Information gain

The below function uses the information gain formula too compute the information gain of a given attribute set, using the entropy

In [4]:
def infoGain(attributeIndex, classifierIndex, inputData):
    
    #Geather the values for each row item and how often it appears
    values = {}
    totalSizeOfSet = 0
    for row in inputData:
        totalSizeOfSet += 1
        if row[attributeIndex] in values:
            values[row[attributeIndex]] += 1
        else:
            values[row[attributeIndex]] = 1
            
    #Compute the actual information gain here
    result = 0
    for key, value in values.items():
        result += (value/totalSizeOfSet)* entropy(attributeIndex, classifierIndex, inputData, key)
    return result

attributeIndex = header.index('Outlook')
classifierIndex = header.index('Play Golf')
infoGain(attributeIndex, classifierIndex, data)


0.6935361388961918

### Splitter

In the below function we loop through all the attributes and use the information gain to choose what attribute to split on. Then we return the different items of the chosen attribute and the index of it.

In [6]:
def splitter(data, head, classifier):
    
    # The attribute is assigned in this varible
    lowestAttribute = None
    # The attribute information gain value is assigned to this varible
    lowestAttributeVal = 1
    # The index of the lowest attribute
    attributeIndex = 0
    
    for currentAttributeIndex, attribute in enumerate(head):
        #We don't split on the classifier
        if attribute == classifier:
            continue
        #Compute the information gain of curent attribute
        attributeInfoGain = infoGain(currentAttributeIndex, classifierIndex, data)
        # If the information of current attribute is the lowest, we assign relevant values
        if attributeInfoGain <lowestAttributeVal:
            attributeIndex = currentAttributeIndex
            lowestAttributeVal = attributeInfoGain
            lowestAttribute = attribute
            
    # The items in the attribute, we return a set of them (one of each)
    splitReturns = [line[attributeIndex] for line in data]
    return (attributeIndex,set(splitReturns))

### Split

The below split function ueses the previous functions and recursively splits until the dataset is pure. 

The other two functions are small helper functions used in split.

In [7]:
#Determines if a subset is pure subset
def isPure(subset, classifierIndex):
    currentValue = None
    for line in subset:
        if currentValue is None:
            currentValue = line[classifierIndex]
        else:
            if line[classifierIndex] != currentValue:
                return False
    return True

# Just prints the subset in a better way
def printData(head, subset):
    print("---------------------------- Pure Subset ---------------------------------" )
    print()
    for attribute in head:
        print(" ----- ", end='')
        print(attribute, end='')
    print(" ----- ")
    for line in subset:
        print("       ", end='')
        for element in line:
            print(element, end='')
            print(((14-len(element))*" "), end='')
        print()
    print()
    print()
    print()
        
        

def split(data, head, classifier):
    
    #Call the splitter function from above
    splitterOut = splitter(data, head, classifier)
    classifierIndex = head.index(classifier)
    for value in splitterOut[1]:
        # Make a subset with relevant values
        subset = [line for line in data if line[splitterOut[0]] == value]
        #Print it if it is pure if not call split again with the subset as datab
        if  isPure(subset, classifierIndex):
            printData(head, subset)
            
        else: 
            split(subset, head, classifier)
            
            
split(list(data), list(header), 'Play Golf')


---------------------------- Pure Subset ---------------------------------

 ----- Outlook ----- Temp ----- Humidity ----- Windy ----- Play Golf ----- 
       Sunny         Cool          Normal        False         Yes           
       Overcast      Cool          Normal        True          Yes           
       Rainy         Cool          Normal        False         Yes           
       Sunny         Mild          Normal        False         Yes           
       Rainy         Mild          Normal        True          Yes           
       Overcast      Hot           Normal        False         Yes           



---------------------------- Pure Subset ---------------------------------

 ----- Outlook ----- Temp ----- Humidity ----- Windy ----- Play Golf ----- 
       Overcast      Hot           High          False         Yes           
       Overcast      Mild          High          True          Yes           



---------------------------- Pure Subset -------------------------

### Will John play golf?

In the function below we can send in the paramters for the relevant attributes and determine if John will play golf or not

In [8]:
result = None
def splitAndDefine(attributes, data, head, classifier):
    global result
    #Call the splitter function from above
    splitterOut = splitter(data, head, classifier)
    classifierIndex = head.index(classifier)
    for value in splitterOut[1]:
        # Make a subset with relevant values
        subset = [line for line in data if line[splitterOut[0]] == value]
        #Print it if it is pure if not call split again with the subset as datab
        if  isPure(subset, classifierIndex):
            for row in subset:
                if set(attributes).issubset(set(row)):
                    result = row[classifierIndex]
            
        else: 
            splitAndDefine(attributes, subset, head, classifier)


def willPlayGolf(Outlook, Temp, Humidity, Windy, data, head, classifier):
    splitAndDefine((Outlook, Temp, Humidity, Windy), data, head, classifier)


willPlayGolf('Sunny', 'Cool', 'Normal', 'False', data, header, 'Play Golf')
print(result)

Yes


### Scikit-learn and random forest classifier

In this example we will use dataset and the random forest classifer from Scikit-learn.
Example from: Chris Albon: https://github.com/chrisalbon/notes

In [9]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

#Loading data for the training dataset
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [10]:
#Making a new column with the species names, which is what we are going to try to predict
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
df.head()

# Adding another column to randomly assingn some columns to be a part of the training data and others not
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,False
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,False
4,5.0,3.6,1.4,0.2,setosa,True


In [11]:
#We crate two data frames, one is for the training data and the other one for testing data
train, test = df[df['is_train']==True], df[df['is_train']==False]

print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 106
Number of observations in the test data: 44


In [12]:
# We convert each species name into a digit so we can use it in the classifier
y = pd.factorize(train['species'])[0]

y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [13]:
#Here we make the classifer and train it with the training data
clf = RandomForestClassifier(n_jobs=2, random_state=0)
features = df.columns[:4]
clf.fit(train[features], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [14]:
# Here we use the trained classifer to classify the test data which the classifer has never seen before
clf.predict(test[features])

preds = iris.target_names[clf.predict(test[features])]
print(preds[0:5])
test['species'].head()

['setosa' 'setosa' 'setosa' 'setosa' 'setosa']


1    setosa
3    setosa
5    setosa
6    setosa
7    setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]