# **Decision Tree**

In [52]:
import numpy as np
import pandas as pd

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Attribute Selection Measures**
- Entropy
- Information Gain

## **Entropy**
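- A measure of the randomness (impurity) of a set of labels: $H = -\sum_i p_i \log_2 p_i$, where $p_i$ is the fraction of samples with the $i$-th distinct value. It is 0 when all labels are identical and largest when every value is equally frequent.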

In [6]:
def entropy(x):
  """Return the Shannon entropy, in bits, of the values in ``x``.

  Args:
    x: Any array-like of labels (list, tuple, numpy array, pandas Series).
      Multi-dimensional input is treated as one flat collection of values.

  Returns:
    float: ``-sum(p_i * log2(p_i))`` over the distinct values of ``x``,
    where ``p_i`` is the relative frequency of the i-th value. Empty input
    gives ``0.0``.
  """
  # np.asarray lets plain Python sequences through; only the counts matter.
  _, counts = np.unique(np.asarray(x), return_counts=True)
  total = counts.sum()
  if total == 0:
    return 0.0

  # np.unique only reports values that actually occur, so every probability
  # is strictly positive and log2 never sees a zero.
  probs = counts / total

  # "+ 0.0" normalises the -0.0 produced for a single-valued input to 0.0.
  return float(-np.sum(probs * np.log2(probs))) + 0.0

In [7]:
# Small label arrays for sanity-checking entropy():
# X holds two values with four occurrences each; y holds three values with counts 2, 3 and 3.
X = np.array([1,0,0,1,0,1,1,0])
y = np.array([1,2,1,0,2,1,0,2])

In [8]:
entropy(X)

1.0

In [9]:
entropy(y)

1.561278124459133

## **Data Splitting according to Feature**

In [13]:
# Load the golf dataset from the Google Drive mounted at /content/drive.
# NOTE(review): this absolute path only exists in this Colab setup — adjust it when running elsewhere.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/golf.csv')

In [21]:
df

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


In [36]:
def dataSplit(X, feature):
  """Partition the rows of ``X`` by the distinct values of one column.

  Args:
    X: pandas DataFrame to split.
    feature: Name of the column whose values define the partitions.

  Returns:
    dict: One entry per distinct value of ``X[feature]``, ordered as
    ``value_counts()`` orders them (most frequent first). Each entry is a
    dict with:
      'data' - DataFrame of the matching rows, keeping the original index,
               row order and column dtypes;
      'len'  - number of matching rows.
    Rows whose feature value is missing (NaN) are not placed in any
    partition, because ``value_counts()`` ignores them.
  """
  column = X[feature]
  DATA = {}

  # One boolean-mask selection per value instead of concatenating row by row:
  # linear instead of quadratic, and it never concatenates with an empty frame.
  for value, count in column.value_counts().items():
    DATA[value] = {
        'data' : X.loc[column == value],
        'len' : int(count)
    }

  return DATA

In [37]:
dataSplit(df,'Outlook')

{'sunny': {'data':    Outlook Temperature Humidity  Windy Play
  0    sunny         hot     high  False   no
  1    sunny         hot     high   True   no
  7    sunny        mild     high  False   no
  8    sunny        cool   normal  False  yes
  10   sunny        mild   normal   True  yes,
  'len': 5},
 'rainy': {'data':    Outlook Temperature Humidity  Windy Play
  3    rainy        mild     high  False  yes
  4    rainy        cool   normal  False  yes
  5    rainy        cool   normal   True   no
  9    rainy        mild   normal  False  yes
  13   rainy        mild     high   True   no,
  'len': 5},
 'overcast': {'data':      Outlook Temperature Humidity  Windy Play
  2   overcast         hot     high  False  yes
  6   overcast        cool   normal   True  yes
  11  overcast        mild     high   True  yes
  12  overcast         hot   normal  False  yes,
  'len': 4}}

## **Information Gain**

In [44]:
def infoGain(X, feature, target='Play'):
  """Return the information gain obtained by splitting ``X`` on ``feature``.

  The gain is the entropy of the target column before the split minus the
  size-weighted average entropy of the target column inside each partition
  produced by ``dataSplit``.

  Args:
    X: pandas DataFrame containing both ``feature`` and ``target`` columns.
    feature: Name of the column to split on.
    target: Name of the label column. Defaults to ``'Play'``, the label
      column of the golf dataset used in this notebook.

  Returns:
    The information gain in bits.
  """
  n = X.shape[0]
  DATA = dataSplit(X, feature)
  entropy_child = 0.0

  # Weight each partition's entropy by the fraction of rows it holds.
  for part in DATA.values():
    entropy_child += (part['len'] / n) * entropy(part['data'][target])

  return entropy(X[target]) - entropy_child

In [48]:
entropy(df['Play'])

0.9402859586706311

In [45]:
infoGain(df, 'Outlook')

0.24674981977443933

In [49]:
infoGain(df, 'Temperature')

0.02922256565895487

In [50]:
infoGain(df, 'Humidity')

0.15183550136234159

In [51]:
infoGain(df, 'Windy')

0.04812703040826949

## **The Decision Tree**


In [57]:
class DecisionTree:
  """A node of a multi-way decision tree for categorical features.

  Each node stores the feature it splits on, the majority label of the
  training rows that reached it, and one child node per value of the split
  feature. The label column is ``'Play'``.
  """

  # Feature columns used when train() is not given an explicit list.
  DEFAULT_FEATURES = ['Outlook', 'Temperature', 'Humidity', 'Windy']

  def __init__(self, depth=0, max_depth=5):
    """Create an untrained node.

    Args:
      depth: Depth of this node in the tree (the root is 0).
      max_depth: Nodes at this depth or deeper are never split.
    """
    # creating a node
    self.children = {}          # feature value -> child DecisionTree
    self.fkey = None            # name of the feature this node splits on
    self.max_depth = max_depth
    self.depth = depth
    self.target = None          # majority label at this node

  def train(self, X, features=None):
    """Fit this node, and recursively its children, on ``X``.

    Args:
      X: pandas DataFrame holding the feature columns and a ``'Play'``
        label column.
      features: Optional iterable of feature column names to consider.
        Defaults to ``DEFAULT_FEATURES``.

    Returns:
      None. The node's ``fkey``, ``target`` and ``children`` are updated
      in place.
    """
    features = list(self.DEFAULT_FEATURES if features is None else features)

    # Start from a clean slate so training the same node twice does not
    # keep children from the previous fit.
    self.children = {}

    # finding the best feature
    info_gains = [infoGain(X, f) for f in features]
    self.fkey = features[np.argmax(info_gains)]

    # splitting the data
    DATA = dataSplit(X, self.fkey)

    # Giving a target label to the node (most frequent label)
    self.target = X['Play'].value_counts().idxmax()

    ### STOPPING CONDITIONS
    # Every row already carries the same label: nothing left to learn.
    if X['Play'].nunique() <= 1:
      return

    # The best feature does not separate the rows into at least two groups.
    non_empty = [key for key in DATA if DATA[key]['len'] > 0]
    if len(non_empty) < 2:
      return

    # Early stop if max_depth reached
    if self.depth >= self.max_depth:
      return

    print('\t'*self.depth + 'Making tree with - ',self.fkey)
    # Recursion - train child nodes. Empty partitions are skipped (they have
    # no majority label), and the depth limit is handed down so that it
    # applies to the whole tree rather than only to the root.
    for key in non_empty:
      child = DecisionTree(depth=self.depth+1, max_depth=self.max_depth)
      child.train(DATA[key]['data'], features)
      self.children[key] = child

    return

  def predict(self, test):
    """Predict the label for a single sample.

    Args:
      test: A one-row DataFrame (if more rows are given, the first is
        used), or a Series / dict mapping feature name to value.

    Returns:
      The majority label of the deepest node reached. If the sample has a
      feature value that was not seen at some node during training, that
      node's majority label is returned.
    """
    if not self.children:
      return self.target

    value = test[self.fkey]
    # A DataFrame yields a Series here; take its first element by position
    # so the result does not depend on the frame's index labels.
    if hasattr(value, 'iloc'):
      value = value.iloc[0]

    child = self.children.get(value)
    if child is None:
      return self.target
    return child.predict(test)

## **Decision Tree Model**

In [58]:
model = DecisionTree()

In [59]:
model.train(df)

Making tree with -  Outlook
	Making tree with -  Humidity
	Making tree with -  Windy


In [60]:
model

<__main__.DecisionTree at 0x7afe598b1b10>

In [61]:
model.target

'yes'

In [62]:
model.fkey

'Outlook'

In [63]:
model.children

{'sunny': <__main__.DecisionTree at 0x7afe5ae46b90>,
 'rainy': <__main__.DecisionTree at 0x7afe598b1330>,
 'overcast': <__main__.DecisionTree at 0x7afe598b3160>}

In [64]:
model.children['sunny']

<__main__.DecisionTree at 0x7afe5ae46b90>

In [65]:
model.children['sunny'].fkey

'Humidity'

In [66]:
model.children['sunny'].children

{'high': <__main__.DecisionTree at 0x7afe598b38b0>,
 'normal': <__main__.DecisionTree at 0x7afe598b2830>}

In [67]:
model.children['sunny'].children['high'].children

{}

In [68]:
model.children['overcast'].target

'yes'

In [69]:
model.children['overcast'].children

{}

In [71]:
# One-row test sample whose columns are every column of df except the last one.
Xt = pd.DataFrame([['sunny', 'hot', 'normal', False]], columns=list(df.columns.values[:-1]))

In [72]:
Xt

Unnamed: 0,Outlook,Temperature,Humidity,Windy
0,sunny,hot,normal,False


In [73]:
model.predict(Xt)

'yes'