## Regression Tree

using standard deviation , we are going to make decision

In [231]:
import pandas as pd
data = [['Sunny','Hot','High','Weak',25],
        ['Sunny','Hot','High','Strong',30],
        ['Overcast','Hot','High','Weak',46],
        ['Rain','Mild','High','Weak',45],
        ['Rain','Cool','Normal','Weak',52],
        ['Rain','Cool','Normal','Strong',23],
        ['Overcast','Cool','Normal','Strong',43],
        ['Sunny','Mild','High','Weak',35],
        ['Sunny','Cool','Normal','Weak',38],
        ['Rain','Mild','Normal','Weak',46],
        ['Sunny','Mild','Normal','Strong',48],
        ['Overcast','Mild','High','Strong',52],
        ['Overcast','Hot','Normal','Weak',44],
        ['Rain','Mild','High','Strong',30]]

In [1]:
title = ['Outlook','Temp.','Humidity','Wind','Decision']

In [233]:
Data = pd.DataFrame(data,columns = title)
Data

Unnamed: 0,Outlook,Temp.,Humidity,Wind,Decision
0,Sunny,Hot,High,Weak,25
1,Sunny,Hot,High,Strong,30
2,Overcast,Hot,High,Weak,46
3,Rain,Mild,High,Weak,45
4,Rain,Cool,Normal,Weak,52
5,Rain,Cool,Normal,Strong,23
6,Overcast,Cool,Normal,Strong,43
7,Sunny,Mild,High,Weak,35
8,Sunny,Cool,Normal,Weak,38
9,Rain,Mild,Normal,Weak,46


In [234]:
#creating uniques values for features
def unique_values(rows,col):
    return set([row[col] for row in rows])

In [235]:
def values_list(rows):
    return list([row[-1] for row in rows])


def avg(rows):
    val = values_list(rows)
    summ = 0
    for i in val:
        summ += i
    mean = summ/len(rows)
    return mean

In [236]:
avg(data)

39.785714285714285

In [237]:
#Standard Deviation (S) is for tree building (branching).
from math import sqrt
def std(rows):
    x_mean = avg(rows)
    val = values_list(rows)
    st_dev = 0
    for x in val:
        st_dev += (x-x_mean)**2
    return sqrt(st_dev)/sqrt(len(rows))

In [238]:
std(data)

9.321086474291743

In [239]:
#Coefficient of Deviation (CV) is used to decide when to stop branching. We can use Count (n) as well.
def coeff_variation(rows):
    mean = avg(rows)
    S= std(rows)
    return S/mean 

In [240]:
coeff_variation(data)

0.23428224531433467

In [242]:
is_numeric(7)

True

In [267]:
#counting the attributes of class label
def count_values(rows):
    count = {}
    for row in rows:
        label = row[-1]
        if label not in count:
            count[label] = 1
        else:
            count[label] += 1
    return count


In [268]:
count_values(data)

{25: 1, 30: 2, 46: 2, 45: 1, 52: 2, 23: 1, 43: 1, 35: 1, 38: 1, 48: 1, 44: 1}

In [269]:
class Question(object):
    #taking columns and value as an attributes
    def __init__(self , col , value):
        self.col = col
        self.value = value
        
    #Matching the values with the column
    def match(self,data):
        '''returns True if the value in given column is greater than input value
           else, it returns False'''
        val = data[self.col]
        return val == self.value
    
    #string representation
    def __repr__(self):
        condition = '=='
        return ('Is %s %s %s  ?'%(title[self.col] ,condition , self.value))
        
    

In [270]:
q = Question(0,'Sunny')
q

Is Outlook == Sunny  ?

In [271]:
r = Question(1,'Hot')
r

Is Temp. == Hot  ?

In [293]:
def partition(rows , question):
    true_rows,false_rows = [],[]
    for row in rows:
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows,false_rows

In [297]:
t_r ,f_r= partition(data,Question(0,'Sunny'))


In [298]:
std(data)

9.321086474291743

In [299]:
# Standard deviation for two attributes (target and predictor):	
def std_two_attr(rows,true_branch):
    count = count_values(rows)
    std_attr = 0
    for val in count:
        p = count[val]/len(rows)
        std_attr += p*std(true_branch)
    return std_attr
        
        
        

In [300]:
std_two_attr(data,t_r)

7.7820305833374865

In [301]:
def std_dev_reduction(rows,current,left):
    return current - std_two_attr(rows,left)
    

In [302]:
current = std(data)
std_dev_reduction(data,current,t_r)

1.5390558909542564

In [303]:
# splitting the data with respect to standard deviation reduction
#splitting the best question and gainRatio
def best_split(rows):
    best_stdRed = 0
    best_question = None
    current = std(rows)
    features = len(rows[0])-1
    for col in range(features):
        value = unique_values(rows,col)
        for val in value:
            question = Question(col,val)
            
            true_rows,false_rows = partition(rows,question)
            #print(true_rows)
            if len(true_rows) == 0 or len(false_rows)==0 :
                continue
            
            std_reduction = std_dev_reduction(rows,current,true_rows)
            
            if std_reduction > best_stdRed:
                best_stdRed,best_question = std_reduction,question
    return best_stdRed,best_question

In [304]:
best_split(data)

(5.830026463349508, Is Outlook == Overcast  ?)

In [315]:
#class --> decisiontree
class DecisionTree:
    def __init__(self,question,true_branch,false_branch):
        #stores question
        self.question = question
        #stores true_branch
        self.true_branch = true_branch
        #stores false_branch
        self.false_branch = false_branch

In [316]:
#class leaf
class Leaf:
    def __init__(self,rows):
        #stores the leaf value
        self.predictions = count_values(rows)

In [317]:
def build_tree(rows):
    #info_gain and question formed
    std_reduction,question = best_split(rows)
    
    #if gain = 0, then Leaf satisfied 
    if std_reduction == 0:
        return Leaf(rows)
    
    #to find a best value or question to partition on
    true_rows,false_rows = partition(rows,question)
    #recursive function to build the model
    true_branch = build_tree(true_rows)
    false_branch = build_tree(false_rows)
    
    return DecisionTree(question , true_branch , false_branch)


In [318]:
def print_tree(node,indentation=""):
    '''printing function'''
    #base case means we have reached the leaf
    #if the node object is of leaf type
    if isinstance(node,Leaf):
        print(indentation+"PREDICTION",node.predictions)
        return 
    #print the question at node
    print(indentation + str(node.question))
    
    #call the function on true branch 
    print(indentation+ "Return Yes")
    print_tree(node.true_branch,indentation + " ")
    
    #on flase branch
    print(indentation+ "Return No")
    print_tree(node.false_branch,indentation + " ")

In [319]:
tree = build_tree(data)
print_tree(tree)

Is Outlook == Overcast  ?
Return Yes
 Is Temp. == Cool  ?
 Return Yes
  PREDICTION {43: 1}
 Return No
  Is Temp. == Mild  ?
  Return Yes
   PREDICTION {52: 1}
  Return No
   Is Humidity == Normal  ?
   Return Yes
    PREDICTION {44: 1}
   Return No
    PREDICTION {46: 1}
Return No
 Is Temp. == Hot  ?
 Return Yes
  Is Wind == Weak  ?
  Return Yes
   PREDICTION {25: 1}
  Return No
   PREDICTION {30: 1}
 Return No
  Is Outlook == Sunny  ?
  Return Yes
   Is Temp. == Cool  ?
   Return Yes
    PREDICTION {38: 1}
   Return No
    Is Humidity == Normal  ?
    Return Yes
     PREDICTION {48: 1}
    Return No
     PREDICTION {35: 1}
  Return No
   Is Wind == Weak  ?
   Return Yes
    Is Temp. == Cool  ?
    Return Yes
     PREDICTION {52: 1}
    Return No
     Is Humidity == Normal  ?
     Return Yes
      PREDICTION {46: 1}
     Return No
      PREDICTION {45: 1}
   Return No
    Is Temp. == Cool  ?
    Return Yes
     PREDICTION {23: 1}
    Return No
     PREDICTION {30: 1}
