## Contents 

1. Import "Hitters" Data ISLR
2. Modified Regression Tree Regressor
3. Random Forest Regressor

# <font color='blue'>1. Import "hitters" Data ISLR</font>

In [1]:
import random
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


%cd /content/drive/My\ Drive/colab_notebooks/machine_learning/data/
df = pd.read_csv("Hitters.csv")

/content/drive/My Drive/colab_notebooks/machine_learning/data


In [3]:
df.columns

Index(['Unnamed: 0', 'AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years',
       'CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'League',
       'Division', 'PutOuts', 'Assists', 'Errors', 'Salary', 'NewLeague'],
      dtype='object')

In [4]:
df = df[['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years',
       'CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks'
       , 'PutOuts', 'Assists', 'Errors', 'Salary']]
df = df.dropna()
df.columns

Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat',
       'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'PutOuts', 'Assists',
       'Errors', 'Salary'],
      dtype='object')

In [5]:
#df['Salary']= np.log(df['Salary'])
df =  (df - np.mean(df,axis=0))/np.std(df,axis=0)
sub_df = df.sample(frac=1)
train_df = sub_df.iloc[0:210,:]
test_df = sub_df.iloc[210:,:]
# Array implentation
train_arr = np.array(train_df)
col = train_df.columns

### <font color='blue'>2. Modified Regression Tree</font>

Our earlier implementation of regression tree,was a single tree implementation without the constraint of MaxDepth. 

But when we are using our regression tree for building a random forest, we need to update our tree in two aspects. 
1. To add a constraint of max depth- This is not a part of random forest algorithm, but since we are bagging(aggregating) the prediction from multiple trees, we can save on some execution time and also not overfitting the data when it comes to execution of individual instances of trees.
2. As per the requirement of the random forest algorithm, rather than having all the features available at very node, we only have a random number of features $m$ out of the total features $p$, such that $m\leq p$. 
As per ESL total $m$ for regression tree should ideally be equal to $m=\frac{p}{3}$.\
  In our example, we have total 16 features, so $\frac{16}{3} ≈ 5 $ random features per node.

In [8]:
##########################################################
#Node class 

# Abstract Datatype created, to hold value, and 
# pointer to subsequent recursive Nodes created
##########################################################
class Node:

  def __init__(self):
    self.data = None
    self.feature_index = None
    self.threshold = None
    self.leaf_val = None
    self.left = None
    self.right = None


##########################################################
#least Square Error Function to select the best threshold value
# per feature.
#
#Input: Array of single feature & response 
#Outout: minimum error calculated &threshold value
##########################################################

def leastsquares_error(arr):

  n = len(arr)
  #print("arr",len(arr),arr.shape)
  #Begin splitting logic
  i = 0
  min_err = 99999999999
  while i < n-1:

    split = arr[i,0]
    left_split = arr[arr[:,0] <= split]
    #print("left_split",left_split)

    right_split = arr[arr[:,0] > split]
    #print("right_split",right_split)
    #print("right split",right_split.shape)

    # length of the splits 
    nl = len(left_split)
    nr = len(right_split)
    #print("nl,nr",nl,nr)
    #print("here here- 2")
    kl = np.mean(left_split,axis=0)[1]
    kr = np.mean(right_split,axis=0)[1]
    
    delta_err = np.sum(((left_split[:,-1] - kl)**2)) + np.sum(((right_split[:,-1] - kr)**2))
    #print("here here- 4")
    if delta_err < min_err:
      min_err = delta_err
      threshold = split
      
    i +=1

  return min_err, threshold


##########################################################
#Best Split Function 
#
#Input : dataframe/array consisting of selected features
#         with response variable.
#Output : Best feature inder, along with the corresponding 
#         threshold
##########################################################

def best_split(arr,col):

  #print("arr",arr.shape)
  #print("col",col)
  m = arr.shape[1]
  split_err = []
  split_threshold = []

  #for i in range(0,m-1):
  for i in range(0,2):

    #print("here -1 ")
    min_err,threshold = leastsquares_error(arr[:,[i,-1]])
    #print("here -2")
    split_err.append(min_err)
 
    split_threshold.append(threshold)
 

  best_split_index = np.argmin(split_err)
  colname = col[best_split_index]   # adding the colname
  best_threshold = split_threshold[best_split_index]

  #print("best_threshold",best_threshold)
  #print("colname",colname)
  return best_threshold,colname

##########################################################
#Build Tree Function 
#
#Input : The training dataset, abstract node class, list of columns
#Output : Return the head node or the root of the tree.
##########################################################

def build_tree(arr,col,n_node,depth):

  # base case
  n = len(arr)
  n_node.data = arr

  if n <= 10 :
    mean = np.mean(arr[:,-1],axis=0)
    n_node.mean_value = mean
    return n_node
  elif depth == 0:
    mean = np.mean(arr[:,-1],axis=0)
    n_node.mean_value = mean
    return n_node

  
  ##########################################################
  # recurring tree build 
  # Change : Select a random list of features
  # Input to the best_split function will be a sub-dataframe  consisting only the 
  # columns of the respective feature indexes chosen.
  ##########################################################
  # Place holder to randomly sample m features out of total p features.
  # Added the requirement of having a randomized set of features for
  # each node selection

  lst  = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
  rand_col = random.sample(lst,k = 5) #  n = 16/3 = 5
  rand_col.append(-1) # adding the response column at the end
  rand_arr = arr[:,rand_col] # selecting the data column 
  rand_col_names = col[rand_col]  
  
  
  # Continue to the rest of the code 

  best_threshold,colname  = best_split(rand_arr,rand_col_names) #change 
  best_split_index = np.where(col == colname) [0]
  #test
 
  n_node.feature_index = col[best_split_index]
  n_node.threshold = best_threshold

  # Split into two brances
  
  #print("best_threshold",best_threshold)
  left_arr  = arr[arr[:,best_split_index[0]] <= best_threshold]
  right_arr = arr[arr[:,best_split_index[0]] > best_threshold]

  # grow tree
  left_node = Node()
  right_node = Node()
  depth = depth - 1  # Updating the depth Hyperparameter
  n_node.left = build_tree(left_arr,col,left_node,depth)
  n_node.right = build_tree(right_arr,col,right_node,depth)

  return n_node

##########################################################
#Predict 
#Input : The test dataset, root of the tree, list of columns
#Output : Returns the predicted value.
##########################################################

def predict_tree_val(arr,cols,noden):

  head = noden
  index = head.feature_index[0]
  threshold = head.threshold

  value = arr[index]

  if value <= threshold:
  
    if head.left.feature_index is not None:
      pred = predict_tree_val(arr,cols,head.left)
    else:
      return np.mean(head.data[:,-1])
  
  elif value > threshold:

  
    if head.right.feature_index is not None:
      pred = predict_tree_val(arr,cols,head.right)
    else:
      return np.mean(head.data[:-1])

  return pred


In [9]:
import warnings
warnings.filterwarnings("ignore")

n_node = Node()
maxdepth = 6


#leastsquares_error(train_arr[:,[0,-1]],col[[0,-1]],eps)
#best_split_index,best_threshold = best_split(rand_arr,rand_col_names)
noden = build_tree(train_arr,col,n_node,maxdepth)

In [10]:
################################################################
#Prediction 
################################################################

test_arr = test_df
pred_arr= []
for i in range(0,len(test_arr)):
  entry = test_arr.iloc[i,:]
  pred = predict_tree_val(entry,col,noden)
  pred_arr.append(pred)


y = test_df.iloc[:,-1]

# Prediction Accuracy 
mean_squared_error = 1/len(test_arr)* np.sum((y - pred_arr)**2)

print("mean_squared_error",mean_squared_error)
print("root mean squared error",np.sqrt(mean_squared_error))

mean_squared_error 0.4500362057868485
root mean squared error 0.6708473789073403


In [11]:
# import the regressor
""" Testing the same dataset, with Sklearn library"""
from sklearn.tree import DecisionTreeRegressor 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
  
clf = DecisionTreeRegressor()
clf.fit(train_df.iloc[:,0:-2], train_df.iloc[:,-1])

predictions = clf.predict(test_df.iloc[:,0:-2])
# Prediction Accuracy 
mean_squared_error = 1/len(test_df)* np.sum((test_df.iloc[:,-1] - predictions)**2)

print("mean_squared_error",mean_squared_error)
print("root mean squared error",np.sqrt(mean_squared_error))

mean_squared_error 0.5232047107003388
root mean squared error 0.7233289090727252


# <font color='blue'>3. Random Forest Regressor</font>

1. Bootstrapped samples
2. Build tree
3. Repeat 1 & 2 for no of trees to be created. 
4. Predict 

In [12]:
class randomforest:

  def __init__(self,n_trees=50):
    self.n_trees = n_trees
    self.arr = None
    self.col = None
    self.max_depth = None
    self.trees = None

  # Internal method to the class, called from functions within the 
  # random forest class
  def _bootstrap_samples(self,df):

    n = len(df)
    self.arr = np.array(df.sample(n,replace=True))
    return self.arr

  def build_forest(self,df,col,max_depth):

    i = 0
    self.max_depth = max_depth
    self.col = col
    self.trees = []

    while i < self.n_trees :
      
      nnode = Node()
      self.arr = self._bootstrap_samples(df)
      node = build_tree(self.arr,self.col,nnode,self.max_depth)
      self.trees.append(node)

      i+=1
    
    return self.trees

  # To-Do
  #def out_of_bag-accuracy

  def predict(self,test_df):

    m = len(test_df)
    pred_array = np.zeros([test_df.shape[0],len(self.trees)])

    for i in range(0,m):
      sel_row = test_arr.iloc[i,:]

      for j in range(0,len(self.trees)):
        pred_array[i,j] = predict_tree_val(sel_row,self.col,self.trees[j])
        
    predictions = np.mean(pred_array,axis = 1)

    #Mean Square Error 
    mean_squared_error = (1/m) * np.sum((test_df.iloc[:,-1] - predictions)**2)
    root_mean_squared_error = np.sqrt(mean_squared_error)

    return mean_squared_error, root_mean_squared_error


In [14]:
# Random Forest Execution
rm = randomforest()
max_depth = 6
tree = rm.build_forest(train_df,col,max_depth)
mean_squared_error,root_mean_squared_error = rm.predict(test_df)

print("Mean Squared Error of the Random Forest Regressor:",mean_squared_error)
print("Root Mean Squared Error of the Random Forest Regressor:",root_mean_squared_error)

Mean Squared Error of the Random Forest Regressor: 0.46859222603494705
Root Mean Squared Error of the Random Forest Regressor: 0.6845379653714957


In [15]:
# library 
# Fitting Random Forest Regression to the dataset
# import the regressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
 
 # create regressor object
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
 
# fit the regressor with x and y data
regressor.fit(train_df.iloc[:,0:-2], train_df.iloc[:,-1]) 

Y_pred = regressor.predict(test_df.iloc[:,0:-2])

mean_squared_error = mean_squared_error(test_df.iloc[:,-1], Y_pred)
print("Mean Squared Error of the Random Forest Regressor:",mean_squared_error)
print("Root Mean Squared Error of the Random Forest Regressor:",np.sqrt(mean_squared_error))

Mean Squared Error of the Random Forest Regressor: 0.34570706027036124
Root Mean Squared Error of the Random Forest Regressor: 0.5879685878262216
