# Random Forest

In [1]:
import numpy as np
import math

In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive so that files
# stored on the drive can be opened with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Preparing Dataset

In [3]:
import numpy as np
import math

In [4]:
# Folder on the mounted Google Drive that contains the dataset files;
# change this to the location on your own drive.
data_path = "/content/drive/MyDrive/Colab Notebooks/PRNN_A1/Prnn_datasets/"
# Read the comma-separated file into a 2-D NumPy array, skipping the first
# line of the file (presumably a header row).
dataset = np.loadtxt(data_path + 'PCA_MNIST.csv', delimiter=',',skiprows=1)

In [5]:
# Inspect the array dimensions as (rows, columns).
dataset.shape

(60000, 11)

In [6]:
# Min-max normalise every column except column 0 (the class label) to [0, 1].
# NOTE(review): the minimum and maximum are taken over the whole dataset, i.e.
# including the rows that are later used as the test split.
for i in range(1,dataset.shape[1]):
  col_min = dataset[:,i].min()
  col_range = dataset[:,i].max() - col_min
  if col_range == 0:
    # A constant column would give 0/0 = NaN for every row; map it to 0 instead.
    dataset[:,i] = 0
  else:
    dataset[:,i] = (dataset[:,i]-col_min)/col_range

In [7]:
# Dataset dimensions: number of class labels and number of feature columns
# (the loaded array has 11 columns, the first of which is the class label).
classes = 10
features = 10

In [8]:
# Split by row position (no shuffling here): the first 40000 rows are used
# for training and all remaining rows for testing.
train_dataset = dataset[0:40000,:]
test_dataset = dataset[40000:,:]

## Creating random forest class

DATA FORMAT - Both the train and test datasets must have the class label in the first column.
Each feature column should be normalised to the range [0, 1].

In [10]:
class node():
  """A single vertex of a decision tree: either an internal split or a leaf."""
  def __init__(self, value,feature,left,right,leaf,depth,cls):
    # Split rule: a row goes to `left` when row[feature] < value, else to `right`.
    self.feature = feature
    self.value = value
    # Child nodes (None when absent).
    self.left = left
    self.right = right
    # leaf is 1 for a leaf and 0 otherwise; cls is the class stored at the
    # node (callers pass -1 for internal nodes).
    self.leaf = leaf
    self.cls = cls
    # Distance from the root (the root is built with depth 0).
    self.depth = depth

In [11]:
def entropy(D):
  """Return the Shannon entropy (natural logarithm) of the class labels in D.

  D is a 2-D array whose column 0 holds the class label. The result is
  -sum(p_c * ln(p_c)) over the classes c present in D, where p_c is the
  fraction of rows with label c. An array with no rows has entropy 0.
  """
  n_rows = D.shape[0]
  if n_rows == 0:
    return 0
  # One pass gives the row count of every label (labels come back sorted),
  # instead of re-filtering D once per class.
  _, counts = np.unique(D[:,0], return_counts=True)
  total = 0
  for count in counts:
    p = int(count) / n_rows
    total += p * math.log(p)
  return -total

In [12]:
def maj(D):
  """Return the most frequent class label found in column 0 of D.

  Labels are visited in the sorted order produced by np.unique and only a
  strictly larger count replaces the current winner, so ties go to the
  smallest label. Returns 0 when D has no rows.
  """
  label_column = D[:,0]
  winner, winner_count = 0, 0
  for label in np.unique(label_column):
    count = D[label_column==label].shape[0]
    if count > winner_count:
      winner, winner_count = label, count
  return winner

In [13]:
def perc_maj_cls(D, threshold=0.95):
  """Return the majority class of D if it dominates, otherwise -1.

  D         : 2-D array whose column 0 holds the class label.
  threshold : the majority class is returned only when its share of the rows
              is strictly greater than this fraction (default 0.95).

  Returns the majority label (ties go to the smallest label), or -1 when no
  class exceeds the threshold or when D has no rows. Because -1 is the
  "not pure enough" sentinel, labels are assumed not to be -1.
  """
  n_rows = D.shape[0]
  if n_rows == 0:
    # Nothing to vote on; also avoids dividing by zero below.
    return -1
  labels, counts = np.unique(D[:,0], return_counts=True)
  top = np.argmax(counts)  # first maximum, i.e. the smallest label on ties
  if int(counts[top]) / n_rows > threshold:
    return labels[top]
  return -1

In [14]:
def grow_trees_with_slct_ftr(D,depth,k,indices):
  """Recursively grow a decision tree on D using only the columns in indices.

  D       : 2-D array; column 0 is the class label, the other columns are
            features. Candidate thresholds are 0.01, 0.02, ..., 0.99, so the
            features are expected to be normalised to [0, 1].
  depth   : depth of the node being built (the top-level call passes 0).
  k       : maximum depth; no node is created at depth >= k.
  indices : iterable of column indices that may be used for splitting.

  Returns the node for this subtree, or None when depth >= k or D has no
  rows. Every node returned with leaf == 0 has both children set.
  """
  if depth >= k or D.shape[0] == 0:          # user defined depth
    return None
  cls = perc_maj_cls(D)
  if cls != -1:
    # One class already dominates this subset: stop splitting.
    return node(0, 0, None, None, 1, depth, cls)

  n_rows = D.shape[0]
  best_entropy = None
  best_feature = 0
  best_value = 0
  best_left = None
  best_right = None
  for feature in indices:
    column = D[:, feature]
    for step in range(1, 100):
      threshold = step / 100
      d1 = D[column < threshold]
      d2 = D[column >= threshold]
      p1 = d1.shape[0] / n_rows
      p2 = d2.shape[0] / n_rows
      # Size-weighted entropy of the two sides; lower is better.
      entropy_split = p1 * entropy(d1) + p2 * entropy(d2)
      if best_entropy is None or entropy_split < best_entropy:
        best_entropy = entropy_split
        best_feature = feature
        best_value = threshold
        best_left = d1
        best_right = d2

  # No usable split: either there were no candidate features, or the best
  # split leaves one side empty and therefore separates nothing. Growing
  # further would create a node with a single child, and prediction would
  # fail for any row routed to the missing side, so make this a leaf.
  if best_left is None or best_left.shape[0] == 0 or best_right.shape[0] == 0:
    return node(best_value, best_feature, None, None, 1, depth, maj(D))

  left = grow_trees_with_slct_ftr(best_left, depth + 1, k, indices)
  right = grow_trees_with_slct_ftr(best_right, depth + 1, k, indices)
  if left is None or right is None:
    # The children would exceed the depth limit: predict the majority class here.
    return node(best_value, best_feature, None, None, 1, depth, maj(D))
  return node(best_value, best_feature, left, right, 0, depth, -1)

In [20]:
def run_decision_tree(D,root):
  """Predict a class for every row of D by walking one decision tree.

  D    : 2-D array laid out like the training data (column 0 is the class
         label; it is not read here unless a node splits on column 0).
  root : root node of the tree.

  Returns a 1-D float array with one predicted class per row.
  Raises ValueError if the walk reaches a missing node (root is None, or a
  non-leaf node has no children).
  """
  y = np.zeros(D.shape[0])
  for i, row in enumerate(D):
    nd = root
    while True:
      if nd is None:
        raise ValueError("run_decision_tree: reached a missing node before any leaf")
      if nd.leaf == 1:
        y[i] = nd.cls
        break
      if row[nd.feature] < nd.value:
        preferred, other = nd.left, nd.right
      else:
        preferred, other = nd.right, nd.left
      # A non-leaf node may have only one child; when the side this row
      # belongs to is missing, continue down the existing child instead of
      # failing on None.
      nd = preferred if preferred is not None else other
  return y

In [15]:
def random_forest(D,max_depth,d,t):
  """Grow t decision trees and return their roots as a list.

  D         : 2-D training array; column 0 is the class label.
  max_depth : depth limit handed to grow_trees_with_slct_ftr.
  d         : number of feature columns each tree may split on.
  t         : number of trees to grow.

  For every tree, rows are drawn with replacement (as many as D has) and d
  distinct feature columns are drawn from columns 1..last; the tree is then
  grown on that sample using only those columns.
  """
  forest = []
  for _ in range(t):
    # Bootstrap sample: same number of rows as D, drawn with replacement.
    row_ids = np.random.choice(len(D), size=D.shape[0], replace=True)
    sample = D[row_ids,:]
    # Candidate feature columns are 1 .. n_columns-1 (column 0 is the label).
    candidate_columns = np.arange(1, sample.shape[1]).astype(int)
    # Feature subset for this tree, drawn without replacement.
    chosen_columns = np.random.choice(candidate_columns, size=d, replace=False)
    forest.append(grow_trees_with_slct_ftr(sample,0,max_depth,chosen_columns))
  return forest

In [16]:
def run_random_forest(D,root,n_classes=10):
  """Predict a class for every row of D by majority vote over all trees.

  D         : 2-D array laid out like the training data (column 0 = label).
  root      : list of tree roots, as returned by random_forest.
  n_classes : minimum length of the vote tally (default 10). The tally grows
              automatically if a tree predicts a larger label.

  Returns a 1-D float array of predicted labels; ties go to the smallest
  label. Tree predictions are truncated to non-negative integers for voting.
  Raises ValueError if the forest has no trees (a negative predicted label
  also raises ValueError, from np.bincount).
  """
  if len(root) == 0:
    raise ValueError("run_random_forest: the forest contains no trees")
  # Row k holds the predictions of tree k for every datapoint.
  tree_votes = np.array([run_decision_tree(D, tree) for tree in root]).astype(int)
  ans = np.zeros(D.shape[0])
  for i in range(D.shape[0]):
    tally = np.bincount(tree_votes[:, i], minlength=n_classes)
    ans[i] = np.argmax(tally)  # first maximum wins
  return ans

In [17]:
class Random_forest:
  """Random forest classifier built on the module-level tree functions.

  Datasets passed to train/test/predict are 2-D arrays whose column 0 is the
  class label.
  """
  def __init__(self,max_depth=5,no_trees = 8,no_features = 5):
    # no_features: how many feature columns each tree may use; they are
    # selected without replacement from all feature columns.
    self.no_features = no_features
    self.no_trees = no_trees
    self.max_depth = max_depth

  def train(self,D):
    """Grow the forest on D and keep the list of tree roots in self.root."""
    forest = random_forest(D,self.max_depth,self.no_features,self.no_trees)
    self.root = forest

  def test(self,D):
    """Return the fraction of rows of D whose predicted class equals column 0."""
    predicted = run_random_forest(D,self.root)
    n_correct = np.sum(predicted==D[:,0])
    return n_correct/D.shape[0]

  def predict(self,D):
    """Return the predicted class for every row of D."""
    return run_random_forest(D,self.root)

## Testing the random forest algorithm on the example dataset

In [21]:
# Create a forest with the constructor's default hyper-parameters.
rf =Random_forest()

In [22]:
# Grow all trees on the training split.
rf.train(train_dataset)

In [23]:
# Accuracy (fraction of correctly classified rows) on the test split.
acc = rf.test(test_dataset)
print(acc)

0.80735

In [24]:
# Accuracy (fraction of correctly classified rows) on the training split.
acc = rf.test(train_dataset)
print(acc)

0.772725


In [25]:
# Predicted class label for every row of the test split.
y = rf.predict(test_dataset)