# Application of Bootstrap samples in Random Forest

In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import random
#from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error
from prettytable import PrettyTable

 <li> Load the digits dataset </li>

In [2]:
df = pd.read_csv('Boston.csv')
df = df.drop(['Unnamed: 0'], axis=1)

In [3]:
df.head(2)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6


In [4]:
x = df.drop(['medv'], axis=1) #independent variables
y = df['medv'] #target variable

In [5]:
x.shape

(506, 13)

In [6]:
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


### 1
<font color='red'><b>Step 1 Creating samples: </b></font> Randomly create 30 samples from the whole boston data points.
<ol>
<li>Creating each sample: Consider any random 303(60% of 506) data points from whole data set and then replicate any 203 points from the sampled points</li>
<li>Ex: For better understanding of this procedure lets check this examples, assume we have 10 data points [1,2,3,4,5,6,7,8,9,10], first we take 6 data points randomly consider we have selected [4, 5, 7, 8, 9, 3] now we will replicate 4 points from [4, 5, 7, 8, 9, 3], consider they are [5, 8, 3,7] so our final sample will be [4, 5, 7, 8, 9, 3, 5, 8, 3,7]</li>
<li> we create 30 samples like this </li>
<li> Note that as a part of the Bagging when you are taking the random samples make sure each of the sample will have different set of columns</li>
<li> Ex: assume we have 10 columns for the first sample we will select [3, 4, 5, 9, 1, 2] and for the second sample [7, 9, 1, 4, 5, 6, 2] and so on...</li>
<li> Make sure each sample will have atleast 3 feautres/columns/attributes</li>
</ol>

<font color='red'><b>Step 2 Building High Variance Models on each of the sample and finding train MSE value:</b></font> Build a DecisionTreeRegressor on each of the sample.
<ol><li>Build a regression trees on each of 30 samples.</li>
<li>computed the predicted values of each data point(506 data points) in your corpus.</li>
<li> predicted house price of $i^{th}$ data point $y^{i}_{pred} =  \frac{1}{30}\sum_{k=1}^{30}(\text{predicted value of } x^{i} \text{ with } k^{th} \text{ model})$.</li>
<li>Now calculate the $MSE =  \frac{1}{506}\sum_{i=1}^{506}(y^{i} - y^{i}_{pred})^{2}$.</li>
</ol>

<font color='red'><b>Step 3 Calculating the OOB score :</b></font>
<ol>
<li>Computed the predicted values of each data point(506 data points) in your corpus.</li>
<li>Predicted house price of $i^{th}$ data point $y^{i}_{pred} =  \frac{1}{k}\sum_{\text{k= model which was buit on samples not included } x^{i}}(\text{predicted value of } x^{i} \text{ with } k^{th} \text{ model})$.</li>
<li>Now calculate the $OOB Score =  \frac{1}{506}\sum_{i=1}^{506}(y^{i} - y^{i}_{pred})^{2}$.</li>
</ol>

### 2
<pre>
<font color='red'><b>Computing CI of OOB Score and Train MSE</b></font>
<ol>
<li> Repeat Task 1 for 35 times, and for each iteration store the Train MSE and OOB score </li>
<li> After this we will have 35 Train MSE values and 35 OOB scores </li>
<li> using these 35 values (assume like a sample) find the confidence intervals of MSE and OOB Score </li>
<li> you need to report CI of MSE and CI of OOB Score </li>
<li> Note: Refer the Central_Limit_theorem.ipynb to check how to find the confidence interval</li>
</ol>
</pre>
### 3
<pre>
<font color='red'><b>Given a single query point predict the price of house.</b></font>

<li>Consider xq= [0.18,20.0,5.00,0.0,0.421,5.60,72.2,7.95,7.0,30.0,19.1,372.13,18.60] Predict the house price for this point as mentioned in the step 2 of Task 1. </li>
</pre>

##  1

### Step 1: Creating Samples

In [7]:
def create_samples(df, samples=30):
    len_of_rows = df.shape[0]
    len_of_cols = df.shape[1]-1
    sampled_df = {}
    
    for i in range(1, samples+1):
        no_of_col, sixty_percent, = 0, 0
        sample_cols, sample_rows, second_sample_rows = [], [], []
        no_of_col = random.randint(3, len_of_cols)
        sample_cols = random.sample(range(len_of_cols), no_of_col)

        sixty_percent = (len_of_rows*60//100)
        sample_rows = random.sample(range(len_of_rows), sixty_percent)
        second_sample_rows = random.sample(range(len(sample_rows)), len_of_rows-sixty_percent)
        sample_rows.extend(second_sample_rows)
        
        sampled_df["df_"+str(i)] = [sample_rows, sample_cols]
    return sampled_df

In [8]:
sampled_dfs = create_samples(df, 30)

In [9]:
#for x, y in sampled_dfs.items():
#    print(x,y)

### Step 2 : Building High Variance Models on each of the sample and finding train MSE value

In [10]:
def building_tree(df, sampled_dfs):
    pred, models = [], []
    y = df['medv'].values
    X = df.drop(['medv'], axis=1)
    #print(X.shape, y.shape)
    
    for name, sdf in sampled_dfs.items():
        sample_y = df['medv'][sdf[0]].values
        sample_X = df.iloc[sdf[0], sdf[1]].values
        X_test = df.iloc[:, sdf[1]].values
        
        #print(name, sample_X.shape, sample_y.shape, X_test.shape)
        #print(name, sample_X[0], sample_y[0])
        
        # train test split
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=42)
        dtr = DecisionTreeRegressor()
        dtr.fit(sample_X, sample_y)
        
        y_test_pred = dtr.predict(X_test)
        pred.append(y_test_pred)
        models.append(dtr)
        
    avg_pred = np.array(pred).mean(axis=0)
    return mean_squared_error(y, avg_pred), models

In [11]:
mse, models = building_tree(df, sampled_dfs)
mse

2.2516328116582485

### Step 3 : Calculating the OOB score

In [12]:
def check_nan(arr):
    mean = np.nanmean(arr)
    arr = [mean if np.isnan(x) else x for x in arr]    
    return arr

def oob_score(models, df, sampled_dfs):
    y = df['medv'].values
    X = df.drop(['medv'], axis=1)

    y_pred = []
    for i, x in df.iterrows():
        predctns = []
        x = x.values
            
        for idx, samples in enumerate(sampled_dfs.values()):
            if i not in samples[0]:
                X_test = x[samples[1]].reshape(1, -1)
                pred = models[idx].predict(X_test)
                predctns.append(pred[0])
        
        y_pred.append(np.array(predctns).mean())
    y_pred = check_nan(y_pred)
    return mean_squared_error(y, y_pred)
    

In [13]:
oob_score(models, df, sampled_dfs)

18.75015602604405

## 2 : Computing CI of OOB Score and Train MSE

In [14]:
train_mses, oob_scores = [], []
for x in range(0,35):
    sampled_dfs = create_samples(df, samples=30)
    mse, models = building_tree(df, sampled_dfs)
    oob = oob_score(models, df, sampled_dfs)
    train_mses.append(mse)
    oob_scores.append(oob)
    
train_mses = np.array(train_mses)
oob_scores = np.array(oob_scores)

In [15]:
def c_i(sample):
    sample_mean = sample.mean()
    sample_std =  sample.std()
    sample_size = len(sample)

    # here we are using sample standard deviation instead of population standard deviation
    left_limit  = np.round(sample_mean - 2*(sample_std/np.sqrt(sample_size)), 3)
    right_limit = np.round(sample_mean + 2*(sample_std/np.sqrt(sample_size)), 3)
    return (left_limit, sample_mean, right_limit)


In [16]:
print("Train MSE C.I. : ", c_i(train_mses))

Train MSE C.I. :  (1.587, 1.6766912591913734, 1.767)


In [17]:
print("OOB Score C.I. : ", c_i(oob_scores))

OOB Score C.I. :  (16.978, 17.53173218871786, 18.086)


## 3 Given a single query point predict the price of house.

In [18]:
def predict_single_sample(xq, sampled_dfs, models):
    pred = []
    xq = np.array(xq)
    for idx, sdf in enumerate(sampled_dfs.values()):
        X_test = xq[sdf[1]].reshape(1, -1)
        
        y_test_pred = models[idx].predict(X_test)
        pred.append(y_test_pred[0])
        
    avg_pred = np.array(pred).mean()
    return avg_pred

In [19]:
xq = [0.18,20.0,5.00,0.0,0.421,5.60,72.2,7.95,7.0,30.0,19.1,372.13,18.60]
print("Predicted house price is : ", predict_single_sample(xq, sampled_dfs, models))

Predicted house price is :  20.68933333333333
