# Bagging, Random Forests, Boosting

Utilice la clase `regression_tree` de `regression_tree.py` para entrenar un arbol de decisión en los datos de `auto.csv`

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from regression_tree import regression_tree

In [2]:
auto_dataset = pd.read_csv('../data/auto.csv')

In [3]:
auto_dataset.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [4]:
train_dataset = auto_dataset[['mpg', 'cylinders', 'displacement', 'weight', 'acceleration', 'year', 'origin']]

In [10]:
rnd = np.random.rand(len(train_dataset))
train_df = train_dataset[rnd < 0.8]
test_df = train_dataset[rnd >= 0.8]

In [13]:
len(train_df), len(test_df)

(319, 78)

Cree una función que regrese el error cuadrado medio del modelo.

In [28]:
def rms_error(y, yhat):
    return (y - yhat).apply(lambda x: x*x).mean()

In [29]:
auto_tree = regression_tree()
auto_tree.fit(train_df, 'mpg', alpha=0, min_points_per_leaf=1)

In [30]:
rms_error(train_df.mpg, auto_tree.predict(train_df))

0.74250000000000005

In [31]:
rms_error(test_df.mpg, auto_tree.predict(test_df))

10.691525890313393

In [32]:
pd.DataFrame({'y':test_df.mpg, 'yhat': auto_tree.predict(test_df)})

Unnamed: 0,y,yhat
1,15.0,14.750000
5,15.0,13.750000
7,14.0,13.750000
20,25.0,24.500000
21,24.0,24.500000
24,21.0,21.200000
26,10.0,13.000000
28,9.0,10.500000
31,25.0,23.250000
34,16.0,17.500000


# Bootstrap

Cree una función que genere muestras de datos con reemplazo.

In [20]:
def bootstrap_sample(df, N=None):
    if not N:
        N = len(df)
    selection = np.random.choice(df.index, size=N)
    return df.loc[selection]

In [21]:
bootstrap_sample(auto_dataset).head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
140,14.0,8,304.0,150,4257,15.5,74,1,amc matador (sw)
361,25.4,6,168.0,116,2900,12.6,81,3,toyota cressida
154,15.0,6,250.0,72,3432,21.0,75,1,mercury monarch
326,43.4,4,90.0,48,2335,23.7,80,2,vw dasher (diesel)
110,22.0,4,108.0,94,2379,16.5,73,3,datsun 610


# Random Forrest
Cree una clase que tenga la misma interfaz de `regression_tree`, es decir, una función **fit** que entrene **B** arbóles de decisión, y una función **predict** que evalúe los arboles y regrese
$$ \frac{1}{B}\sum_{b=1}^B \hat f^b (x) $$

In [22]:
class random_forrest(object):
    def __init__(self):
        self.trees = []

    def fit(self, df, y, B=10, predictors=None, alpha=1, min_points_per_leaf=5):
        if not predictors:
            predictors = list(df.columns)
            predictors.remove(y)
        self.trees = []
        for b in range(B):
            tree = regression_tree()
            df_new = bootstrap_sample(df)
            predictors_new = np.random.choice(predictors,
                                              size=int(np.sqrt(len(predictors))),
                                              replace=False)
            tree.fit(df_new, y, predictors=predictors, 
                                alpha=alpha,
                                min_points_per_leaf=min_points_per_leaf)
            self.trees.append(tree)

    def predict(self, df):
        prediction = pd.Series([0]*len(df), index=df.index)
        for tree in self.trees:
            prediction += tree.predict(df)
        return prediction/len(self.trees)

In [23]:
model = random_forrest()
predictors = ['cylinders', 'displacement', 'weight', 'acceleration', 'year', 'origin']
model.fit(train_df, 'mpg', 3, predictors=predictors, alpha=0.5, min_points_per_leaf=50)

In [24]:
yhat = model.predict(test_df)
yhat.head()

1     16.555556
5     13.933333
7     13.933333
20    22.000000
21    23.833333
dtype: float64

In [33]:
rms_error(train_df.mpg, model.predict(train_df))

3.2332222927955905

In [34]:
rms_error(test_df.mpg, yhat)

7.3746897080753131

# Boosted Decision Trees
Cree una clase que entrene arboles de decisión siguiendo el algoritmo de boosting:
1. Set $\hat f(x)$ and $r_i=y_i$ for all $i$ in the training set.
2. For $b = 1,2,...,B$, repeat:
   1. Fit a tree $\hat f^b$ with $d$ splits ($d+1$ terminal nodes) to the training data $(X, r)$
   2. Update $\hat f$ by adding in a shrunken verson fo the new tree: $\hat f(x) \leftarrow \hat f(x) + \lambda\hat f^b(x)$
   3. Update the residuals: $r_i \leftarrow r_i - \lambda\hat f^b(x_i)$
3. Output the boosted model:
    $$\hat f(x) = \sum_{b=1}^B \lambda\hat f^b(x)$$

In [None]:
class boosted_decision_trees(object):
    def __init__(self):
        pass
    
    def fit(self, df, ...):
        pass
    
    def predict(self, df);
        pass