In [2]:
import pandas as pd
import numpy as np

In [3]:
# load the hitters data from a CSV file (relative path)
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which
# looks like a saved row index -- confirm whether index_col=0 should be used
df = pd.read_csv('../Data/hitters.csv')

In [4]:
# preview the first few rows to check the columns and sample values
df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N


How Does a Random Forest Split Data?

In [None]:
# loss function: mean squared error of the selected values around their own mean
def mse(idx, target):
    """Return the mean squared deviation of ``target[idx]`` from its mean.

    idx    -- indexer applied to ``target`` as ``target[idx]``
    target -- indexable collection of numeric values
    """
    selected = target[idx]
    deviations = selected - selected.mean()
    return np.mean(deviations ** 2)

In [17]:
# drop the rows whose target (Salary) is missing
df = df[df['Salary'].notnull()]
# features: every column other than Salary
X = df.drop('Salary', axis=1)
# keep only the feature columns that have no missing values
X = X.loc[:, X.notnull().all()]
# target
y = df['Salary']

In [15]:
# baseline loss: the MSE of Salary over every row, before any split
total_error = mse(X.index, y)

In [18]:
# a sample condition to split the dataset on: RBI below 100
query = df.RBI < 100

In [19]:
# boolean mask: True or False for every single row
query

1       True
2       True
3       True
4       True
5       True
6       True
7       True
8       True
9       True
10      True
11      True
12      True
13      True
14      True
16      True
17      True
19      True
20      True
21      True
23      True
24     False
25      True
26      True
27      True
28      True
29      True
31      True
33      True
34      True
35      True
       ...  
287     True
288     True
289     True
290     True
291     True
293     True
294     True
295     True
296     True
297     True
299     True
300     True
301     True
303     True
304     True
306     True
307     True
308     True
309     True
310     True
311     True
312     True
313     True
314     True
315    False
317     True
318     True
319     True
320     True
321     True
Name: RBI, Length: 263, dtype: bool

In [20]:
# partition the rows with the mask: rows where the condition holds go left,
# the remaining rows go right
left = df.loc[query].copy()
right = df.loc[~query].copy()

In [21]:
# right holds the rows with RBI >= 100; left holds the rows with RBI < 100
right.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
24,629,168,18,73,102,40,18,8424,2464,164,1008,1072,402,A,E,1067,157,14,776.667,A
82,677,238,31,117,113,53,5,2223,737,93,349,401,171,A,E,1377,100,6,1975.0,A
86,637,174,31,89,116,56,14,6727,2024,247,978,1093,495,N,W,278,9,9,1041.667,N
96,565,148,24,90,104,77,14,7287,2083,305,1135,1234,791,A,E,292,9,5,1861.46,A
108,641,198,31,101,108,41,5,2129,610,92,297,319,117,A,E,269,17,10,1175.0,A


In [22]:
# loss of the left side: MSE of Salary over the rows in left
left_error = mse(left.index, y)

In [23]:
# loss of the right side: MSE of Salary over the rows in right
right_error = mse(right.index, y)

In [26]:
# mean squared error for the left split (RBI < 100)
left_error

168406.68785966877

In [27]:
# mean squared error for the right split (RBI >= 100)
right_error

564282.2894012296

In [30]:
# how many rows the (filtered) dataset has in total
df_size = len(df)

In [31]:
# how many rows landed on each side of the split
left_size = len(left)
right_size = len(right)

In [32]:
# number of rows on the right side of the split
right_size

14

In [33]:
# number of rows on the left side of the split
left_size

249

In [34]:
# loss over the whole dataset, for comparison with the two sides
total_error

202734.26915834736

In [35]:
# info gain: the total error minus each side's error weighted by the share
# of rows that side received
right_term = (right_size / df_size) * right_error
left_term = (left_size / df_size) * left_error
info_gain = total_error - right_term - left_term

In [36]:
# this is the value used to evaluate the quality of a split:
# the bigger the number, the better the split
info_gain

13254.355361105001

In [37]:
# scikit-learn's tree module provides the single decision-tree estimators
from sklearn.tree import DecisionTreeRegressor

# a regression tree created with the default hyperparameters
tree = DecisionTreeRegressor()

In [38]:
# list the tree's hyperparameters and their current values
tree.get_params()

{'criterion': 'mse',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': None,
 'splitter': 'best'}

In [39]:
# to ensemble many trees, use a random forest
from sklearn.ensemble import RandomForestRegressor

# default hyperparameters; random_state is left unset, so the fitted trees
# and their predictions can differ from run to run
rf = RandomForestRegressor()

In [43]:
# keep only the numeric feature columns; non-numeric columns are dropped,
# not encoded
X = X.select_dtypes(include=np.number)

In [40]:
# hyperparameters commonly tuned: max_depth, min_samples_leaf, n_estimators
rf.get_params()

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [41]:
# max_features given as a float is the fraction of columns considered at each split
rf.max_features = 0.5

In [44]:
# fit the forest on the numeric features X and the Salary target y
rf.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [45]:
# the fitted forest keeps its individual decision trees in estimators_
rf.estimators_

[DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=0.5,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=671409970, splitter='best'),
 DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=0.5,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1895247431, splitter='best'),
 DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=0.5,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=2007981974, splitter='best'),
 DecisionTreeRegressor(criter

In [46]:
# the forest's prediction for the first sample (X[:1] is a one-row slice)
rf.predict(X[:1])

array([588.])

In [47]:
# ask every tree in the forest for its own prediction on the first sample
tree_preds = [
    estimator.predict(X[:1])
    for estimator in rf.estimators_
]

In [48]:
# one single-element prediction array per tree
tree_preds

[array([475.]),
 array([740.]),
 array([430.]),
 array([1300.]),
 array([475.]),
 array([475.]),
 array([475.]),
 array([475.]),
 array([560.]),
 array([475.])]

In [49]:
# averaging the per-tree predictions gives the forest's final prediction
np.mean(tree_preds)

588.0