In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
# Load the cleaned 2012-18 player box-score table. The path is relative, so the
# CSV must sit in the notebook's working directory.
df = pd.read_csv('Cleaned_2012-18_playerBoxScore.csv')

In [2]:
df.describe(include='all')

Unnamed: 0,next_gmDays,pre_gmDays,gmTime,teamDayOff,playMin,playHeight,playWeight,playPTS,playAST,playTO,...,opptDiv_Atlantic,opptDiv_Central,opptDiv_Northwest,opptDiv_Pacific,opptDiv_Southeast,opptDiv_Southwest,age,Seats,Year_Open,pre_gm_miles
count,154014.0,154014.0,154014.0,154014.0,154014.0,154014.0,154014.0,154014.0,154014.0,154014.0,...,154014.0,154014.0,154014.0,154014.0,154014.0,154014.0,154014.0,154014.0,154014.0,154014.0
mean,2.349851,2.390562,15.682535,2.076396,22.988676,79.011609,217.932941,9.855974,2.185892,1.297285,...,0.166323,0.166478,0.166972,0.167829,0.166634,0.165764,26.615347,19000.412014,1998.6283,1561.30145
std,1.277117,1.296341,5.880749,1.049777,10.690116,3.431578,25.942307,7.932366,2.532465,1.382802,...,0.372371,0.372511,0.372952,0.373715,0.37265,0.37187,5.073903,1021.014056,11.162248,1783.445556
min,0.0,0.0,1.0,0.0,0.0,69.0,149.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,14.0,16867.0,1966.0,0.0
25%,2.0,2.0,9.0,2.0,15.0,77.0,200.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,23.0,18119.0,1995.0,613.472295
50%,2.0,2.0,19.0,2.0,24.0,79.0,220.0,8.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,26.0,19067.0,1999.0,1091.402326
75%,3.0,3.0,20.0,2.0,31.0,82.0,237.0,15.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,30.0,19800.0,2003.0,1811.182098
max,9.0,9.0,22.0,11.0,60.0,87.0,290.0,62.0,25.0,12.0,...,1.0,1.0,1.0,1.0,1.0,1.0,45.0,20917.0,2018.0,8990.043545


In [3]:
# Target is minutes played ('playMin'); every remaining column is used as a feature.
y = np.array(df.loc[:, 'playMin'])
X = np.array(df.drop(columns=['playMin']))

In [4]:
X.shape

(154014, 84)

In [5]:
y.shape

(154014,)

In [6]:
from sklearn.model_selection import train_test_split

# Two-stage hold-out: 20% of all rows become the test set, then 20% of the
# remaining rows become the validation set (roughly 64/16/20 train/val/test).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, random_state=1, test_size=0.2
)

In [7]:
X_train.shape

(98568, 84)

In [8]:
X_test.shape

(30803, 84)

In [9]:
X_val.shape

(24643, 84)

### 1. Sampling the Features

In [10]:
from sklearn.ensemble import RandomForestRegressor

In [13]:
def feature_sampling_scores(max_features_list):
    """Compare ``max_features`` settings for a random forest on the validation split.

    For every entry of ``max_features_list`` a fresh
    ``RandomForestRegressor(max_features=<entry>, random_state=1)`` is fitted on
    the notebook-level ``X_train`` / ``y_train`` and evaluated with ``.score``
    on the notebook-level ``X_val`` / ``y_val``.

    Note that ``n_estimators`` is not passed, so the forest size is whatever the
    installed scikit-learn uses by default.

    Parameters
    ----------
    max_features_list : iterable
        Values forwarded unchanged as the ``max_features`` argument.

    Returns
    -------
    list of tuple
        ``(max_features, validation_score)`` pairs, in input order.
    """
    def validation_score(setting):
        # Fit on the training split only; the validation split is used purely for scoring.
        forest = RandomForestRegressor(random_state=1, max_features=setting)
        forest.fit(X_train, y_train)
        return forest.score(X_val, y_val)

    return [(setting, validation_score(setting)) for setting in max_features_list]

In [14]:
feature_sampling_scores([0.5,'log2','sqrt',None])



[(0.5, 0.9054320889464907),
 ('log2', 0.8949565393234469),
 ('sqrt', 0.901351332432361),
 (None, 0.9031627258203313)]

From the validation scores above, `max_features=0.5` performs best (≈ 0.905), with `max_features=None` (all features) a close second (≈ 0.903).

### 2. Number of Trees

In [15]:
rfr_n_trees = RandomForestRegressor(n_estimators=50, random_state=1, max_features=0.5)

In [17]:
rfr_n_trees.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=0.5, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=50,
                      n_jobs=None, oob_score=False, random_state=1, verbose=0,
                      warm_start=False)

In [18]:
rfr_n_trees.score(X_val,y_val)

0.9170266754917261

In [19]:
# One row per fitted tree, one column per validation sample.
tree_predictions = np.stack(
    [tree.predict(X_val) for tree in rfr_n_trees.estimators_],
    axis=0,
)

In [20]:
tree_predictions.shape

(50, 24643)

In [22]:
from sklearn.metrics import r2_score
# Validation R^2 of the ensemble formed by averaging only the first k trees,
# for k = 1 .. number of fitted trees.
r2_scores = [
    r2_score(y_val, tree_predictions[:k].mean(axis=0))
    for k in range(1, len(rfr_n_trees.estimators_) + 1)
]

In [24]:
from graph import trace_values, plot
# x-axis: how many trees are included in the averaged prediction (1 .. n_estimators).
x_vals = list(range(1, len(rfr_n_trees.estimators_) + 1))
# trace_values / plot come from the local `graph` helper module; presumably they
# build and render a line trace of (x, y) pairs — TODO confirm against graph.py.
trace = trace_values(x_vals, r2_scores)
plot([trace])

### 3. Number of Leaves (Minimum Samples per Leaf)

In [36]:
# Candidate min_samples_leaf values: every integer from 2 through 50.
min_samples = np.arange(2, 51)
min_samples

array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50])

In [42]:
def num_leaves_wrapped_scores(min_samples, n_estimators=50, n_jobs=None):
    """Sweep ``min_samples_leaf`` and collect the validation score for each value.

    For every entry of ``min_samples`` a fresh ``RandomForestRegressor`` (with
    ``random_state=1``) is fitted on the notebook-level ``X_train`` / ``y_train``
    and evaluated with ``.score`` on the notebook-level ``X_val`` / ``y_val``.

    Parameters
    ----------
    min_samples : iterable
        Values forwarded unchanged as the ``min_samples_leaf`` argument.
    n_estimators : int, default 50
        Number of trees per forest. The default matches the value that was
        previously hard-coded, so existing calls behave as before.
    n_jobs : int or None, default None
        Forwarded to ``RandomForestRegressor``. ``None`` keeps the previous
        single-job behaviour; pass ``-1`` to use all cores, which matters here
        because the full 49-value sweep refits a forest for every value.

    Returns
    -------
    list of float
        One validation score per entry of ``min_samples``, in input order.
        An empty input yields an empty list.
    """
    scores = []
    for min_sample in min_samples:
        rfr = RandomForestRegressor(
            n_estimators=n_estimators,
            min_samples_leaf=min_sample,
            random_state=1,  # fixed seed so scores are comparable across settings
            n_jobs=n_jobs,
        )
        rfr.fit(X_train, y_train)
        scores.append(rfr.score(X_val, y_val))
    return scores

In [44]:
%%time
scores = num_leaves_wrapped_scores(min_samples)

CPU times: user 49min 57s, sys: 7.11 s, total: 50min 4s
Wall time: 50min 12s


In [45]:
scores[:5]

[0.9106822082282149,
 0.9052995845607124,
 0.8997137789678333,
 0.8948712576214528,
 0.8900293158076413]

In [52]:
wrapped_scores = np.column_stack((min_samples, scores))

In [54]:
trace_1 = trace_values(wrapped_scores[:, 0], wrapped_scores[:, 1])

In [55]:
plot([trace_1])

In [56]:
X_val.shape

(24643, 84)

In [57]:
# Training rows first, validation rows appended underneath.
combined_X = np.concatenate((X_train, X_val), axis=0)

In [58]:
# Training targets first, then validation targets.
combined_y = np.hstack((y_train, y_val))

In [60]:
# Final model: a 50-tree forest with max_features=0.5, fitted on the combined
# train + validation data and scored on the held-out test split.
rfr3 = RandomForestRegressor(
    max_features=0.5,
    n_estimators=50,
    random_state=1,
)
rfr3.fit(combined_X, combined_y)
rfr3.score(X_test, y_test)

0.9396544407603628