In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
df = pd.read_csv('Cleaned_2012-18_playerBoxScore.csv')

In [2]:
feature_names = df.drop('playMin', axis=1)
feature_names = list(feature_names.columns)

In [3]:
X = np.array(df.drop('playMin', axis=1))
y = np.array(df['playMin'])

In [4]:
from sklearn.model_selection import train_test_split

X_train, X, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [5]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators = 12, max_features='log2', min_samples_leaf=8).fit(X_train, y_train)

In [6]:
rfr.score(X_val, y_val)

0.8470672066384692

In [7]:
importance_numbers = rfr.feature_importances_

In [8]:
importance_list = importance_numbers.tolist()

In [9]:
importances = np.vstack((feature_names, importance_list))

In [10]:
importances = pd.Series(importance_list, index = feature_names)

In [11]:
importances.sort_values(ascending=False)

play2PA             0.106369
playFGM             0.098441
playFG%             0.095231
IsStarter           0.084286
playFGA             0.073216
                      ...   
playPos_F           0.000000
teamLocIsHome       0.000000
gmIs_year_start     0.000000
playBIs_year_end    0.000000
opptLocIsHome       0.000000
Length: 84, dtype: float64

In [12]:
importances = ['playFGA','IsStarter','play2PM','playPTS','playPTS']

In [35]:
df_X_train = pd.DataFrame(X_train, columns=feature_names)
X_val = pd.DataFrame(X_val, columns=feature_names)

In [40]:
df_y_train = pd.DataFrame(y_train, columns=['playMin'])

In [44]:
y_importances_train = df_y_train

In [42]:
y_importances_train = df_y_train

In [26]:
X_importances_train = df_X_train[importances]

In [28]:
X_val_importances = X_val[importances]

In [29]:
X_importances_train.shape

(98568, 5)

In [30]:
X_val_importances.shape

(24643, 5)

In [45]:
rfr_importances = RandomForestRegressor(n_estimators = 12, max_features='log2', min_samples_leaf=8).fit(X_importances_train, y_importances_train)

  """Entry point for launching an IPython kernel.


In [46]:
rfr_importances.score(X_val_importances, y_val)

0.7663514454376044

### An alternative mechanism

In [51]:
import eli5
from eli5.sklearn import PermutationImportance

In [57]:
%%time
perm = PermutationImportance(rfr).fit(X_val, y_val)
eli5.show_weights(perm, feature_names=feature_names)

Weight,Feature
0.0863  ± 0.0015,IsStarter
0.0515  ± 0.0006,playFGA
0.0322  ± 0.0012,playPTS
0.0313  ± 0.0012,playAST
0.0312  ± 0.0009,play2PA
0.0302  ± 0.0010,playFGM
0.0291  ± 0.0006,playDRB
0.0208  ± 0.0004,playTRB
0.0194  ± 0.0005,play3PA
0.0161  ± 0.0007,playFG%


In [58]:
selected_permutation = ['IsStarter','playFGA','playPTS','playAST']

In [61]:
X_val_permute = X_val[selected_permutation]
selected_X_train = df_X_train[selected_permutation]

In [60]:
from sklearn.ensemble import RandomForestRegressor
rfr_permutation = RandomForestRegressor(n_estimators = 12, max_features='log2', min_samples_leaf=8).fit(selected_X_train, y_train)

In [62]:
rfr_permutation.score(X_val_permute, y_val)

0.7852314352477748

In [63]:
eli5.show_weights(rfr_permutation, feature_names=selected_permutation)

Weight,Feature
0.5686  ± 0.4754,playFGA
0.2294  ± 0.3637,IsStarter
0.1591  ± 0.3829,playPTS
0.0429  ± 0.0205,playAST
