## Model Evaluation - Random Forest Models on Diff Data
USA World Series Results,
Run on "Diff" data

# @To Do

- [ ] Randomize data and rebuild model
    * Limit to very simple tuning, so as not to overfit
    * n_estimators = 100 to 300-400
    * 5-fold or 6-fold CV
    * max_features = 5 or 6
- [ ] Merge new data from validation set into full data set
- [ ] Explore relationship between Possession Time + Attacking Rucks + Passes

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load USA's differential data; 'Result' is the column predicted later on.
df = pd.read_csv('../data/output/new_features_diffdata.csv')

# Load the validation data as well. Later cells shuffle `valdf` and build `val_data`
# from it, so leaving this read commented out makes a fresh-kernel run fail with NameError.
valdf = pd.read_csv('../data/output/new_features_diffdata_validate.csv')

# Keep the preview as the last expression so the cell still displays it.
df.head()

Unnamed: 0,Opp,Tournament,Poss_Time_Diff,Score_Diff,Conv_Diff,Tries_Diff,Passes_Diff,Contestable_KO_Win_pct_Diff,PenFK_Against_Diff,RuckMaul_Diff,...,-99 : -75,-74 : -25,-24 : -1,0 : 25,26 : 50,51 : 75,76 : 100,101 : 125,126 : 150,Result
0,AUSTRALIA,2015_Cape_Town,13.96648,-10.638298,-14.285714,0.25,25.925926,-50.0,0.0,0.0,...,0.0,-12.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,WALES,2015_Cape_Town,7.471264,15.555556,14.285714,0.083333,27.868852,25.0,-20.0,-100.0,...,0.0,0.0,0.0,12.5,0.0,0.0,0.0,0.0,0.0,1
2,KENYA,2015_Cape_Town,-33.136095,-44.444444,-33.333333,-0.75,-10.638298,-16.666667,66.666667,60.0,...,0.0,0.0,-5.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,NEW ZEALAND,2015_Cape_Town,51.758794,33.333333,33.333333,0.0,76.119403,-75.0,-50.0,-100.0,...,-37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,FIJI,2015_Cape_Town,12.880562,-20.833333,-25.0,0.266667,38.461538,-66.666667,-33.333333,-33.333333,...,0.0,-12.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## Randomize Data

In [75]:
# Shuffle the row order of both frames. The seed is fixed so that the shuffled order --
# and therefore the train/test split later drawn from `df` -- is the same on every run.
# Note: shuffling only reorders rows; on its own it does not guard against overfitting.
from sklearn.utils import shuffle
df = shuffle(df, random_state=101)
valdf = shuffle(valdf, random_state=101)

In [46]:
# Diagnostic: list every column name in the loaded frame.
df.columns.tolist()

['Opp',
 'Tournament',
 'Poss_Time_Diff',
 'Score_Diff',
 'Conv_Diff',
 'Tries_Diff',
 'Passes_Diff',
 'Contestable_KO_Win_pct_Diff',
 'PenFK_Against_Diff',
 'RuckMaul_Diff',
 'Ruck_Win_pct_Diff',
 'Cards_diff',
 'Lineout_Win_Pct_Diff',
 'Scrum_Win_Pct_Diff',
 '-175 : -150',
 '-149 : -125',
 '-124 : -100',
 '-99 : -75',
 '-74 : -25',
 '-24 : -1',
 '0 : 25',
 '26 : 50',
 '51 : 75',
 '76 : 100',
 '101 : 125',
 '126 : 150',
 'Result']

In [76]:
from sklearn.model_selection import train_test_split

In [77]:
# Columns left out of the model inputs because they are unnecessary (e.g. the string
# columns 'Opp' and 'Tournament') or would bias the prediction.
droplist = [
    'Opp', 'Score_Diff', 'Tries_Diff', 'Tournament', 'Conv_Diff',
    '-175 : -150', '-149 : -125', '-124 : -100', '-99 : -75', '-74 : -25', '-24 : -1',
    '0 : 25', '26 : 50', '51 : 75', '76 : 100', '101 : 125', '126 : 150',
]

# Remove those columns, then keep only rows whose Result is not 2 (ties),
# since that third label gets in the way of the classification models.
rf_data = df.drop(labels=droplist, axis='columns')
rf_data = rf_data.loc[rf_data.Result != 2]

# Label to predict ('Result') and the remaining predictor columns.
y = rf_data['Result']
X = rf_data.drop(labels='Result', axis='columns')

In [78]:
# Preview the modelling frame after the column and tie filtering above.
rf_data.head()
#Check to ensure 'Result' only contains 2 values (W, L)
#rf_data['Result'].describe()
#rf_data.describe()

Unnamed: 0,Poss_Time_Diff,Passes_Diff,Contestable_KO_Win_pct_Diff,PenFK_Against_Diff,RuckMaul_Diff,Ruck_Win_pct_Diff,Cards_diff,Lineout_Win_Pct_Diff,Scrum_Win_Pct_Diff,Result
108,45.5,63.380282,-66.666667,-50.0,-100.0,0.333333,0.0,1.0,0.0,0
134,36.492891,50.684932,25.0,-100.0,-100.0,0.0,100.0,1.0,0.0,1
44,24.463519,33.333333,0.0,-9.090909,0.0,0.208333,0.0,0.0,0.0,0
92,-6.264501,-13.043478,0.0,100.0,100.0,0.0,0.0,-1.0,1.0,1
132,44.139651,58.823529,100.0,-14.285714,100.0,-0.083333,0.0,0.5,0.0,1


In [57]:
#list(rf_data.columns) 

In [79]:
# Separate the label column ('Result') from the predictor columns.
y = rf_data['Result']
X = rf_data.drop(labels='Result', axis='columns')

In [80]:
# Hold out 30% of the rows as a test set; the separately imported data serves as the
# validation set. random_state is fixed so the split is repeatable for a given row order.
# (Original note: 156 rows in the original dataframe.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

### Validation set
Imported last two series stops (London & Paris, 2018 - 12 matches total) to use as a validation set.

Need to also incorporate into larger data set later.

In [81]:
# Apply the same column removal used for the training frame to the validation frame.
val_data = valdf.drop(labels=droplist, axis='columns')

# Leave out ties; the code identifies them as Result == 2.
val_data = val_data.loc[val_data.Result != 2]

# Label to predict ('Result') and the remaining predictor columns.
val_y = val_data['Result']
val_X = val_data.drop(labels='Result', axis='columns')

## Random Forest

In [59]:
from pprint import pprint

from sklearn.ensemble import RandomForestClassifier

# Seeded classifier with otherwise default settings. It is only instantiated here,
# not fitted; the grid search cell later uses it as its estimator.
rf = RandomForestClassifier(random_state=101)

# Show the parameter values this forest currently carries.
print('Default Parameters currently in use:\n')
pprint(rf.get_params())

Default Parameters currently in use:

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 101,
 'verbose': 0,
 'warm_start': False}


## Evaluate Base Model

In [64]:
# accuracy_score is used in this cell, but the notebook only imports it in a much later
# cell together with the other metrics. Import it here so a top-to-bottom run on a
# fresh kernel does not stop with NameError.
from sklearn.metrics import accuracy_score

# Baseline: a seeded 10-tree forest fitted on the training split.
base_model = RandomForestClassifier(n_estimators = 10, random_state = 101)
base_model.fit(X_train, y_train)
predictions = base_model.predict(X_test)

# Accuracy on the rows the model was fitted on vs. the held-out test rows;
# a large gap between the two points to overfitting.
base_train_acc = accuracy_score(y_train, base_model.predict(X_train))
base_test_acc = accuracy_score(y_test, predictions)
print("Base Model")
print(base_train_acc)
print(base_test_acc)

Base Model
0.971428571429
0.608695652174


### Hyperparameters
* n_estimators = number of trees in the forest
* max_features = max number of features considered for splitting a node
* max_depth = max number of levels in each decision tree
* min_samples_split = min number of data points placed in a node before the node is split
* min_samples_leaf = min number of data points allowed in a leaf node
* bootstrap = method for sampling data points (with or without replacement)

### Grid Search with Cross Validation
Random search allowed us to narrow down the range for each hyperparameter. Now that we know where to concentrate our search, we can explicitly specify every combination of settings to try. We do this with GridSearchCV, a method that, instead of sampling randomly from a distribution, evaluates all combinations we define.

In [62]:
from sklearn.model_selection import GridSearchCV

# Candidate values per hyperparameter. The search tries every combination:
# 1 * 6 * 3 * 5 * 5 * 4 = 1800 candidates, each evaluated with 5-fold CV.
param_grid = dict(
    bootstrap=[True],
    max_depth=list(range(60, 111, 10)),        # 60, 70, ..., 110
    max_features=[4, 5, 6],
    min_samples_leaf=list(range(1, 6)),        # 1, 2, 3, 4, 5
    min_samples_split=[2, 5, 8, 10, 12],
    n_estimators=list(range(100, 401, 100)),   # 100, 200, 300, 400
)

# Grid search over the seeded forest `rf` created in an earlier cell,
# using 5-fold cross-validation, all available cores, and verbose progress output.
grid_search = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1, verbose=2)

In [None]:
# Run the search on the training split and keep the best estimator it found.
grid_search.fit(X_train, y_train)
best_grid = grid_search.best_estimator_

# Score the selected forest on the training rows and on the held-out test rows.
grid_predict = best_grid.predict(X_test)
grid_train_acc = accuracy_score(y_train, best_grid.predict(X_train))
grid_test_acc = accuracy_score(y_test, grid_predict)

# Report the baseline first, then the tuned model: a heading followed by
# train accuracy and test accuracy, one value per line.
print("Base Model", base_train_acc, base_test_acc, sep="\n")
print("\n")
print("Grid Search Model", grid_train_acc, grid_test_acc, sep="\n")

#print('Improvement of {:0.2f}%.'.format( 100 * (grid_test_acc - base_test_acc) / base_test_acc))

### Output
**Base Model**  
1.0  
0.45652173913

**Grid Search Model**  
0.895238095238  
0.565217391304

In [65]:
# Relative change in test accuracy of the tuned model versus the baseline, in percent.
pct_change = 100 * (grid_test_acc - base_test_acc) / base_test_acc
print('Improvement of {:0.2f}%.'.format(pct_change))

Improvement of -7.14%.


### Grid Search Accuracy Results
**Base Model**
```
1.0
0.45652173913
```
**Grid Search Model**
```
1.0
0.50
```

***Improvement of 9.52%, to 50%***

In [66]:
# Display the hyperparameter combination the grid search selected.
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 60,
 'max_features': 4,
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'n_estimators': 100}

In [71]:
# Examine the best model: the estimator itself, its cross-validated score,
# and the parameter combination that produced it. The empty string passed as the
# last item of the first two calls emits the blank separator line.
print('Best Estimator:', grid_search.best_estimator_, '', sep='\n')
print('Best Score:', grid_search.best_score_, '', sep='\n')
print('Best Parameters:', grid_search.best_params_, sep='\n')

Best Estimator:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=60, max_features=4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=101, verbose=0, warm_start=False)

Best Score:
0.704761904762

Best Parameters:
{'bootstrap': True, 'max_depth': 60, 'max_features': 4, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 100}


**Best Estimator**
```
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=60, max_features=4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=101, verbose=0, warm_start=False)
```

**Best Score**
```
0.704761904762
```
**Best Parameters**
```
{'bootstrap': True, 'max_depth': 60, 'max_features': 4, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 100}
```

### Use new parameters from gridsearch to create and fit model

In [89]:
# Rebuild the forest with the hyperparameters the grid search selected.
# Only the tuned values and the seed are passed. Every other argument in the pasted
# estimator repr equals the library default shown in the "Default Parameters" output,
# so omitting them gives the same model. `min_impurity_split` in particular is no longer
# passed, because newer scikit-learn releases removed that argument and would raise
# TypeError on it.
rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=60,
    max_features=4,
    min_samples_leaf=5,
    min_samples_split=2,
    bootstrap=True,
    random_state=101,
)

# Fit on the training split.
rfc.fit(X_train, y_train)

# Predicted labels for the held-out test split.
rfc_pred = rfc.predict(X_test)

## Random Forest Model Eval

In [90]:
# Fraction of held-out test rows the tuned forest labels correctly.
rfc_acc = accuracy_score(y_true=y_test, y_pred=rfc_pred)
print(rfc_acc)


0.652173913043


In [91]:
# One row per predictor column, holding the fitted forest's importance for it,
# ordered from most to least important.
feature_importances = pd.DataFrame(
    {'importance': rfc.feature_importances_},
    index=X_train.columns,
).sort_values(by='importance', ascending=False)

print("Feature Importance")
print(feature_importances)

Feature Importance
                             importance
Poss_Time_Diff                 0.259110
PenFK_Against_Diff             0.170337
Contestable_KO_Win_pct_Diff    0.140098
Passes_Diff                    0.126238
Ruck_Win_pct_Diff              0.115913
Scrum_Win_Pct_Diff             0.066672
RuckMaul_Diff                  0.059296
Lineout_Win_Pct_Diff           0.051486
Cards_diff                     0.010850


In [92]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

#import libraries to ignore UndefinedMetricWarning
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

# Confusion matrix of true test labels against the tuned forest's predictions.
print("Confusion Matrix")
print(confusion_matrix(y_test,rfc_pred))

# Per-class precision / recall / f1 on the test split.
print("\n")
print("Classification Report")
print(classification_report(y_test,rfc_pred))

# Overall test accuracy. The previous bare accuracy_score(...) call was removed:
# its result was discarded, and rfc.score reports the same figure here.
print("\n")
print("Accuracy Score")
print(rfc.score(X_test, y_test))

Confusion Matrix
[[10 14]
 [ 2 20]]


Classification Report
             precision    recall  f1-score   support

          0       0.83      0.42      0.56        24
          1       0.59      0.91      0.71        22

avg / total       0.72      0.65      0.63        46



Accuracy Score
0.652173913043


## Predict on Validation Set

In [93]:
# Predict labels for the validation predictors (val_X) with the fitted forest `rfc`;
# the true labels (val_y) are compared against these predictions in the cells that follow.
rfc_val_pred = rfc.predict(val_X)

In [94]:
# Fraction of validation rows the tuned forest labels correctly.
rfc_val_acc = accuracy_score(y_true=val_y, y_pred=rfc_val_pred)
print(rfc_val_acc)

0.8


In [95]:
# Validation-set report. This cell relies on the test-set evaluation cell having run
# first: that cell imports confusion_matrix / classification_report and already installs
# the UndefinedMetricWarning filter, so neither is repeated here.

# Confusion matrix of true validation labels against the tuned forest's predictions.
print("Confusion Matrix")
print(confusion_matrix(val_y, rfc_val_pred))

# Per-class precision / recall / f1 on the validation set.
print("\n")
print("Classification Report")
print(classification_report(val_y, rfc_val_pred))

# Overall validation accuracy. The previous bare accuracy_score(...) call was removed:
# its result was discarded, and rfc.score reports the same figure here.
print("\n")
print("Accuracy Score")
print(rfc.score(val_X, val_y))

Confusion Matrix
[[4 0]
 [2 4]]


Classification Report
             precision    recall  f1-score   support

          0       0.67      1.00      0.80         4
          1       1.00      0.67      0.80         6

avg / total       0.87      0.80      0.80        10



Accuracy Score
0.8
