In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load the sample of plays used throughout the rest of the notebook.
csv_path = "../data/decision_tree_sample.csv"
dataset = pd.read_csv(csv_path)

Variables to track:

|Variable | Description|
|:--|:--|
|Down          |The current down (1st, 2nd, 3rd, or 4th)|
|Score         |Difference in score between the two teams|
|Seconds       |Number of seconds remaining in the game|
|AdjustedScore |Score / sqrt(Seconds + 1)|
|Spread        |Las Vegas pre-game point spread|
|TIMO          |Timeouts remaining for the offense|
|TIMD          |Timeouts remaining for the defense|
|TOTp          |Total points scored|
|Yardline      |Yards from own goal line|
|YTG           |Yards to go for a first down|

In [3]:
# List every column in the loaded frame so the model features can be picked by name.
dataset.columns

Index(['id', 'offense', 'offense_conference', 'defense', 'defense_conference',
       'home', 'away', 'offense_score', 'defense_score', 'game_id', 'drive_id',
       'drive_number', 'play_number', 'period', 'clock', 'offense_timeouts',
       'defense_timeouts', 'yard_line', 'yards_to_goal', 'down', 'distance',
       'scoring', 'yards_gained', 'play_type', 'play_text', 'ppa',
       'time_remaining', 'score', 'adj_score', 'target',
       'yards_from_own_goal_line', 'yards_to_go_for_first_down', 'defense_elo',
       'offense_elo'],
      dtype='object')

In [4]:
# Peek at the label column the classifier will predict (one boolean per row).
dataset["target"]

0        False
1        False
2         True
3        False
4         True
         ...  
99995    False
99996    False
99997    False
99998     True
99999     True
Name: target, Length: 100000, dtype: bool

In [5]:
# Columns fed to the classifier, in the order the model will see them.
feature_columns = [
    "down",
    "score",
    "time_remaining",
    "adj_score",
    "yards_from_own_goal_line",
    "yards_to_go_for_first_down",
    "defense_elo",
    "offense_elo",
    "offense_timeouts",
    "defense_timeouts",
    "offense_score",
    "defense_score",
]

# Feature matrix: select the model inputs, then replace missing values with 0.
model_inputs = dataset.loc[:, feature_columns]
X = model_inputs.fillna(value=0)

# Label vector: the boolean outcome to predict.
y = dataset.loc[:, "target"]

In [6]:
# One seed shared by the split and the forest so that "Restart & Run All"
# reproduces the same held-out rows, the same accuracy and the same feature
# importances. Without it every run reshuffles the data and regrows different
# trees, so the numbers shown below cannot be reproduced.
SEED = 42

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Fit a random forest with default hyper-parameters on the training rows.
clf = RandomForestClassifier(random_state=SEED)
clf.fit(X_train, y_train)

# Mean accuracy on the held-out rows (displayed as the cell output).
clf.score(X_test, y_test)

0.84415

In [7]:
# Importance score the fitted forest assigns to each input feature; one value
# per feature, in the same order as the columns of X.
clf.feature_importances_

array([0.0224986 , 0.18204318, 0.07907132, 0.23962639, 0.06829132,
       0.03786077, 0.10931615, 0.10927977, 0.01378841, 0.01395843,
       0.06185279, 0.06241287])

In [8]:
# Column positions ordered from least to most important (argsort is ascending).
np.argsort(clf.feature_importances_)

array([ 8,  9,  0,  5, 10, 11,  4,  2,  7,  6,  1,  3])

In [11]:
# Take the feature names from the frame the model was trained on instead of
# keeping a second hand-typed copy of the column list. The importances are
# positional, so a copy that drifts out of sync with X would silently attach
# the wrong name to each score.
features = np.array(X.columns, dtype=str)

# Feature names ordered from least to most important (argsort is ascending).
features[np.argsort(clf.feature_importances_)]

array(['offense_timeouts', 'defense_timeouts', 'down',
       'yards_to_go_for_first_down', 'offense_score', 'defense_score',
       'yards_from_own_goal_line', 'time_remaining', 'offense_elo',
       'defense_elo', 'score', 'adj_score'], dtype='<U26')