In [1]:
%matplotlib inline
%load_ext tensorboard

import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

2024-04-29 16:36:41.185377: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Data Cleaning

In [3]:
# Get data
# Data is from 2012-2020
data = pd.read_csv('data_file.csv') # pandas data frame


## Preprocessing

# Filter to only Patriots
data = data.loc[(data['home_team'] == 'NE') | (data['away_team'] == 'NE')]

# Remove data that does not correlate 
data = data.loc[(data['desc'].str.startswith('('))]
data = data[data['play_type'] != 'qb_kneel']
data = data[data['play_type'] != 'no_play']
data = data[data['posteam'] == 'NE']
data = data.dropna(subset=['down'])
data = data.dropna(subset=['play_type'])
data = data[(data['play_type'] == 'pass') | (data['play_type'] == 'run')]

# Extract year and month using string slicing
data['year'] = data['game_id'].str[:4].astype(int)
data['month'] = data['game_id'].str[5:7].astype(int)
 
# Fixes playtype to be strictly pass or run 
# Lables scrambles as passes
data.loc[data['pass']==1,'play_type'] = 'pass'
data.loc[data.rush==1,'play_type'] = 'run'

# Columns that are useful for us
columnsKeep = ['game_id', 'year','month','season_type','week','posteam','posteam_type','defteam','side_of_field','yardline_100','half_seconds_remaining','home_team','away_team',
               'game_seconds_remaining','game_half','drive','down','time','yrdln','ydstogo','desc','play_type','yards_gained','shotgun','no_huddle','qb_dropback','qb_kneel',
               'qb_spike','qb_scramble','pass_length','run_location','run_gap','home_timeouts_remaining','away_timeouts_remaining','total_home_score','total_away_score',
               'passer_player_name','receiver_player_name','rusher_player_name','play_type_nfl','roof','surface','temp','wind','home_coach','away_coach','game_stadium',
               'success', 'wp']

data = data[columnsKeep]

# Save
data.to_csv('finalized_data.csv', index=False)

  data = pd.read_csv('data_file.csv') # pandas data frame


In [4]:
# Count occurrences of each play type
runCount = (data['play_type'] == 'run').sum()
passCount = (data['play_type'] == 'pass').sum()

print("Number of 'run' plays:", runCount)
print("Number of 'pass' plays:", passCount)

10896
Number of 'run' plays: 4402
Number of 'pass' plays: 6494


In [5]:
# Check for nan values in data
for index, row in data.iterrows():
    if pd.isnull(row['play_type']):  # Check if the value in the 'down' column is NaN
        print(row['desc'])  # Print the value in the 'desc' column if 'down' is NaN

# Wind and stadium had a lot of null 
# Check for missing values in data
selected_features = ['down', 'drive', 'ydstogo', 'yardline_100', 'half_seconds_remaining', 'game_seconds_remaining','wp']
missing_values_features = data[selected_features].isnull().sum()
print("Missing values in selected features:")
print(missing_values_features)

# Check for missing values in the target
missing_values_target = data['play_type'].isnull().sum()
print("\nMissing values in target (play_type):", missing_values_target)


Missing values in selected features:
down                      0
drive                     0
ydstogo                   0
yardline_100              0
half_seconds_remaining    0
game_seconds_remaining    0
wp                        0
dtype: int64

Missing values in target (play_type): 0


In [27]:
features = data[['down','drive', 'ydstogo','yardline_100', 'half_seconds_remaining', 'game_seconds_remaining', 'wp']]
target = data['play_type'].map({'pass': 0, 'run': 1})

# Initialize the scaler 
scaler = StandardScaler() 

# Fit the scaler on the features and transform them 
features_scaled = scaler.fit_transform(features)

# Reshaping input data for LSTM [samples, time steps, features]
# Here, we assume each sample is a single time step
features_scaled = features_scaled.reshape((features_scaled.shape[0], 1, features_scaled.shape[1]))

X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, shuffle = False)

In [28]:
# Initalize model
model = Sequential()

# input_shape corresponds to (time steps, features)
model.add(LSTM(units=50, return_sequences=True, input_shape=(1, features_scaled.shape[2])))
model.add(Dropout(0.2))  # Adding dropout to avoid overfitting

# Adding a second LSTM layer
model.add(LSTM(units=50, return_sequences=False))
model.add(Dropout(0.2))

# Adding the output layer
# Since binary classification (pass/run), we use 1 unit with a sigmoid activation function
model.add(Dense(units=1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=30, batch_size=64, validation_split=0.1)

test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_acc:.3f}, Test Loss: {test_loss:.3f}')

Epoch 1/30


2024-04-12 17:27:13.216087: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-04-12 17:27:13.216755: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-04-12 17:27:13.217098: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/30
  1/123 [..............................] - ETA: 0s - loss: 0.5347 - accuracy: 0.7812

2024-04-12 17:27:14.870478: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-04-12 17:27:14.871168: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-04-12 17:27:14.871677: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Accuracy: 0.661, Test Loss: 0.584


In [26]:
predictions = model.predict(X_test)

predictions = (predictions > 0.5).astype(int).reshape(-1)
for i in range(1000):
    actual = 'pass' if y_test.iloc[i] == 1 else 'run'  # Inverted label mapping
    predicted = 'pass' if predictions[i] == 1 else 'run'  # Inverted label mapping
    print(f'Actual: {actual}, Predicted: {predicted}')

Actual: pass, Predicted: run
Actual: run, Predicted: run
Actual: run, Predicted: run
Actual: run, Predicted: run
Actual: run, Predicted: run
Actual: run, Predicted: run
Actual: run, Predicted: pass
Actual: run, Predicted: run
Actual: run, Predicted: run
Actual: pass, Predicted: pass
Actual: pass, Predicted: pass
Actual: pass, Predicted: pass
Actual: pass, Predicted: run
Actual: pass, Predicted: run
Actual: pass, Predicted: pass
Actual: pass, Predicted: run
Actual: run, Predicted: run
Actual: pass, Predicted: run
Actual: run, Predicted: run
Actual: run, Predicted: run
Actual: pass, Predicted: run
Actual: pass, Predicted: run
Actual: run, Predicted: run
Actual: run, Predicted: run
Actual: pass, Predicted: run
Actual: run, Predicted: run
Actual: pass, Predicted: run
Actual: pass, Predicted: run
Actual: run, Predicted: run
Actual: run, Predicted: run
Actual: run, Predicted: run
Actual: run, Predicted: run
Actual: run, Predicted: run
Actual: run, Predicted: run
Actual: run, Predicted: run
A

In [58]:
# Length of X train, and nulls in the x,train, x_test, y_train, y_test 
print(len(X_train))
nan_indices_X = np.isnan(X_train)
nan_count_X = np.sum(nan_indices_X)
print(nan_count_X)

nan_indices_X_test = np.isnan(X_test)
nan_count_X_test = np.sum(nan_indices_X_test)
print(nan_count_X_test)

nan_indices_y = np.isnan(y_train)
nan_count_y = np.sum(nan_indices_y)
print(nan_count_y)

nan_indices_Y_test = np.isnan(y_test)
nan_count_Y_test = np.sum(nan_indices_Y_test)
print(nan_count_Y_test)

print("Shape of X_train:", X_train.shape)

7652
0
0
0
0
Shape of X_train: (7652, 1, 7)
