In [None]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

import sys
sys.path.insert(1, '../../')
from keys import aiven_pwd 

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.callbacks import ReduceLROnPlateau


from sqlalchemy import create_engine, text
sql_engine = create_engine(f"mysql+pymysql://avnadmin:{aiven_pwd}@mysql-nfl-mhoffmann-nfl.b.aivencloud.com:10448/nfl", pool_size=20, max_overflow=50)

2024-11-29 17:06:43.703675: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [20]:
years = "(2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009)"

query = f""" 
WITH play_stats AS (
    SELECT
        p.play_id,
        p.game_id,
        p.sequenceNumber,
        p.quarter,
        TIME_TO_SEC(p.clock) AS clock_seconds,
        p.offenseAtHome,
        p.down,
        p.distance,
        p.yardsToEndzone,
        p.playtype_id,
        g.season,
        g.game_type,
        g.week,
        CASE 
            WHEN p.offenseAtHome = TRUE THEN p.homeScore
            ELSE p.awayScore
        END AS offenseScore,
        CASE 
            WHEN p.offenseAtHome = FALSE THEN p.homeScore
            ELSE p.awayScore
        END AS defenseScore,
		CASE 
            WHEN p.offenseAtHome = TRUE THEN g.standing_home_overall_win
            ELSE g.standing_away_overall_win
        END AS standing_offense_overall_win,
        CASE 
            WHEN p.offenseAtHome = TRUE THEN g.standing_home_home_win
            ELSE g.standing_away_home_win
        END AS standing_offense_home_win,
        CASE 
            WHEN p.offenseAtHome = TRUE THEN g.standing_home_road_win
            ELSE g.standing_away_road_win
        END AS standing_offense_road_win,
        CASE 
            WHEN p.offenseAtHome = TRUE THEN g.standing_home_overall_loss
            ELSE g.standing_away_overall_loss
        END AS standing_offense_overall_loss,
        CASE 
            WHEN p.offenseAtHome = TRUE THEN g.standing_home_home_loss
            ELSE g.standing_away_home_loss
        END AS standing_offense_home_loss,
        CASE 
            WHEN p.offenseAtHome = TRUE THEN g.standing_home_road_loss
            ELSE g.standing_away_road_loss
        END AS standing_offense_road_loss,
        CASE 
            WHEN p.offenseAtHome = FALSE THEN g.standing_home_overall_win
            ELSE g.standing_away_overall_win
        END AS standing_defense_overall_win,
        CASE 
            WHEN p.offenseAtHome = FALSE THEN g.standing_home_home_win
            ELSE g.standing_away_home_win
        END AS standing_defense_home_win,
        CASE 
            WHEN p.offenseAtHome = FALSE THEN g.standing_home_road_win
            ELSE g.standing_away_road_win
        END AS standing_defense_road_win,
        CASE 
            WHEN p.offenseAtHome = FALSE THEN g.standing_home_overall_loss
            ELSE g.standing_away_overall_loss
        END AS standing_defense_overall_loss,
        CASE 
            WHEN p.offenseAtHome = FALSE THEN g.standing_home_home_loss
            ELSE g.standing_away_home_loss
        END AS standing_defense_home_loss,
        CASE 
            WHEN p.offenseAtHome = FALSE THEN g.standing_home_road_loss
            ELSE g.standing_away_road_loss
        END AS standing_defense_road_loss,
        t1.abbreviation AS offenseAbr,
        t2.abbreviation AS defenseAbr,
		CASE 
            WHEN p.offenseAtHome = TRUE THEN (p.homeScore - p.awayScore)
            ELSE (p.awayScore - p.homeScore)
        END AS scoreDiff,
        (TIME_TO_SEC(p.clock) + (4 - p.quarter) * 15 * 60) AS totalTimeLeft
    FROM
        nfl.plays p
    LEFT JOIN nfl.games g ON p.game_id = g.game_id
    LEFT JOIN nfl.teams t1 ON 
        (p.offenseAtHome = TRUE AND g.home_team_id = t1.team_id) OR
        (p.offenseAtHome = FALSE AND g.away_team_id = t1.team_id)
    LEFT JOIN nfl.teams t2 ON 
        (p.offenseAtHome = TRUE AND g.away_team_id = t2.team_id) OR
        (p.offenseAtHome = FALSE AND g.home_team_id = t2.team_id)
	WHERE
        g.season IN {years}
),
play_aggregates AS (
    SELECT
        p1.game_id,
        p1.play_id,
        p1.sequenceNumber,
        -- Completion Rate Calculation
        (
            SELECT 
                COUNT(*) * 1.0 / NULLIF(
                    (SELECT COUNT(*) 
                     FROM nfl.plays p2 
                     WHERE p2.game_id = p1.game_id 
                     AND p2.sequenceNumber < p1.sequenceNumber 
                     AND p2.playtype_id IN (67, 51, 24, 3, 6, 26, 36)), 0
                )
            FROM nfl.plays p2
            WHERE p2.game_id = p1.game_id 
              AND p2.sequenceNumber < p1.sequenceNumber 
              AND (p2.playtype_id IN (67, 24)
				OR (p2.playtype_id = 51 AND p2.description NOT LIKE '%incomplete%')
			  )
        ) AS completionRate,
        -- Pass to Rush Ratio Calculation
        (
            SELECT 
                COUNT(*) * 1.0 / NULLIF(
                    (SELECT COUNT(*) 
                     FROM nfl.plays p2 
                     WHERE p2.game_id = p1.game_id 
                     AND p2.sequenceNumber < p1.sequenceNumber 
                     AND p2.playtype_id IN (5, 68)), 0
                )
            FROM nfl.plays p2
            WHERE p2.game_id = p1.game_id 
              AND p2.sequenceNumber < p1.sequenceNumber 
              AND p2.playtype_id IN (67, 51, 24, 3, 6, 26, 36)
        ) AS passToRushRatio
    FROM nfl.plays p1
	LEFT JOIN nfl.games g ON p1.game_id = g.game_id
    WHERE g.season IN {years}
)
SELECT ps.*, pa.completionRate, pa.passToRushRatio
FROM play_stats ps
JOIN play_aggregates pa ON ps.play_id = pa.play_id;
"""

sql_data = pd.DataFrame(sql_engine.connect().execute(text(query)).fetchall())

In [21]:
data_df = sql_data.copy()
data_df.dropna(inplace=True)
play_ids = data_df.pop('play_id')
game_ids = data_df.pop('game_id')
data_df.drop(labels=['sequenceNumber'], axis=1, inplace=True)

data_df['offenseAtHome'] = data_df['offenseAtHome'].astype('boolean')
data_df['passToRushRatio'] = pd.to_numeric(data_df['passToRushRatio'])
data_df['completionRate'] = pd.to_numeric(data_df['completionRate'])

playtype_mapping = {
    2: None, 
    3: 'Pass', 
    5: 'Rush', 
    6: 'Pass', 
    7: None, 
    8: None, 
    9: None, 
    12: None, # 'Kickoff',
    15: None, # '2P-Pass',
    16:  None, # '2P-Rush',
    17: 'Punt',
    18: 'FG',
    20: None,
    21: None,
    24: 'Pass',
    26: 'Pass',
    29: None,
    30: 'Punt',
    32: None, # 'Kickoff',
    34: 'Punt',
    36: 'Pass',
    37: 'Punt',
    38: 'FG',
    39: None,
    40: 'FG',
    41: 'FG',
    43:  None, # 'PAT',
    51: 'Pass',
    52: 'Punt',
    53: None, # 'Kickoff',
    57: None,
    59: 'FG',
    60: 'FG',
    61:  None, # 'PAT',
    62:  None, # 'PAT',
    65: None,
    66: None,
    67: 'Pass',
    68: 'Rush',
    69: None,
    70: None,
    74: None,
    75: None,
    79: None
}

data_df['playtype'] =  data_df['playtype_id'].map(playtype_mapping)
data_df.drop(labels=['playtype_id'], axis=1, inplace=True)
data_df.dropna(subset=['playtype'], inplace=True)


In [73]:
X = data_df.copy()
y = X.pop('playtype')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=31416)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=123)


encoder = OneHotEncoder(sparse_output=False)
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1))
y_val_encoded = encoder.transform(y_val.values.reshape(-1, 1))

X_num_columns = list(X.select_dtypes(include="number").columns.to_list())
X_cat_columns = list(X.select_dtypes(exclude="number").columns.to_list())
num_indices = [X.columns.get_loc(col) for col in X_num_columns]
cat_indices = [X.columns.get_loc(col) for col in X_cat_columns]

In [23]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_indices),
        ('cat', OneHotEncoder(drop='first'), cat_indices)
    ])

def create_nn_model(input_dim, output_dim):
    model = Sequential([
        Input(shape=(input_dim,)),  # Specifies that input data has 10 features
        Dense(512, activation='relu'),
        Dropout(0.5),
        Dense(256, activation='relu'),
        Dense(128, activation='relu'),
        Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        Dense(32, activation='relu'),
        Dense(32, activation='relu'),
        Dense(output_dim, activation='softmax')  # Output layer
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-5)


keras_model = KerasClassifier(
    model=create_nn_model,
    input_dim=preprocessor.fit_transform(X_train).shape[1],  # Determined after preprocessing
    output_dim=len(y.unique()),  # Number of unique classes
    epochs=50,
    batch_size=10000,
    verbose=1
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Preprocessing for X
    ('model', keras_model)
])

In [24]:
pipeline.fit(X_train, y_train_encoded, model__validation_data=(preprocessor.fit_transform(X_val), y_val_encoded), model__callbacks=[reduce_lr])

Epoch 1/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 239ms/step - accuracy: 0.4546 - loss: 2.7218 - val_accuracy: 0.5696 - val_loss: 1.8015
Epoch 2/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 281ms/step - accuracy: 0.5507 - loss: 1.6566 - val_accuracy: 0.6339 - val_loss: 1.2237
Epoch 3/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 370ms/step - accuracy: 0.6245 - loss: 1.1554 - val_accuracy: 0.6576 - val_loss: 0.9144
Epoch 4/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 324ms/step - accuracy: 0.6465 - loss: 0.9007 - val_accuracy: 0.6745 - val_loss: 0.7730
Epoch 5/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 346ms/step - accuracy: 0.6678 - loss: 0.7686 - val_accuracy: 0.6850 - val_loss: 0.6854
Epoch 6/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 368ms/step - accuracy: 0.6713 - loss: 0.7015 - val_accuracy: 0.6833 - val_loss: 0.6548
Epoch 7/50
[1m37/37[

In [25]:
from sklearn.metrics import accuracy_score
y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test_encoded, y_pred)
print(f"Accuracy: {acc:.4f}")

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 81ms/step
Accuracy: 0.6771


In [29]:
print(X_num_columns)
print(X_cat_columns)

['quarter', 'clock_seconds', 'down', 'distance', 'yardsToEndzone', 'season', 'week', 'offenseScore', 'defenseScore', 'standing_offense_overall_win', 'standing_offense_home_win', 'standing_offense_road_win', 'standing_offense_overall_loss', 'standing_offense_home_loss', 'standing_offense_road_loss', 'standing_defense_overall_win', 'standing_defense_home_win', 'standing_defense_road_win', 'standing_defense_overall_loss', 'standing_defense_home_loss', 'standing_defense_road_loss', 'scoreDiff', 'totalTimeLeft', 'completionRate', 'passToRushRatio']
['offenseAtHome', 'game_type', 'offenseAbr', 'defenseAbr']


In [44]:
test_df = sql_data.copy()
test_df.dropna(inplace=True)
test_df.drop(labels=['sequenceNumber'], axis=1, inplace=True)

test_df['offenseAtHome'] = test_df['offenseAtHome'].astype('boolean')
test_df['passToRushRatio'] = pd.to_numeric(test_df['passToRushRatio'])
test_df['completionRate'] = pd.to_numeric(test_df['completionRate'])

test_df['playtype'] =  test_df['playtype_id'].map(playtype_mapping)
test_df.drop(labels=['playtype_id'], axis=1, inplace=True)
test_df.dropna(subset=['playtype'], inplace=True)


In [88]:
pd.set_option('display.max_columns', None)
play_id = 4016717752770
test_play = test_df.loc[(test_df['game_id']==401671775)&(test_df['play_id']== 4016717752840)]


In [89]:

for i,j in zip(list(pipeline.predict_proba(test_play)), list(encoder.categories_[0])):
    print(i,j)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
2.8679211e-05 FG
0.7029093 Pass
7.639112e-06 Punt
0.29705444 Rush


In [92]:
import dill
with open('nn_classifier.pkl', 'wb') as f:
    dill.dump(pipeline, f)
with open('nn_encoder.pkl', 'wb') as f:
    dill.dump(encoder, f)

In [93]:
encoder.categories_


array(['FG', 'Pass', 'Punt', 'Rush'], dtype=object)