In [1]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.ticker as ticker
import sys
from iminuit import Minuit
from matplotlib.colors import ListedColormap
from tqdm import tqdm
import os

In [2]:
# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.inspection import permutation_importance

# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

In [3]:
# TensorFlow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

2024-05-22 22:26:25.629493: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
sys.path.append('../External_Functions')
from ExternalFunctions import Chi2Regression, BinnedLH, UnbinnedLH, simpson38
from ExternalFunctions import nice_string_output, add_text_to_ax 

In [5]:
COLOUR = ['#1E90FF', # 0 # Dodgerblue
          '#FFBF00', # 1 # Amber
          '#FF6347', # 2 # Tomato
          '#00A86B', # 3 # Jade
          '#8A2BE2', # 4 # Blueviolet
          '#FF6FFF', # 5 # Ultra Pink
          '#00CCFF', # 6 # Vivid Sky Blue
          '#00ff40', # 7 # Erin
          '#FF004F', # 8 # Folly
          '#0063A6', # 9 # Lapis Lazuli
        ]
def setMplParam(classNum):
    # Define effective colors, line styles, and markers based on the class number
   
    LINE = ['-', '-.', '--', '-.', ':','--','-.','-', ':', '--']
    MARKER = ['.','*', '^', 's', '.', 'p', 'o', 's', '.', 'd']
    COLOUR_EFF = COLOUR[:classNum]
    LINE_EFF = LINE[:classNum]
    MARKER_EFF = MARKER[:classNum]

    # Set the color cycle for lines including color, line style, and marker
    plt.rcParams['axes.prop_cycle'] = (plt.cycler(color=COLOUR_EFF) +
                                       plt.cycler(linestyle=LINE_EFF)+
                                       plt.cycler(marker=MARKER_EFF))

    # Set default line and marker sizes
    plt.rcParams['lines.markersize'] = 3  # Example size
    plt.rcParams['lines.linewidth'] = 2   # Example width for lines

    # Set label and title sizes
    plt.rcParams['axes.labelsize'] = 20
    plt.rcParams['axes.titlesize'] = 20

    # Set tick properties
    plt.rcParams['xtick.direction'] = 'in'
    plt.rcParams['xtick.labelsize'] = 20
    plt.rcParams['ytick.direction'] = 'in'
    plt.rcParams['ytick.labelsize'] = 20

    # Set legend font size
    plt.rcParams['legend.fontsize'] = 12

    # Enable and configure grid
    plt.rcParams['axes.grid'] = True
    plt.rcParams['grid.alpha'] = 0.8
    plt.rcParams['grid.linestyle'] = '--'
    plt.rcParams['grid.linewidth'] = 1

    # Set axes line width
    plt.rcParams['axes.linewidth'] = 2

    # Set tick sizes and widths
    plt.rcParams['xtick.major.size'] = 7
    plt.rcParams['xtick.major.width'] = 3
    plt.rcParams['xtick.minor.size'] = 2
    plt.rcParams['xtick.minor.width'] = 2

    plt.rcParams['ytick.major.size'] = 7
    plt.rcParams['ytick.major.width'] = 3
    plt.rcParams['ytick.minor.size'] = 2
    plt.rcParams['ytick.minor.width'] = 2

setMplParam(10)


In [6]:
def readNFLPlay():
    subDirPath = '../NFLPlay/'
    plays = pd.read_csv(subDirPath+'plays.csv')
    return plays

In [7]:
# NFLplays = readNFLPlay()
NFLplays_raw = readNFLPlay()
# 26 sec

In [8]:
NFLplays_raw.head()

Unnamed: 0,playId,gameId,playSequence,quarter,possessionTeamId,nonpossessionTeamId,playType,playType2,playTypeDetailed,playNumberByTeam,...,scorePossession,scoreNonpossession,homeScorePre,visitingScorePre,homeScorePost,visitingScorePost,distanceToGoalPost,fieldGoalProbability,huddle,formation
0,30298,26909,1,1,2200,3200,kickoff,"kickoff, returned","kickoff, returned",1,...,0,0,0,0,0,0,,,,
1,30299,26909,2,1,3200,2200,pass,"pass, complete","pass, complete",1,...,0,0,0,0,0,0,44.0,0.26,huddle,
2,30300,26909,3,1,3200,2200,pass,"pass, complete","pass, complete",2,...,0,0,0,0,0,0,30.0,0.74,no huddle,shotgun
3,30301,26909,4,1,3200,2200,pass,"pass, incomplete","pass, incomplete",3,...,0,0,0,0,0,0,30.0,0.91,no huddle,shotgun
4,30302,26909,5,1,3200,2200,pass,"pass, complete","pass, complete",4,...,0,0,0,0,0,0,28.0,0.91,no huddle,


In [9]:
NFLplays_raw[NFLplays_raw['gameId']==26909][:5]

Unnamed: 0,playId,gameId,playSequence,quarter,possessionTeamId,nonpossessionTeamId,playType,playType2,playTypeDetailed,playNumberByTeam,...,scorePossession,scoreNonpossession,homeScorePre,visitingScorePre,homeScorePost,visitingScorePost,distanceToGoalPost,fieldGoalProbability,huddle,formation
0,30298,26909,1,1,2200,3200,kickoff,"kickoff, returned","kickoff, returned",1,...,0,0,0,0,0,0,,,,
1,30299,26909,2,1,3200,2200,pass,"pass, complete","pass, complete",1,...,0,0,0,0,0,0,44.0,0.26,huddle,
2,30300,26909,3,1,3200,2200,pass,"pass, complete","pass, complete",2,...,0,0,0,0,0,0,30.0,0.74,no huddle,shotgun
3,30301,26909,4,1,3200,2200,pass,"pass, incomplete","pass, incomplete",3,...,0,0,0,0,0,0,30.0,0.91,no huddle,shotgun
4,30302,26909,5,1,3200,2200,pass,"pass, complete","pass, complete",4,...,0,0,0,0,0,0,28.0,0.91,no huddle,


In [10]:
NFLplays_raw[NFLplays_raw['gameId']==26910][:10]

Unnamed: 0,playId,gameId,playSequence,quarter,possessionTeamId,nonpossessionTeamId,playType,playType2,playTypeDetailed,playNumberByTeam,...,scorePossession,scoreNonpossession,homeScorePre,visitingScorePre,homeScorePost,visitingScorePost,distanceToGoalPost,fieldGoalProbability,huddle,formation
164,5489,26910,1,1,610,2250,kickoff,"kickoff, returned","kickoff, returned",1,...,0,0,0,0,0,0,,,,
165,5490,26910,2,1,2250,610,rush,"rush, outside","rush, right tackle",1,...,0,0,0,0,0,0,76.0,0.0,huddle,
166,5491,26910,3,1,2250,610,pass,"pass, complete","pass, complete",2,...,0,0,0,0,0,0,74.0,0.0,huddle,
167,5492,26910,4,1,2250,610,pass,"pass, incomplete","pass, incomplete",3,...,0,0,0,0,0,0,74.0,0.0,huddle,shotgun
168,5493,26910,5,1,2250,610,punt,"punt, fair catch","punt, fair catch",4,...,0,0,0,0,0,0,,0.0,,
169,5494,26910,6,1,610,2250,pass,"pass, complete","pass, complete",2,...,0,0,0,0,0,0,60.0,0.03,huddle,
170,5495,26910,7,1,610,2250,rush,"rush, outside","rush, left tackle",3,...,0,0,0,0,0,0,55.0,0.42,huddle,
171,5496,26910,8,1,610,2250,rush,rush,rush,4,...,0,0,0,0,0,0,62.0,0.54,huddle,
172,5497,26910,9,1,610,2250,pass,"pass, incomplete","pass, incomplete",5,...,0,0,0,0,0,0,,0.34,huddle,
173,5498,26910,10,1,610,2250,rush,"rush, outside","rush, left tackle",6,...,0,0,0,0,0,0,57.0,0.49,huddle,


## the columns

In [11]:
# NFLplays_raw.columns

In [12]:
NFLplays_raw.shape

(870384, 44)

In [13]:
# for i in NFLplays_raw['formation'].unique():
#     n = NFLplays_raw[NFLplays_raw['formation']== i ].shape[0]
#     print(f'{i:10}----------{n:6d}')

In [14]:
indices = ['playId', 'gameId']

playCircumstance = ['playSequence', 
                'quarter', 
                'possessionTeamId',
                'nonpossessionTeamId', 
                'playNumberByTeam',
                'gameClock', 
                'down', 
                'distance',
                'distanceToGoalPre',
                'netYards',
                'scorePossession',
                'scoreNonpossession',
                'fieldGoalProbability',]

# classification
playType = ['playType'
            'huddle',
            'formation']

playResult = ['playType2', # only second item
                'gameClockSecondsExpired',
              'gameClockStoppedAfterPlay', 
               'noPlay', # is the play a penalty
               'offensiveYards']

playSubsequence = ['isClockRunning', 
                        'changePossession', 
                        'turnover',
                        'safety',
                        'firstDown',]

idk = [ 'typeOfPlay',
        'fourthDownConversion',
        'thirdDownConversion',
        'homeScorePre', 
        'visitingScorePre',
        'homeScorePost',
        'visitingScorePost',
        'distanceToGoalPost']

# the original dataset has 3 columns of their own prediction of the play we may be able to use them as a reference
reference = ['evPre',
             'evPost', 
             'evPlay',]

exclude = [ 'playTypeDetailed', # redundant to playType2
            'fieldPosition', 
            'playDescription',
            'playStats',
            'playDescriptionFull', 
            'efficientPlay']

# Data Analysis Algorithm
1. playCircumstance -> XGB -> playType
2. playCircumstance & playType -> Regression (NN?) -> playResult
3. playResult -> **manual function** -> playCircumstance
<!-- * updateCircumstance -->

# Data Preprocessing
1. `impute` imputes NaN
2. `numericalize` converts strings to numerics

In [15]:
from Preprocess import printColumnsHasNan, printNonNumericColumns, runPreprocess

In [16]:
NFLplays_raw['playType'].unique()

array(['kickoff', 'pass', 'penalty', 'field goal', 'rush', 'punt', 'xp',
       'spike', 'kneel', 'aborted', 'two-point'], dtype=object)

In [17]:
printColumnsHasNan(NFLplays_raw)

Columns with NaN values:
playType2
playTypeDetailed
fieldPosition
distanceToGoalPre
playStats
typeOfPlay
distanceToGoalPost
fieldGoalProbability
huddle
formation


In [18]:
printNonNumericColumns(NFLplays_raw)

Non-numeric columns:
playType
playType2
playTypeDetailed
gameClock
fieldPosition
playDescription
playStats
playDescriptionFull
typeOfPlay
huddle
formation


### `runPreprocess` : this is where the collective preprocessing algorithms come into play!

In [19]:
NFLplays = runPreprocess(NFLplays_raw, exclude, idk)

* test what values each column contains

In [20]:
NFLplays_raw['playType'].unique()

array(['kickoff', 'pass', 'penalty', 'field goal', 'rush', 'punt', 'xp',
       'spike', 'kneel', 'aborted', 'two-point'], dtype=object)

In [21]:
NFLplays['playType'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

* `fieldGoalProbability` has nan for these `playType` values
  * 'kickoff'
  * 'xp'
  * 'two-point'
  * 'aborted'

In [22]:
NFLplays[NFLplays['fieldGoalProbability'].isnull() & 
                       (NFLplays['playType'] != 'kickoff') & 
                       (NFLplays['playType'] != 'xp') &
                       (NFLplays['playType'] != 'two-point')&
                       (NFLplays['playType'] != 'aborted')]

Unnamed: 0,playId,gameId,playSequence,quarter,possessionTeamId,nonpossessionTeamId,playType,playNumberByTeam,gameClock,gameClockSecondsExpired,...,firstDown,evPre,evPost,evPlay,scorePossession,scoreNonpossession,fieldGoalProbability,huddle,formation,playResult


* `huddle` has nan for these `playType` values
  * 'kickoff'
  * 'field goal'
  * 'punt'
  * 'xp'
  * 'two-point'
  * 'penalty'
  * 'aborted'

In [23]:
NFLplays[NFLplays['huddle'].isnull() & 
                       (NFLplays['playType'] != 'kickoff') & 
                       (NFLplays['playType'] != 'field goal') &
                        (NFLplays['playType'] != 'punt') &
                       (NFLplays['playType'] != 'xp')&
                          (NFLplays['playType'] != 'two-point')&
                          (NFLplays['playType'] != 'penalty')&
                       (NFLplays['playType'] != 'aborted')]

Unnamed: 0,playId,gameId,playSequence,quarter,possessionTeamId,nonpossessionTeamId,playType,playNumberByTeam,gameClock,gameClockSecondsExpired,...,firstDown,evPre,evPost,evPlay,scorePossession,scoreNonpossession,fieldGoalProbability,huddle,formation,playResult


In [24]:
NFLplays['playType']

0         0
1         1
2         1
3         1
4         1
         ..
870379    1
870380    8
870381    8
870382    8
870383    1
Name: playType, Length: 870384, dtype: int64

In [25]:
NFLplays['gameClock']

0         900
1         891
2         845
3         807
4         803
         ... 
870379     68
870380     57
870381     53
870382     51
870383      5
Name: gameClock, Length: 870384, dtype: int64