In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sys
from tqdm import tqdm
import os

In [3]:
# Sklearn
# from sklearn.model_selection import train_test_split, KFold, cross_val_score
# from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay, classification_report

In [4]:
sys.path.append('../External_Functions')
from ExternalFunctions import Chi2Regression, BinnedLH, UnbinnedLH, simpson38
from ExternalFunctions import nice_string_output, add_text_to_ax 

In [5]:
from MplSetup import getColour, setMplParam
# from AccessibilityUtils import onCompletion

In [6]:
setMplParam(10)

In [7]:
def readNFLPlay(subDirPath='../NFLPlay/'):
    """Load the raw play-by-play table from ``plays.csv``.

    Parameters
    ----------
    subDirPath : str, optional
        Directory that contains ``plays.csv``. Defaults to ``'../NFLPlay/'``,
        the location this notebook reads from, so existing zero-argument
        calls keep working.

    Returns
    -------
    pandas.DataFrame
        The contents of ``plays.csv`` as parsed by ``pd.read_csv``.
    """
    # os.path.join accepts the directory with or without a trailing separator,
    # unlike plain string concatenation.
    return pd.read_csv(os.path.join(subDirPath, 'plays.csv'))

In [8]:
# Read the raw plays table once; every later cell starts from NFLplays_raw.
NFLplays_raw = readNFLPlay()
# 50 sec on Cyan's pc

In [9]:
# NFLplays_raw.head()

In [10]:
# NFLplays_raw[NFLplays_raw['gameId']==26909][:5]

In [11]:
# NFLplays_raw[NFLplays_raw['gameId']==26910][:10]

## the columns

In [12]:
# NFLplays_raw.columns

In [13]:
NFLplays_raw.shape

(870384, 44)

In [14]:
# for i in NFLplays_raw['formation'].unique():
#     n = NFLplays_raw[NFLplays_raw['formation']== i ].shape[0]
#     print(f'{i:10}----------{n:6d}')

In [15]:
# Column names grouped under `indices`.
indices = [
    'playId',
    'gameId',
]

# Column names grouped under `playCircumstance`; per the algorithm outline in
# this notebook, this group is the input used to predict the play type.
playCircumstance = [
    'playSequence',
    'quarter',
    'possessionTeamId',
    'nonpossessionTeamId',
    'playNumberByTeam',
    'gameClock',
    'down',
    'distance',
    'distanceToGoalPre',
    'netYards',
    'scorePossession',
    'scoreNonpossession',
    'fieldGoalProbability',
]

# classification
# Each name needs its own trailing comma: adjacent string literals without a
# comma are concatenated by Python ('playType' 'huddle' -> 'playTypehuddle'),
# which would silently yield a two-element list with a non-existent column.
playType = [
    'playType',
    'huddle',
    'formation',
]

# Column names grouped under `playResult`.
playResult = [
    'playType2',  # only the second item
    'gameClockSecondsExpired',
    'gameClockStoppedAfterPlay',
    'noPlay',  # whether the play is a penalty
    'offensiveYards',
]

# Column names grouped under `playSubsequence`.
playSubsequence = [
    'isClockRunning',
    'changePossession',
    'turnover',
    'safety',
    'firstDown',
]

# Column names grouped under `idk`; this list is passed to runPreprocess
# together with `exclude`.
idk = [
    'typeOfPlay',
    'fourthDownConversion',
    'thirdDownConversion',
    'homeScorePre',
    'visitingScorePre',
    'homeScorePost',
    'visitingScorePost',
    'distanceToGoalPost',
]

# The original dataset ships three columns holding its own prediction for the
# play; they may be usable as a reference.
reference = [
    'evPre',
    'evPost',
    'evPlay',
]

# Column names grouped under `exclude`; passed to runPreprocess.
exclude = [
    'playTypeDetailed',  # redundant with playType2
    'fieldPosition',
    'playDescription',
    'playStats',
    'playDescriptionFull',
    'efficientPlay',
]

# Data Analysis Algorithm
1. playCircumstance -> XGB -> playType
2. playCircumstance & playType -> Regression (NN?XGB?) -> playResult
3. playResult -> **manual function** -> playCircumstance
<!-- * updateCircumstance -->

# Data Preprocessing

In [16]:
from Preprocess import printColumnsHasNan, printNonNumericColumns, runPreprocess, getStringValue, getCircumstance, getPlayType, getPlayResult

* see the set of `'playType'` values

In [17]:
# NFLplays_raw['playType'].unique()

* see the columns that have NaN

In [18]:
# printColumnsHasNan(NFLplays_raw)

* see the columns that have non-numerical values

In [19]:
# printNonNumericColumns(NFLplays_raw)

### <span style="color:red">runPreprocess</span> 
this is where the collective preprocessing algorithms come into play!

In [20]:
# Build the working frame from the raw plays. The `exclude` and `idk` column
# lists defined earlier are handed to runPreprocess.
# NOTE(review): presumably those columns are dropped — confirm in Preprocess.runPreprocess.
NFLplays = runPreprocess(NFLplays_raw, exclude, idk)
# 8 sec on Cyan's laptop

In [21]:
NFLplays.shape[0]

870384

* see the set of `'playType'` values

In [20]:
# NFLplays['playType'].unique()

* `fieldGoalProbability` has nan for these `playType` values
  * 'kickoff'
  * 'xp'
  * 'two-point'
  * 'aborted'

In [21]:
# NFLplays[NFLplays['fieldGoalProbability'].isnull() & 
#                        (NFLplays['playType'] != 'kickoff') & 
#                        (NFLplays['playType'] != 'xp') &
#                        (NFLplays['playType'] != 'two-point')&
#                        (NFLplays['playType'] != 'aborted')]

* `huddle` has nan for these `playType` values
  * 'kickoff'
  * 'field goal'
  * 'punt'
  * 'xp'
  * 'two-point'
  * 'penalty'
  * 'aborted'

In [22]:
# NFLplays[NFLplays['huddle'].isnull() & 
#                        (NFLplays['playType'] != 'kickoff') & 
#                        (NFLplays['playType'] != 'field goal') &
#                         (NFLplays['playType'] != 'punt') &
#                        (NFLplays['playType'] != 'xp')&
#                           (NFLplays['playType'] != 'two-point')&
#                           (NFLplays['playType'] != 'penalty')&
#                        (NFLplays['playType'] != 'aborted')]

## Classification using XGB
* due to the heavy computational cost, use `fraction` to run on a sample of the whole data set
* if too small a fraction of the data is used, the run fails with an error

|fraction| time taken|
|--------|-----------|
|1.0     |485 min    |
|0.33    |152 min    |
|0.1     |42 min     |

In [22]:
from Classification import runPlayTypeClassification, convertFractionIntoString, loadResult, loadClassification

In [23]:
# Share of the preprocessed plays to analyse (1.0 means every row).
FRACTION = 1.0
# Number of rows that share corresponds to, truncated to an integer.
analysisSampleSize = int(len(NFLplays) * FRACTION)
print(f'Analysis Sample Size: {analysisSampleSize}')

Analysis Sample Size: 870384


In [24]:
print(f'Fraction string tag: {convertFractionIntoString(FRACTION)}')

Fraction string tag: 100


### <span style="color:red">runPlayTypeClassification</span> 
this is where the collective classification runs  
arguments
* the data frame
* the fraction(a numerical value between 0 and 1)
* the k-fold split number(no default value)


In [51]:
# runPlayTypeClassification(NFLplays, FRACTION, 5)
# 1.0 : 485 min
# 0.33: 152 min
# 0.1 : 42 min

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [25]:
# Load the stored results for the 'playType' target; the call prints the
# cross-validation scores, accuracies and best parameters shown below.
# NOTE(review): the meaning of the trailing `1` is not visible in this notebook —
# presumably the fraction tag; confirm against Classification.loadResult.
loaded_results = loadResult('../PlayTypeClassification/Classification', 'playType', 1)

Results for playType:
  Cross-validation scores: [0.80457499, 0.80728069, 0.80660283, 0.80676367, 0.80533215]
  Average accuracy: 0.8061
  Best parameters:
  Accuracies: [0.8046381773582955, 0.8075794045163922, 0.8070394135928354, 0.8077000407865484, 0.8056653415749443]
  Best accuracy: 0.8077


In [27]:
# Load the stored per-row classification table for the 'playType' target
# (columns X_test, y_test, y_pred, y_pred_proba in the displayed output).
# NOTE(review): the meaning of the trailing `1` is not visible in this notebook —
# presumably the fraction tag; confirm against Classification.loadClassification.
loaded_classification = loadClassification('../PlayTypeClassification/Classification', 'playType', 1)
# loaded_classification[:20]
# NOTE(review): the slice [-10:-1] stops before the final row, so 9 rows are
# displayed; use [-10:] if the last ten rows are intended.
loaded_classification[-10:-1]

Unnamed: 0,X_test,y_test,y_pred,y_pred_proba
174067,870303,3,3,0.000179
174068,870309,4,4,0.357101
174069,870315,1,1,0.930258
174070,870321,4,1,0.492764
174071,870322,4,1,0.742609
174072,870323,1,1,0.82643
174073,870326,1,1,0.826433
174074,870362,1,4,0.068211
174075,870363,1,1,0.726353


1. data preprocessing
2. data division: playCircumstance, playType, playResult
3. XGB classification using play circumstance to predict play type.
4. combination of play circumstance and the predicted play type
5. regression using the combined data to get the play result
6. define a function that takes the play result as one of its arguments and returns updated play circumstance
7. repeat (3)-(6)

so the first classification has finished
## Questions?
1. **What will we get from doing the regression?**  
 * By doing the regression, we can quantify how different circumstances and play types contribute to the result, which can be valuable for strategic decisions
2. **Use of the predicted playType for the regression**  
* We introduce a layer of uncertainty   
* It simulates real-world scenarios, in which the actual playType is not always available  
* We can understand how errors in classification propagate through to our final result predictions  
3. **Function to Update Circumstances**  
* Scenario Analysis  
* Strategy Optimization  
* Long-Term Predictions(?)  

In [None]:
def