In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sys
from tqdm import tqdm
import os

In [25]:
sys.path.append('../External_Functions')
from ExternalFunctions import Chi2Regression, BinnedLH, UnbinnedLH, simpson38
from ExternalFunctions import nice_string_output, add_text_to_ax 

In [26]:
from MplSetup import getColour, setMplParam

In [27]:
# Apply the shared matplotlib settings from the local MplSetup module.
# NOTE(review): the meaning of the argument 10 is defined in MplSetup (presumably a size setting) — confirm.
setMplParam(10)

In [28]:
def readNFLPlay(subDirPath='../NFLPlay/'):
    """Load the NFL play-by-play table from ``plays.csv``.

    Parameters
    ----------
    subDirPath : str or os.PathLike, optional
        Directory that contains ``plays.csv``. Defaults to ``'../NFLPlay/'``,
        the location that was previously hard-coded, so existing
        zero-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The contents of ``plays.csv`` as parsed by ``pandas.read_csv``.

    Raises
    ------
    FileNotFoundError
        If ``plays.csv`` does not exist inside ``subDirPath``.
    """
    # os.path.join copes with a directory given with or without a trailing separator
    playsPath = os.path.join(subDirPath, 'plays.csv')
    return pd.read_csv(playsPath)

In [29]:
# Load the raw plays table once; the preprocessing cell further down takes this frame as its input.
NFLplays_raw = readNFLPlay()
# 50 sec on Cyan's pc

## the columns

In [30]:
# (number of rows, number of columns) of the raw table
NFLplays_raw.shape

(870384, 44)

In [31]:
# Column groups of the plays table, organised by the role each column plays in
# the analysis (circumstance -> play type -> play result -> next circumstance).

# Identifier columns.
indices = ['playId', 'gameId']

# State of the game before the play; the inputs of the play-type classification.
playCircumstance = [
    'playSequence',
    'quarter',
    'possessionTeamId',
    'nonpossessionTeamId',
    'playNumberByTeam',
    'gameClock',
    'down',
    'distance',
    'distanceToGoalPre',
    'netYards',
    'scorePossession',
    'scoreNonpossession',
    'fieldGoalProbability',
]

# classification
# NOTE: the comma after 'playType' was missing, so Python's implicit string
# concatenation merged the first two entries into 'playTypehuddle'.
playType = [
    'playType',
    'huddle',
    'formation',
]

# Outcome of the play; the targets of the regression step.
playResult = [
    'playType2',  # only second item
    'gameClockSecondsExpired',
    'gameClockStoppedAfterPlay',
    'noPlay',  # is the play a penalty
    'offensiveYards',
]

# What follows from the play.
playSubsequence = [
    'isClockRunning',
    'changePossession',
    'turnover',
    'safety',
    'firstDown',
]

# Columns whose use is still undecided; passed to pp.runPreprocess together with `exclude`.
idk = [
    'typeOfPlay',
    'fourthDownConversion',
    'thirdDownConversion',
    'homeScorePre',
    'visitingScorePre',
    'homeScorePost',
    'visitingScorePost',
    'distanceToGoalPost',
]

# the original dataset has 3 columns of their own prediction of the play we may be able to use them as a reference
reference = [
    'evPre',
    'evPost',
    'evPlay',
]

# Columns handed to pp.runPreprocess as the `exclude` argument.
exclude = [
    'playTypeDetailed',  # redundant to playType2
    'fieldPosition',
    'playDescription',
    'playStats',
    'playDescriptionFull',
    'efficientPlay',
]

# Data Analysis Algorithm
1. playCircumstance -> XGB -> playType
2. playCircumstance & playType -> Regression (NN?XGB?) -> playResult
3. playResult -> **manual function** -> playCircumstance
<!-- * updateCircumstance -->

# Data Preprocessing

In [32]:
import Preprocess as pp

### <span style="color:red">runPreprocess</span> 
This is where all of the preprocessing steps are run together, in a single call.

In [33]:
# Run the project's preprocessing on the raw plays, handing it the `exclude` and `idk` column lists.
# NOTE(review): presumably both lists name columns to drop — confirm in Preprocess.runPreprocess.
NFLplays = pp.runPreprocess(NFLplays_raw, exclude, idk)
# 8 sec on Cyan's laptop

In [38]:
# Split the preprocessed plays into a collection of pieces.
# NOTE(review): presumably one piece per game — confirm in Preprocess.getSplittedList.
splitted = pp.getSplittedList(NFLplays)
# 18.8sec

In [40]:
# Number of pieces the split produced
print(len(splitted))

5308


In [36]:
# Peek at the first 20 entries of the first piece.
# NOTE(review): the saved output of this cell is `KeyError: 0`, and its execution count (36)
# is lower than that of the cell that builds `splitted` (38), so it was last run out of order.
# Re-run from a fresh kernel; if the error persists, `splitted` cannot be indexed with 0.
splitted[0][:20]

KeyError: 0

In [None]:
# Last 20 entries of the first piece. The earlier slice `[-20:-1]` stopped one
# short: it silently dropped the final entry and showed only 19.
splitted[0][-20:]

In [None]:
# Columns that remain after preprocessing
NFLplays.columns

Index(['playId', 'gameId', 'playSequence', 'quarter', 'possessionTeamId',
       'nonpossessionTeamId', 'playType', 'playNumberByTeam', 'gameClock',
       'gameClockSecondsExpired', 'gameClockStoppedAfterPlay', 'down',
       'distance', 'distanceToGoalPre', 'noPlay', 'changePossession',
       'turnover', 'safety', 'offensiveYards', 'netYards', 'firstDown',
       'evPre', 'evPost', 'evPlay', 'scorePossession', 'scoreNonpossession',
       'fieldGoalProbability', 'huddle', 'formation', 'playResult'],
      dtype='object')

In [None]:
# Number of rows after preprocessing
NFLplays.shape[0]

870384

* see the set of `'playType'` values

In [None]:
# NFLplays['playType'].unique()

* `fieldGoalProbability` is NaN for the following `playType` values:
  * 'kickoff'
  * 'xp'
  * 'two-point'
  * 'aborted'

In [None]:
# NFLplays[NFLplays['fieldGoalProbability'].isnull() & 
#                        (NFLplays['playType'] != 'kickoff') & 
#                        (NFLplays['playType'] != 'xp') &
#                        (NFLplays['playType'] != 'two-point')&
#                        (NFLplays['playType'] != 'aborted')]

* `huddle` is NaN for the following `playType` values:
  * 'kickoff'
  * 'field goal'
  * 'punt'
  * 'xp'
  * 'two-point'
  * 'penalty'
  * 'aborted'

In [None]:
# NFLplays[NFLplays['huddle'].isnull() & 
#                        (NFLplays['playType'] != 'kickoff') & 
#                        (NFLplays['playType'] != 'field goal') &
#                         (NFLplays['playType'] != 'punt') &
#                        (NFLplays['playType'] != 'xp')&
#                           (NFLplays['playType'] != 'two-point')&
#                           (NFLplays['playType'] != 'penalty')&
#                        (NFLplays['playType'] != 'aborted')]

## Classification using XGB
* Because the classification is computationally expensive, use `fraction` to run it on a sample of the data instead of the whole data set
* If the fraction is too small, the classification fails with an error

|fraction| time taken| hh : mm |
|--------|-----------|---------|
|1.0     |485 min    | 08 : 05 |
|0.33    |152 min    | 02 : 32 |
|0.1     |42 min     | 00 : 42 |

In [None]:
import Classification as clf

In [None]:
# Share of the preprocessed plays to use in the classification (1.0 = all of them)
FRACTION = 1.0
# Row count that this share corresponds to, truncated to a whole number
analysisSampleSize = int(len(NFLplays.index) * FRACTION)
print(f'Analysis Sample Size: {analysisSampleSize}')

Analysis Sample Size: 870384


In [None]:
# Show the string tag that clf.convertFractionIntoString derives from FRACTION.
# NOTE(review): presumably this tag labels the saved result files — confirm in Classification.
print(f'Fraction string tag: {clf.convertFractionIntoString(FRACTION)}')

Fraction string tag: 100


### <span style="color:red">runPlayTypeClassification</span> 
This is where the whole classification pipeline is run.  
Arguments:
* the data frame
* the fraction of the data to use (a number between 0 and 1)
* the number of k-fold splits (no default value)


In [None]:
# runPlayTypeClassification(NFLplays, FRACTION, 5)
# 1.0 : 485 min : 8 hr 5 min
# 0.33: 152 min : 2 hr 32 min
# 0.1 : 42 min : 0.7 hr

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [None]:
# Load the stored classification results for the 'playType' target from the results directory.
# NOTE(review): the third argument (1) is presumably the data fraction the run used — confirm in Classification.loadResult.
loaded_results = clf.loadResult('../PlayTypeClassification/Classification', 'playType', 1)

Results for playType:
  Cross-validation scores: [0.80457499, 0.80728069, 0.80660283, 0.80676367, 0.80533215]
  Average accuracy: 0.8061
  Best parameters:
  Accuracies: [0.8046381773582955, 0.8075794045163922, 0.8070394135928354, 0.8077000407865484, 0.8056653415749443]
  Best accuracy: 0.8077


In [None]:
# Load the stored classification table for the 'playType' target from the results directory.
# NOTE(review): the third argument (1) is presumably the data fraction the run used — confirm in Classification.loadClassification.
loaded_classification = clf.loadClassification('../PlayTypeClassification/Classification', 'playType', 1)

Unnamed: 0,X_test,y_test,y_pred,y_pred_proba
174067,870303,3,3,0.000179
174068,870309,4,4,0.357101
174069,870315,1,1,0.930258
174070,870321,4,1,0.492764
174071,870322,4,1,0.742609
174072,870323,1,1,0.82643
174073,870326,1,1,0.826433
174074,870362,1,4,0.068211
174075,870363,1,1,0.726353


In [None]:
# First 20 rows of the loaded classification table
loaded_classification[:20]

Unnamed: 0,X_test,y_test,y_pred,y_pred_proba
0,0,0,0,2.1e-05
1,1,1,1,0.76401
2,3,1,1,0.779311
3,5,1,1,0.925746
4,6,1,4,0.181157
5,8,1,1,0.92127
6,14,4,4,0.211993
7,16,4,4,0.398322
8,31,1,1,0.829907
9,42,5,5,0.000409


In [None]:
# Last 10 rows of the loaded classification table. The earlier slice `[-10:-1]`
# stopped one short: it silently dropped the final row and showed only 9.
loaded_classification[-10:]

1. data preprocessing
2. data division: playCircumstance, playType, playResult
3. XGB classification using play circumstance to predict play type.
4. combination of play circumstance and the predicted play type
5. regression on the combined data to predict the play result
6. define a function that takes the play result as one of its arguments and returns updated play circumstance
7. repeat (3)-(6)

So far, the first classification (step 3) has finished.
## Questions?
1. **What will we get from doing the regression?**  
 * By doing the regression, we can quantify how different circumstances and play types contribute to the result, which can be valuable for strategic decisions
2. **Use of the predicted playType for the regression**  
* We introduce a layer of uncertainty   
* It simulates real-world scenarios, in which the actual playType is not always available  
* We can understand how errors in classification propagate through to our final result predictions  
3. **Function to Update Circumstances**  
* Scenario Analysis  
* Strategy Optimization  
* Long-Term Predictions(?)  