<a href="https://colab.research.google.com/github/Keoni808/NFL_Data_Cleaning/blob/main/NFL_Plays_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mounting and Imports

In [1]:
# Mount your Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Used to access personal google cloud services
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

Authenticated


In [3]:
# imports

# Data manipulation
import pandas as pd

# Regular expressions
import re

# Grab data from database
from google.cloud import bigquery

# Loading Data


In [4]:
# Client connect to bigquery project
client = bigquery.Client('nfl-data-430702')

## All Plays 2023

In [5]:
# nfl_2023_plays_query = """
#                        SELECT *
#                        FROM `nfl-data-430702.NFL_Scores.NFL-Plays-2023`
#                        """

# # Run the query, and return a pandas DataFrame
# dry_run_config = bigquery.QueryJobConfig(dry_run=True)
# dry_run_query = client.query(nfl_2023_plays_query, job_config=dry_run_config)
# print("This query will process {} bytes.".format(dry_run_query.total_bytes_processed))

# safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**9)
# safe_config_query = client.query(nfl_2023_plays_query, job_config=safe_config)

# # df_nfl_scores_data = safe_config_query.to_dataframe()

In [6]:
# df_2023_plays = safe_config_query.to_dataframe()

In [7]:
# df_2023_plays.head()

##Super Bowl Plays 2023

In [8]:
nfl_2023_sb_plays_query = """
                          SELECT *
                          FROM `nfl-data-430702.NFL_Scores.NFL-Plays-SuperBowl-2023`
                          """

# Run the query, and return a pandas DataFrame
dry_run_config = bigquery.QueryJobConfig(dry_run=True)
dry_run_query = client.query(nfl_2023_sb_plays_query, job_config=dry_run_config)
print("This query will process {} bytes.".format(dry_run_query.total_bytes_processed))

safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**9)
safe_config_query = client.query(nfl_2023_sb_plays_query, job_config=safe_config)

# df_nfl_scores_data = safe_config_query.to_dataframe()

This query will process 41291 bytes.


In [9]:
df_2023_plays_sb = safe_config_query.to_dataframe()

In [10]:
df_2023_plays_sb.head()

Unnamed: 0,Season,Week,Day,Date,AwayTeam,HomeTeam,Quarter,DriveNumber,TeamWithPossession,IsScoringDrive,PlayNumberInDrive,IsScoringPlay,PlayOutcome,PlayDescription,PlayStart
0,2023,Super Bowl,SUN,02/11,49ers,Chiefs,1ST QUARTER,2,KC,0,1,0,-3 Yard Run,(12:15) (Shotgun) I.Pacheco left guard to KC 2...,1st & 10 at KC 27
1,2023,Super Bowl,SUN,02/11,49ers,Chiefs,1ST QUARTER,2,KC,0,2,0,1 Yard Pass,(11:39) (Shotgun) P.Mahomes pass short left to...,2nd & 13 at KC 24
2,2023,Super Bowl,SUN,02/11,49ers,Chiefs,1ST QUARTER,2,KC,0,3,0,8 Yard Pass,(11:04) (Shotgun) P.Mahomes pass short right t...,3rd & 12 at KC 25
3,2023,Super Bowl,SUN,02/11,49ers,Chiefs,1ST QUARTER,2,KC,0,4,0,Punt,"(10:24) T.Townsend punts 43 yards to SF 24, Ce...",4th & 4 at KC 33
4,2023,Super Bowl,SUN,02/11,49ers,Chiefs,1ST QUARTER,4,KC,0,1,0,10 Yard Run,(6:28) (Shotgun) I.Pacheco right guard to KC 2...,1st & 10 at KC 11


In [11]:
df_2023_plays_sb.shape

(191, 15)

# Categorize Plays
- The goal here is to parse out the different values for 'PlayOutcome'
  - separate pass / run / kickoff / etc.

## Parsing Plays


In [12]:
# To see all possible play outcomes of the game.
# - From here we can categorize and clean plays accordingly
df_2023_plays_sb['PlayOutcome'].unique()

array(['-3 Yard Run', '1 Yard Pass', '8 Yard Pass', 'Punt', '10 Yard Run',
       '-4 Yard Sack', 'Pass for No Gain', '4 Yard Run', 'Kickoff',
       '7 Yard Pass', '2 Yard Run', '5 Yard Run', '52 Yard Pass',
       'Fumble', 'Pass Incomplete', '3 Yard Run', 'Run for No Gain',
       '10 Yard Pass', 'Interception', '9 Yard Pass', '5 Yard Pass',
       '6 Yard Run', '18 Yard Pass', '11 Yard Pass', '11 Yard Run',
       '-2 Yard Run', '-5 Yard Penalty', '-10 Yard Penalty',
       '12 Yard Pass', '24 Yard Run', '1 Yard Run', 'Sack',
       '-8 Yard Pass', '-1 Yard Run', '8 Yard Run', '6 Yard Pass',
       '21 Yard Pass', '3 Yard Pass', '-1 Yard Sack', 'Field Goal',
       '22 Yard Run', '2 Yard Pass', 'Touchdown Chiefs',
       'Extra Point Good', '16 Yard Pass', '13 Yard Pass', '25 Yard Pass',
       '9 Yard Run', '-3 Yard Sack', '22 Yard Pass', '-3 Yard Pass',
       '4 Yard Pass', '19 Yard Run', '5 Yard Penalty', '19 Yard Pass',
       '-4 Yard Run', '7 Yard Run', '16 Yard Run', 'Touch

In [13]:
# Looking at all unique play outcomes and categorizing them.
# - This type of approach does not feel very flexable because a play outcome can
#   arise that has not been seen yet.
df_2023_pass_sb = df_2023_plays_sb[df_2023_plays_sb['PlayOutcome'].str.contains('Pass')]
df_2023_run_sb = df_2023_plays_sb[df_2023_plays_sb['PlayOutcome'].str.contains('Run')]
df_2023_punt_sb = df_2023_plays_sb[df_2023_plays_sb['PlayOutcome'].str.contains('Punt')]
df_2023_sack_sb = df_2023_plays_sb[df_2023_plays_sb['PlayOutcome'].str.contains('Sack')]
df_2023_kickoff_sb = df_2023_plays_sb[df_2023_plays_sb['PlayOutcome'].str.contains('Kickoff')]
df_2023_fumble_sb = df_2023_plays_sb[df_2023_plays_sb['PlayOutcome'].str.contains('Fumble')]
df_2023_interception_sb = df_2023_plays_sb[df_2023_plays_sb['PlayOutcome'].str.contains('Interception')]
df_2023_penalty_sb = df_2023_plays_sb[df_2023_plays_sb['PlayOutcome'].str.contains('Penalty')]
df_2023_fieldgoal_sb = df_2023_plays_sb[df_2023_plays_sb['PlayOutcome'].str.contains('Field Goal')]
df_2023_touchdown_sb = df_2023_plays_sb[df_2023_plays_sb['PlayOutcome'].str.contains('Touchdown')]
df_2023_extrapoint_sb = df_2023_plays_sb[df_2023_plays_sb['PlayOutcome'].str.contains('Extra Point')]

plays_list = [df_2023_pass_sb,
              df_2023_run_sb,
              df_2023_punt_sb,
              df_2023_sack_sb,
              df_2023_kickoff_sb,
              df_2023_fumble_sb,
              df_2023_interception_sb,
              df_2023_penalty_sb,
              df_2023_fieldgoal_sb,
              df_2023_touchdown_sb,
              df_2023_extrapoint_sb]

## Sanity Check (All Plays Accounted for)

In [14]:
# A check to make sure that all plays have been categorized.
# - The check puts all categorized plays into a single dataframe
#   and will compare with the original dataframe to make sure
#   that they are the same.
df_check = pd.DataFrame()
for i in plays_list:
  df_check = pd.concat([df_check, i])

In [15]:
df_check = df_check.sort_index()

In [16]:
df_2023_plays_sb.equals(df_check)

True

# 'PlayDescription' Feature Breakdown

ISSUES:
- laterals?
- penalties
  - Declined or accepted?
  - What kind of penalty was it?
  - Multiple penalties in a row

- I need a check to make sure that all plays have been broken down
  - Possibly add a check for each type of play that happened?

- Touchdowns
  - A passing touchdown is not included within the passing category, it is in its own.

- Fumbles
  - How do I break this down?
  - What happens if theres a fumble after a fumble and it keeps going?

- Error correction catching system.
  - I need to raise errors when something does not break down correctly.



In [29]:
####################################################
# REGULAR EXPRESSIONS USED TO LOCATE SPECIFIC DATA #
####################################################

################
# PLAY DETAILS #
################

time_on_clock_pattern = r'\(\d*:\d+\)'
formation = r'\([A-Za-z]+ ?[A-Za-z]*,? ?[A-Za-z]*\)'
manual_yardage = r'\d+ yards?' # Used when 'PlayOutcome' does not have yardage gained

#################
# NAMES OFFENSE #
#################

name_pattern = r'\b[A-Za-z]+\.[A-Za-z]+-?[A-Za-z]*\b' # Grabs all names but will only be used for Quarterback
receiver_name_pattern = r'\b [A-Za-z]+\.[A-Za-z]+-?[A-Za-z]*\b' # Receivers have a space before their name
rusher_pattern = r'\b[A-Za-z]+\.[A-Za-z]+-?[A-Za-z]* \b' # Runningbacks, like quarterbacks, are the first names in play descriptions

#################
# NAMES DEFENSE #
#################

defense_tackler_1_name_pattern = r'\([A-Za-z]+\.[A-Za-z]+-?[A-Za-z]*' # Will have a "(" in front of the name
defense_tackler_2_name_pattern = r' [A-Za-z]+\.[A-Za-z]+-?[A-Za-z]*\)' # Will have a ")" at the end of the name
defense_pressure_name_pattern = r'\[[A-Za-z]+\.[A-Za-z]+-?[A-Za-z]*\]' # Surrounded by "[]" brackets

########################
# TEAM IDENTIFIED NAME #
########################

team_identified_name = r'-[A-Za-z]+\.[A-Za-z]+-?[A-Za-z]*' # team initials comes before their name (e.g. KC-B.Bob).
                                                           # - This occurs when there is an injury, penalty, fumble recovery.

In [18]:
####################################
# NEW COLUMNS FOR BROKEN DOWN DATA #
####################################

new_columns = ["PlayType", "TimeOnTheClock", "Formation", "Passer", "Rusher", "Receiver", "PassType", "PassDirection", "PassYardage",
                       "Tackler1", "Tackler2", "PressureBy",
                       "InjuredPlayers", "PenaltyPlayers"]

string_columns = ["PlayType", "TimeOnTheClock", "Formation", "Passer", "Rusher", "Receiver", "PassType", "PassDirection",
                  "Tackler1", "Tackler2", "PressureBy", "InjuredPlayers", "PenaltyPlayers"]

int_columns = ["PassYardage"]

df_2023_pass_sb_detailed = df_2023_pass_sb.copy()
df_2023_pass_sb_detailed = df_2023_pass_sb_detailed.reindex(columns=df_2023_pass_sb_detailed.columns.tolist() + new_columns)
df_2023_pass_sb_detailed[string_columns] = df_2023_pass_sb_detailed[string_columns].astype(str)
df_2023_pass_sb_detailed[int_columns] = df_2023_pass_sb_detailed[int_columns].astype(float)

## Pass

### Identify Different Pass Plays
- This section is used to categorize different pass plays to see if they have to be handled differently.
  - Eventually, each category of pass play will break down into the same set of features. The question here is how does each category of pass play break down to fall into these common features?

GOAL: To create a single method that will handle each type of pass play and break them down to a common set of features.

In [19]:
df_2023_pass_sb['PlayOutcome'].unique()

array(['1 Yard Pass', '8 Yard Pass', 'Pass for No Gain', '7 Yard Pass',
       '52 Yard Pass', 'Pass Incomplete', '10 Yard Pass', '9 Yard Pass',
       '5 Yard Pass', '18 Yard Pass', '11 Yard Pass', '12 Yard Pass',
       '-8 Yard Pass', '6 Yard Pass', '21 Yard Pass', '3 Yard Pass',
       '2 Yard Pass', '16 Yard Pass', '13 Yard Pass', '25 Yard Pass',
       '22 Yard Pass', '-3 Yard Pass', '4 Yard Pass', '19 Yard Pass',
       '17 Yard Pass', '20 Yard Pass', '23 Yard Pass', '24 Yard Pass'],
      dtype=object)

In [20]:
# 3 different formats as far as I can see.
# 1. '# Yard Pass'
# 2. 'Pass Incomplete'
# 3. 'Pass for No Gain'

df_successful_passes = df_2023_pass_sb[df_2023_pass_sb['PlayOutcome'].str.contains('Yard Pass')]
df_incomplete_passes = df_2023_pass_sb[df_2023_pass_sb['PlayOutcome'].str.contains('Pass Incomplete')]
df_pass_for_no_gain = df_2023_pass_sb[df_2023_pass_sb['PlayOutcome'].str.contains('Pass for No Gain')]

### New Features & DataFrame

In [33]:
for idx, value in df_2023_pass_sb['PlayOutcome'].items():
  play = df_2023_plays_sb['PlayDescription'].iloc[idx]
  print(play)
  print()

(11:39) (Shotgun) P.Mahomes pass short left to T.Kelce to KC 25 for 1 yard (C.Young; D.Greenlaw).

(11:04) (Shotgun) P.Mahomes pass short right to J.McKinnon to KC 33 for 8 yards (F.Warner, D.Greenlaw).

(5:15) (Shotgun) P.Mahomes pass short left to R.Rice to KC 17 for no gain (F.Warner).

(14:48) P.Mahomes pass short left to I.Pacheco pushed ob at KC 32 for 7 yards (T.Gipson).

(13:01) (Shotgun) P.Mahomes pass deep right to M.Hardman to SF 9 for 52 yards (J.Brown).

(9:16) (Shotgun) P.Mahomes pass incomplete short left [C.Young]. PENALTY on KC-P.Mahomes, Intentional Grounding, 10 yards, enforced at KC 20.

(14:15) (Shotgun) P.Mahomes pass short middle to N.Gray to KC 23 for 10 yards (L.Ryan).

(12:31) (Shotgun) P.Mahomes pass incomplete deep left to M.Valdes-Scantling.

(12:26) (Shotgun) P.Mahomes pass short right to T.Kelce to KC 11 for 9 yards (O.Burks; C.Ward).

(3:26) (Shotgun) P.Mahomes pass short middle to T.Kelce to KC 35 for 5 yards (O.Burks).

(2:49) (Shotgun) P.Mahomes pass 

In [21]:
for idx, value in df_2023_pass_sb['PlayOutcome'].items():
  play = df_2023_plays_sb['PlayDescription'].iloc[idx]

  ################
  # Play details #
  ################

  # Play Type
  df_2023_pass_sb_detailed.loc[idx, 'PlayType'] = 'Pass'

  # TimeOnTheClock
  TimeOnTheClock = re.findall(time_on_clock_pattern, play)
  df_2023_pass_sb_detailed.loc[idx, 'TimeOnTheClock'] = TimeOnTheClock[0][1:-1]

  #############
  #  OFFENSE  #
  #############

  # Formation
  Formation = re.findall(formation, play)
  if len(Formation) > 0:
    df_2023_pass_sb_detailed.loc[idx, 'Formation'] = Formation[0][1:-1]
  # Quarterback & Receiver
  Quarterback = re.findall(name_pattern, play)
  df_2023_pass_sb_detailed.loc[idx, 'Passer'] = Quarterback[0] # Quarterback
  Receiver = re.findall(receiver_name_pattern, play)
  if len(Receiver) > 0:
    df_2023_pass_sb_detailed.loc[idx, 'Receiver'] = Receiver[0][1:] # Receiver

  # Deep or short pass
  if value.find('Incomplete') != -1:
    df_2023_pass_sb_detailed.loc[idx, 'PassType'] = 'Incomplete'
    df_2023_pass_sb_detailed.loc[idx, 'PassYardage'] = 0
  elif value.find('No Gain') != -1:
    if play.find('short') != -1:
      df_2023_pass_sb_detailed.loc[idx, 'PassType'] = 'Short'
    elif play.find('deep') != -1:
      df_2023_pass_sb_detailed.loc[idx, 'PassType'] = 'Deep'
    df_2023_pass_sb_detailed.loc[idx, 'PassYardage'] = 0
  else:
    if int(value.split()[0]) >= 20:
      df_2023_pass_sb_detailed.loc[idx, 'PassType'] = 'Deep'
    else:
      df_2023_pass_sb_detailed.loc[idx, 'PassType'] = 'Short'
    # Yardage gained on play
    df_2023_pass_sb_detailed.loc[idx, 'PassYardage'] = int(value.split()[0])

  # Pass Direction
  if play.find('left') != -1:
    df_2023_pass_sb_detailed.loc[idx, 'PassDirection'] = 'Left'
  elif play.find('right') != -1:
    df_2023_pass_sb_detailed.loc[idx, 'PassDirection'] = 'Right'
  elif play.find('middle') != -1:
    df_2023_pass_sb_detailed.loc[idx, 'PassDirection'] = 'Middle'
  # # Yardage gained on play
  # df_successful_passes_detailed.loc[idx, 'PassYardage'] = int(value.split()[0])

  #############
  #  DEFENSE  #
  #############

  tackler_1 = re.findall(defense_tackler_1_name_pattern, play) # tackler #1 (Could be solo or the one who initiated the hit)
  if len(tackler_1) > 0:
    df_2023_pass_sb_detailed.loc[idx, 'Tackler1'] = tackler_1[0][1:]
  tackler_2 = re.findall(defense_tackler_2_name_pattern, play) # tackler #2 (equally contributed or assisted with tackle)
  if len(tackler_2) > 0:
    df_2023_pass_sb_detailed.loc[idx, 'Tackler2'] = tackler_2[0][1:-1]
  pressure = re.findall(defense_pressure_name_pattern, play)   # Player who applied pressure to passer
  if len(pressure) > 0:
    df_2023_pass_sb_detailed.loc[idx, 'PressureBy'] = pressure[0][1:-1]

  #############
  #  PENALTY  #
  #############

  if play.find('Penalty') != -1:
    penalty_name = re.findall(team_identified_name, play)
    # df_2023_pass_sb_detailed.at[idx, 'PenaltyPlayers'] = penalty_name
    df_2023_pass_sb_detailed.at[idx, 'PenaltyPlayers'] = [x[1:] for x in penalty_name]

  if play.find('injured') != -1:
    injured_name = re.findall(team_identified_name, play)
    # df_2023_pass_sb_detailed.at[idx, 'InjuredPlayers'] = injured_name
    df_2023_pass_sb_detailed.at[idx, 'InjuredPlayers'] = [x[1:] for x in injured_name]

In [22]:
df_2023_pass_sb_detailed

Unnamed: 0,Season,Week,Day,Date,AwayTeam,HomeTeam,Quarter,DriveNumber,TeamWithPossession,IsScoringDrive,...,Rusher,Receiver,PassType,PassDirection,PassYardage,Tackler1,Tackler2,PressureBy,InjuredPlayers,PenaltyPlayers
1,2023,Super Bowl,SUN,02/11,49ers,Chiefs,1ST QUARTER,2,KC,0,...,,T.Kelce,Short,Left,1.0,C.Young,D.Greenlaw,,,
2,2023,Super Bowl,SUN,02/11,49ers,Chiefs,1ST QUARTER,2,KC,0,...,,J.McKinnon,Short,Right,8.0,F.Warner,D.Greenlaw,,,
6,2023,Super Bowl,SUN,02/11,49ers,Chiefs,1ST QUARTER,4,KC,0,...,,R.Rice,Short,Left,0.0,F.Warner,,,,
10,2023,Super Bowl,SUN,02/11,49ers,Chiefs,2ND QUARTER,2,KC,0,...,,I.Pacheco,Short,Left,7.0,T.Gipson,,,,
13,2023,Super Bowl,SUN,02/11,49ers,Chiefs,2ND QUARTER,2,KC,0,...,,M.Hardman,Deep,Right,52.0,J.Brown,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,2023,Super Bowl,SUN,02/11,49ers,Chiefs,OVERTIME,1,SF,1,...,,C.McCaffrey,Short,Middle,2.0,C.Jones,,,,
181,2023,Super Bowl,SUN,02/11,49ers,Chiefs,OVERTIME,1,SF,1,...,,B.Aiyuk,Short,Left,11.0,M.Edwards,,,,
183,2023,Super Bowl,SUN,02/11,49ers,Chiefs,OVERTIME,1,SF,1,...,,C.McCaffrey,Deep,Left,24.0,L.Sneed,,G.Karlaftis,,
186,2023,Super Bowl,SUN,02/11,49ers,Chiefs,OVERTIME,1,SF,1,...,,K.Juszczyk,Short,Right,13.0,,,,,


In [23]:
df_2023_pass_sb_detailed[["PlayDescription", "PlayType", "TimeOnTheClock", "Formation", "Passer", "Receiver", "PassType", "PassDirection", "PassYardage",
                          "Tackler1", "Tackler2", "PressureBy",
                          "InjuredPlayers", "PenaltyPlayers"]]

Unnamed: 0,PlayDescription,PlayType,TimeOnTheClock,Formation,Passer,Receiver,PassType,PassDirection,PassYardage,Tackler1,Tackler2,PressureBy,InjuredPlayers,PenaltyPlayers
1,(11:39) (Shotgun) P.Mahomes pass short left to...,Pass,11:39,Shotgun,P.Mahomes,T.Kelce,Short,Left,1.0,C.Young,D.Greenlaw,,,
2,(11:04) (Shotgun) P.Mahomes pass short right t...,Pass,11:04,Shotgun,P.Mahomes,J.McKinnon,Short,Right,8.0,F.Warner,D.Greenlaw,,,
6,(5:15) (Shotgun) P.Mahomes pass short left to ...,Pass,5:15,Shotgun,P.Mahomes,R.Rice,Short,Left,0.0,F.Warner,,,,
10,(14:48) P.Mahomes pass short left to I.Pacheco...,Pass,14:48,,P.Mahomes,I.Pacheco,Short,Left,7.0,T.Gipson,,,,
13,(13:01) (Shotgun) P.Mahomes pass deep right to...,Pass,13:01,Shotgun,P.Mahomes,M.Hardman,Deep,Right,52.0,J.Brown,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,(14:55) (Shotgun) B.Purdy pass short middle to...,Pass,14:55,Shotgun,B.Purdy,C.McCaffrey,Short,Middle,2.0,C.Jones,,,,
181,(12:38) (Shotgun) B.Purdy pass short left to B...,Pass,12:38,Shotgun,B.Purdy,B.Aiyuk,Short,Left,11.0,M.Edwards,,,,
183,(11:12) (Shotgun) B.Purdy pass short left to C...,Pass,11:12,Shotgun,B.Purdy,C.McCaffrey,Deep,Left,24.0,L.Sneed,,G.Karlaftis,,
186,(9:25) (Shotgun) B.Purdy pass short right to K...,Pass,9:25,Shotgun,B.Purdy,K.Juszczyk,Short,Right,13.0,,,,,


## Run

### Identify Different Run Plays
- This section is used to categorize different run plays to see if they have to be handled differently.
  - Eventually, each category of pass play will break down into the same set of features. The question here is how does each category of pass play break down to fall into these common features?

GOAL: To create a single method that will handle each type of pass play and break them down to a common set of features.

In [24]:
df_2023_run_sb['PlayOutcome'].unique()

array(['-3 Yard Run', '10 Yard Run', '4 Yard Run', '2 Yard Run',
       '5 Yard Run', '3 Yard Run', 'Run for No Gain', '6 Yard Run',
       '11 Yard Run', '-2 Yard Run', '24 Yard Run', '1 Yard Run',
       '-1 Yard Run', '8 Yard Run', '22 Yard Run', '9 Yard Run',
       '19 Yard Run', '-4 Yard Run', '7 Yard Run', '16 Yard Run'],
      dtype=object)

In [25]:
# 2 different formats ?

df_yard_runs = df_2023_run_sb[df_2023_run_sb['PlayOutcome'].str.contains('Yard Run')]
df_run_for_no_gain = df_2023_run_sb[df_2023_run_sb['PlayOutcome'].str.contains('Run for No Gain')]

In [48]:
###########################
# NEW COLUMN DESCRIPTIONS #
###########################

# PlayType           - The type of play (e.g. pass/run)
# TimeOnTheClock     - The time that was on the clock when the play started
# Formation          - Play formation
# Passer             - Player that threw the ball (mostly the quarterback)
# Rusher             - Player that ran the ball (mostly the runningback)
# Receiver           - Player on the same team as the passer that caught the ball
# PassType           - Whether the pass was a deep or short pass?
# Direction          - Where the ball is going during the play
# Yardage            - Yards gained during the play
# TackleBy1          - Main tackler on the play (could be solo or could be with someone else)
# TackleBy2          - Assisted tackler1
# PressureBy         - Defender that applied pressure to the passer
# ForcedFumbleBy     - Defender that forced a fumble
# AfterFumble        - A list that has what happened after the fumble
#                      - [recovered by, yards gained, tackled by]
# InjuredPlayers     - Players that were injured during the play
# PenaltyDescription - If there is a penalty, gives a description of it
#                      - [who caused the penalty, what was the penalty, yards lost if penalty accepted]

new_columns = ["PlayType", "TimeOnTheClock", "Formation", "Passer", "Rusher", "Receiver", "PassType", "Direction", "Yardage",
               "TackleBy1", "TackleBy2", "PressureBy", "ForcedFumbleBy",
               "AfterFumble",
               "InjuredPlayers", "PenaltyDescription"]

string_columns = ["PlayType", "TimeOnTheClock", "Formation", "Passer", "Rusher", "Receiver", "PassType", "Direction",
                  "TackleBy1", "TackleBy2", "PressureBy", "ForcedFumbleBy",
                  "AfterFumble",
                  "InjuredPlayers", "PenaltyDescription"]

int_columns = ["Yardage"]

df_yard_runs_detailed = df_2023_run_sb.copy()
df_yard_runs_detailed = df_yard_runs_detailed.reindex(columns=df_yard_runs_detailed.columns.tolist() + new_columns)
df_yard_runs_detailed[string_columns] = df_yard_runs_detailed[string_columns].astype(str)
df_yard_runs_detailed[int_columns] = df_yard_runs_detailed[int_columns].astype(float)

In [None]:
for idx, value in df_yard_runs_detailed['PlayOutcome'].items():
  play = df_2023_plays_sb['PlayDescription'].iloc[idx]

  # Yardage from play. (grabbed from play outcome)
  # - Will be rewritten if there was a fumble or penalty
  df_yard_runs_detailed.loc[idx, 'Yardage'] = int(value.split()[0])

  ##########################
  # MISTAKES AND TURNOVERS #
  ##########################

  # Fumble
  if play.find('FUMBLES') != -1:
    fumble_play_elements = play.split(". ")
    play = fumble_play_elements[0] # Updated play description
    # Yardage from actual carry
    yardage = re.findall(manual_yardage, play)
    df_yard_runs_detailed.loc[idx, 'Yardage'] = int(yardage[0].split()[0])

    for i in fumble_play_elements[1::]:
      # fumble & recovery
      # Reason it is in a for loop is just in case there were more fumbles following.
      if i.find('FUMBLES') != -1:
        # Player who forced fumble
        player_forced_fumble = re.findall(defense_tackler_1_name_pattern, i)
        df_yard_runs_detailed.loc[idx, 'ForcedFumbleBy'] = player_forced_fumble[0][1:]

      else:
        # Who recovered?
        action_after_fumble = []
        player_running_after_fumble = re.findall(rusher_pattern, i)
        action_after_fumble.append(player_running_after_fumble[0])
        yardage_gained_after_fumble = re.findall(manual_yardage, i)
        action_after_fumble.append(yardage_gained_after_fumble[0])
        tackler = re.findall(defense_tackler_1_name_pattern, i)
        action_after_fumble.append(tackler[0][1:])
        df_yard_runs_detailed.loc[idx, 'AfterFumble'] = action_after_fumble
        # print(action_after_fumble)

  # Penalty
  if play.find('PENALTY') != -1:
    penalty_play_elements = play.split(". ")
    play = penalty_play_elements[0]
    # Yardage from actual carry
    yardage = re.findall(manual_yardage, play)
    df_yard_runs_detailed.loc[idx, 'Yardage'] = int(yardage[0])

    for i in penalty_play_elements[1::]:
      penalty_breakdown = []
      penalty = i.split(", ")
      # Player
      penalty_called_on = re.findall(team_identified_name, i)
      penalty_breakdown.append(penalty_called_on[0][1:])
      # Penalty
      penalty_breakdown.append(penalty[1])
      # Yardage from penalty
      penalty_breakdown.append(penalty[2])
      df_yard_runs_detailed.loc[idx, 'PenaltyDescription'] = penalty_breakdown

  ################
  # Play details #
  ################

  # Play Type
  df_yard_runs_detailed.loc[idx, 'PlayType'] = 'Run'

  #############
  #  OFFENSE  #
  #############

  # Runningback
  rusher_names = re.findall(rusher_pattern, play)
  rusher_name = rusher_names[0][:-1]
  print(rusher_name)

  # Direction
  rushing_directions = ['guard', 'middle', 'tackle', 'end', 'kneels']
  for i in rushing_directions:
    if play.find(i) != -1:
      start = play.find(rusher_name) + len(rusher_name) + 1
      end = play.find(i) + len(i)
      print(play[start:end])

  print()

(12:15) (Shotgun) I.Pacheco left guard to KC 24 for -3 yards (N.Bosa, J.Hargrave).
-3 Yard Run


NameError: name 'df_2023_run_sb_detailed' is not defined

In [43]:
# I do not think the play outcomes have the right values for true yardage gained.

for idx, value in df_yard_runs['PlayOutcome'].items():
  play = df_2023_plays_sb['PlayDescription'].iloc[idx]

  # Insert yardage here from playoutcome.
  # - Will be rewritten if it goes into fumbles or penalties.

  if play.find('FUMBLES') != -1:
    fumble_play_elements = play.split(". ")
    play = fumble_play_elements[0]
    # Yardage from actual carry
    yardage = re.findall(manual_yardage, play)
    print(yardage[0].split()[0])

    for i in fumble_play_elements[1::]:
      # fumble & recovery
      print(i)
      if i.find('FUMBLES') != -1:
        # Who forced fumble?
        player_forced_fumble = re.findall(defense_tackler_1_name_pattern, i)
        print(player_forced_fumble[0][1:])

      else:
        # Who recovered?
        action_after_fumble = []
        player_running_after_fumble = re.findall(rusher_pattern, i)
        action_after_fumble.append(player_running_after_fumble[0])
        yardage_gained_after_fumble = re.findall(manual_yardage, i)
        action_after_fumble.append(yardage_gained_after_fumble[0])
        tackler = re.findall(defense_tackler_1_name_pattern, i)
        action_after_fumble.append(tackler[0][1:])
        print(action_after_fumble)

  if play.find('PENALTY') != -1:
    penalty_play_elements = play.split(". ")
    play = penalty_play_elements[0]
    # Yardage from actual carry
    yardage = re.findall(manual_yardage, play)
    print(yardage[0])

    for i in penalty_play_elements[1::]:
      penalty_breakdown = []
      penalty = i.split(", ")
      # Player
      penalty_called_on = re.findall(team_identified_name, i)
      penalty_breakdown.append(penalty_called_on[0][1:])
      # Penalty
      penalty_breakdown.append(penalty[1])
      # Yardage from penalty
      penalty_breakdown.append(penalty[2])
      print(penalty_breakdown)

  print(play)
  print(value)

  ################
  # Play details #
  ################

  # Play Type
  # df_2023_run_sb_detailed.loc[idx, 'PlayType'] = 'Run'

  #############
  #  OFFENSE  #
  #############

  # Runningback
  rusher_names = re.findall(rusher_pattern, play)
  rusher_name = rusher_names[0][:-1]
  print(rusher_name)

  # Direction
  rushing_directions = ['guard', 'middle', 'tackle', 'end', 'kneels']
  for i in rushing_directions:
    if play.find(i) != -1:
      start = play.find(rusher_name) + len(rusher_name) + 1
      end = play.find(i) + len(i)
      print(play[start:end])

  print()

(12:15) (Shotgun) I.Pacheco left guard to KC 24 for -3 yards (N.Bosa, J.Hargrave).
-3 Yard Run
I.Pacheco
left guard

(6:28) (Shotgun) I.Pacheco right guard to KC 21 for 10 yards (L.Ryan; J.Brown).
10 Yard Run
I.Pacheco
right guard

(4:36) (Shotgun) P.Mahomes scrambles up the middle to KC 21 for 4 yards (R.Gregory).
4 Yard Run
P.Mahomes
scrambles up the middle

(14:15) (Shotgun) I.Pacheco right guard to KC 34 for 2 yards (K.Givens).
2 Yard Run
I.Pacheco
right guard

3
FUMBLES (L.Ryan), recovered by KC-Ju.Watson at KC 37
L.Ryan
Ju.Watson to KC 39 for 2 yards (J.Brown).
['Ju.Watson ', '2 yards', 'J.Brown']
(13:41) R.Rice right tackle to KC 37 for 3 yards (L.Ryan; D.Greenlaw)
5 Yard Run
R.Rice
right tackle

(9:07) (Shotgun) I.Pacheco up the middle to KC 14 for 4 yards (F.Warner; J.Kinlaw).
4 Yard Run
I.Pacheco
up the middle

(8:35) (Shotgun) P.Mahomes scrambles right tackle to KC 17 for 3 yards (A.Armstead).
3 Yard Run
P.Mahomes
scrambles right tackle

(3:59) (Shotgun) I.Pacheco up the mid