<a href="https://colab.research.google.com/github/Keoni808/NFL_Data_Cleaning/blob/main/NFL_Plays_Week1_2023.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

PURPOSE:
- To view a larger sample size of plays.
  - Currently working on breaking down a single game but do not have enough data in that game to correctly break down all play descriptions for different play types.

NOTES:

# MOUNTING AND IMPORTS

In [None]:
# Mount Google Drive into this Colab session at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Authenticate the Colab user; used to access personal Google Cloud services
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

Authenticated


In [None]:
# Installs
!pip install ipdb

Collecting ipdb
  Downloading ipdb-0.13.13-py3-none-any.whl.metadata (14 kB)
Collecting jedi>=0.16 (from ipython>=7.31.1->ipdb)
  Using cached jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Downloading ipdb-0.13.13-py3-none-any.whl (12 kB)
Using cached jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
Installing collected packages: jedi, ipdb
Successfully installed ipdb-0.13.13 jedi-0.19.1


In [None]:
# Imports

# Data manipulation
import pandas as pd

# Regular expressions
import re

# Grab data from database
from google.cloud import bigquery

# # Debugging (Not going to use right now)
# import ipdb

In [None]:
# # Turning on automatic debugger
# %pdb on

# LOADING DATA (BigQuery queries)

In [None]:
# Connect a BigQuery client to the project that is queried below
client = bigquery.Client(project='nfl-data-430702')

## Season 2023 Week 1

In [None]:
# Grabbing all plays from Week 1 of the 2023 season
week1_2023_plays_query = """
                         SELECT *
                         FROM `nfl-data-430702.NFL_Scores.NFL-Plays-Week1_2023`
                         """

# Dry run of the query: reports the number of bytes the real query would process
dry_run_config = bigquery.QueryJobConfig(dry_run=True)
dry_run_query = client.query(week1_2023_plays_query, job_config=dry_run_config)
print("This query will process {} bytes.".format(dry_run_query.total_bytes_processed))

# Running the real query (being mindful of the amount of data being grabbed)
# - maximum_bytes_billed caps the job at 10**9 bytes (1 gigabyte)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**9)
safe_config_query = client.query(week1_2023_plays_query, job_config=safe_config)

This query will process 570194 bytes.


In [None]:
# Load the rows returned by the query into a pandas DataFrame
week1_2023_plays = safe_config_query.to_dataframe()

In [None]:
# Preview the first rows of the Week 1 plays
week1_2023_plays.head()

Unnamed: 0,Season,Week,Day,Date,AwayTeam,HomeTeam,Quarter,DriveNumber,TeamWithPossession,IsScoringDrive,PlayNumberInDrive,IsScoringPlay,PlayOutcome,PlayDescription,PlayStart
0,2023,Week 1,MON,09/11,Bills,Jets,1ST QUARTER,1,BUF,0,1,0,Kickoff,G.Zuerlein kicks 65 yards from NYJ 35 to end z...,Kickoff from NYJ 35
1,2023,Week 1,MON,09/11,Bills,Jets,1ST QUARTER,1,BUF,0,2,0,7 Yard Pass,(15:00) (Shotgun) J.Allen pass short right to ...,1st & 10 at BUF 25
2,2023,Week 1,MON,09/11,Bills,Jets,1ST QUARTER,1,BUF,0,3,0,5 Yard Pass,"(14:34) (No Huddle, Shotgun) J.Allen pass shor...",2nd & 3 at BUF 32
3,2023,Week 1,MON,09/11,Bills,Jets,1ST QUARTER,1,BUF,0,4,0,3 Yard Run,(14:01) J.Cook up the middle to BUF 40 for 3 y...,1st & 10 at BUF 37
4,2023,Week 1,MON,09/11,Bills,Jets,1ST QUARTER,1,BUF,0,5,0,2 Yard Run,(13:24) (Shotgun) J.Cook up the middle to BUF ...,2nd & 7 at BUF 40


In [None]:
# Amount of data being worked on, as (rows, columns)
week1_2023_plays.shape

(2600, 15)

# CATEGORIZE PLAYS
- The goal here is to parse out the different values for 'PlayOutcome'
  - separate pass / run / kickoff / etc.

## PARSING


In [None]:
# Maybe try to fuzzywuzzy this in the future?

# All distinct play outcomes across the Week 1 plays (every row loaded above, not a single game)
# - From here we can categorize and clean plays accordingly
week1_2023_plays['PlayOutcome'].unique()

array(['Kickoff', '7 Yard Pass', '5 Yard Pass', '3 Yard Run',
       '2 Yard Run', 'Pass Incomplete', 'Punt', '-5 Yard Penalty',
       '5 Yard Run', '1 Yard Pass', '14 Yard Run', '3 Yard Pass',
       '8 Yard Run', '6 Yard Pass', '15 Yard Pass', '-9 Yard Sack',
       '4 Yard Pass', '13 Yard Pass', 'Field Goal', '-2 Yard Sack',
       'Interception', '-5 Yard Run', '18 Yard Pass', '8 Yard Pass',
       '6 Yard Run', '12 Yard Run', '-1 Yard Run', '26 Yard Pass',
       'Touchdown Bills', 'Extra Point Good', '13 Yard Run',
       '-3 Yard Sack', '7 Yard Run', '9 Yard Pass', '4 Yard Run',
       'Fumble', '-10 Yard Penalty', '10 Yard Pass', '26 Yard Run',
       '5 Yard Penalty', '-10 Yard Sack', '22 Yard Pass', '-4 Yard Run',
       '-12 Yard Sack', '83 Yard Run', '1 Yard Run', '2 Yard Pass',
       '10 Yard Run', 'Run for No Gain', '12 Yard Pass', '20 Yard Pass',
       '9 Yard Run', '-2 Yard Pass', 'Sack', '24 Yard Pass',
       '14 Yard Pass', 'Touchdown Jets', '-3 Yard Run', '-2 Yar

In [None]:
# There are more types of plays that I have not made yet for Week 1.

# Looking at all unique play outcomes and categorizing them.
# - This type of approach does not feel very flexible because a play outcome can
#   arise that has not been seen yet.
# - There may be more in the future when working on a full season, let alone all seasons and future games
# - str.contains is a substring match, so 'Pass Incomplete' also lands in the
#   pass frame and 'Run for No Gain' in the run frame.
df_2023_pass_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Pass')]
df_2023_run_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Run')]

# TODO: remaining play types, not enabled yet
# df_2023_punt_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Punt')]
# df_2023_sack_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Sack')]
# df_2023_kickoff_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Kickoff')]
# df_2023_fumble_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Fumble')]
# df_2023_interception_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Interception')]
# df_2023_penalty_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Penalty')]
# df_2023_fieldgoal_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Field Goal')]
# df_2023_touchdown_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Touchdown')]
# df_2023_extrapoint_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Extra Point')]

# TODO: list of every category frame, for the sanity check that all plays are accounted for
# plays_list = [df_2023_pass_week1,
#               df_2023_run_week1,
#               df_2023_punt_week1,
#               df_2023_sack_week1,
#               df_2023_kickoff_week1,
#               df_2023_fumble_week1,
#               df_2023_interception_week1,
#               df_2023_penalty_week1,
#               df_2023_fieldgoal_week1,
#               df_2023_touchdown_week1,
#               df_2023_extrapoint_week1]

## SANITY CHECK (All Plays Accounted for)
- NOT COMPLETE
  - Still need to grab other play types

## HELPER METHODS (personal use)
- For personal use, does not actually take part in cleaning dataset at all.

In [None]:
# PURPOSE:
# - Quick look at a section of plays
#   - Ideally the plays that the user wants to break down and clean.
# INPUT PARAMETERS:
# df_all_plays      - DataFrame - The original dataframe where the desired plays to view came from
#                                 (must contain 'PlayDescription')
# df_section_plays  - DataFrame - A section of the original dataframe the user wants to view
#                                 (must contain 'PlayOutcome' and share index labels with df_all_plays)
# RETURN:
# - None. Printing to the console, for every play of the section:
#   1. index label of play
#   2. 'PlayDescription' feature of play
#   3. 'PlayOutcome' feature of play
def print_plays(df_all_plays, df_section_plays):
  descriptions = df_all_plays['PlayDescription']
  for idx, outcome in df_section_plays['PlayOutcome'].items():
    print("index:" + str(idx))
    # .items() yields index LABELS, so the lookup has to be label-based (.loc).
    # Positional .iloc only agrees while the index is an untouched 0..n-1 range.
    print(descriptions.loc[idx])
    print(outcome)
    print()

# PIPELINE

## CLEANING METHODS
- Will contain all methods to clean every type of play
- Uses the raw data in the feature 'PlayOutcome' to parse plays into different categories

In [None]:
####################################################
# REGULAR EXPRESSIONS USED TO LOCATE SPECIFIC DATA #
####################################################
# All patterns are raw strings so that "\." and "\(" reach the regex engine
# untouched instead of being treated as (invalid) Python string escapes.

################
# PLAY DETAILS #
################

time_on_clock_pattern = r'\(\d*:\d+\)'                 # e.g. "(14:01)"
formation = r'\([A-Za-z]+ ?[A-Za-z]*,? ?[A-Za-z]*\)'   # e.g. "(Shotgun)" or "(No Huddle, Shotgun)"
yardage_gained = r'for -?[0-9]+ yards?'                # e.g. "for 3 yards", "for -1 yards"

#################
# NAMES OFFENSE #
#################

name_pattern = r'\b[A-Za-z]+\.[A-Za-z]+-?[A-Za-z]*\b' # Grabs all names but will only be used for Passer
receiver_name_pattern = r'\b [A-Za-z]+\.[A-Za-z]+-?[A-Za-z]*\b' # Receivers have a space before their name
rusher_pattern = r'\b[A-Za-z]+\.[A-Za-z]+-?[A-Za-z]* \b' # Runningbacks, like quarterbacks, are the first names in play descriptions

#################
# NAMES DEFENSE #
#################

defense_tackler_1_name_pattern = r'\([A-Za-z]+\.[A-Za-z]+-?[A-Za-z]*' # Will have a "(" in front of the name
defense_tackler_2_name_pattern = r' [A-Za-z]+\.[A-Za-z]+-?[A-Za-z]*\)' # Will have a ")" at the end of the name
defense_pressure_name_pattern = r'\[[A-Za-z]+\.[A-Za-z]+-?[A-Za-z]*\]' # Surrounded by "[]" brackets

########################
# TEAM IDENTIFIED NAME #
########################

team_identified_name = r'-[A-Za-z]+\.[A-Za-z]+-?[A-Za-z]*' # team initials comes before their name (e.g. KC-B.Bob).
                                                           # - This occurs when there is an injury, penalty, fumble recovery.

#######################
# PATTERNS ON FUMBLES #
#######################

qb_fumble = r"[A-Za-z]+\.[A-Za-z]+-?[A-Za-z]* to [A-Z]+ [0-9]+ for -?[0-9]+ yards$" # When quarterbacks fumble the ball after snap(?)
run_after_recovery = r"^[A-Za-z]+\.[A-Za-z]+-?[A-Za-z]* to [A-Z]+ [0-9]+ for " # yardage after recovery (formatted almost exactly like a regular run play)


### HELPER METHODS (Pass & Run)
- Methods that will help break down unique situations that happen during ordinary plays.
- ADDRESSED SITUATIONS:
  1. Fumbles ( pass / run )
    - Still in the making


##### Pass Fumble Plays

In [None]:
# THIS IS ONLY FOR PASSING RIGHT NOW.

# Regular expression used to grab QB only fumbles.
# Example - "(14:21) J.Love to CHI 44 for -3 yards"
# NOTE:
# - There are other plays that follow this format.
#   So far I have seen:
#   1. P.Campbell to NYG 33 for -2 yards
#      - What looks like to be an ordinary run play
# - Raw string: "\." must reach the regex engine as an escaped dot.
# - Same pattern as the qb_fumble in the regex constants cell; repeated here so
#   this cell can be run on its own.
qb_fumble = r"[A-Za-z]+\.[A-Za-z]+-?[A-Za-z]* to [A-Z]+ [0-9]+ for -?[0-9]+ yards$"

# Regular expression used for players who recovered the fumbled ball.
# Example: NYG-P.Campbell
# - The part before the "." accepts lowercase letters as well, so that names
#   written like "Bre.Hall" or "Qu.Williams" are recognized too.
fumble_recoverer = r"[A-Z]+-[A-Za-z]+\.[A-Za-z]+-?[A-Za-z]*"

# PURPOSE:
# - Extract fumble data from fumble plays.
#   - Goal is to strictly grab data that can only appear during fumbled plays,
#     the rest of the data will go down through the pipeline.
# INPUT PARAMETERS:
# play - string - 'PlayDescription' of a play; its sentences are separated by ". "
# RETURN:
# extracted_fumble_details - list - one slot per sentence of the play; a slot holds
#                                   the sentence when it is a fumble-only detail,
#                                   otherwise None
# push_back_to_pipeline    - list - the remaining sentences, in their original order,
#                                   to be handled by the regular play cleaning
def extract_fumble_data(play):

  # Every action of the play is recorded into sentences that can be broken down.
  play_elements = play.split(". ")
  # Collecting fumble data in the exact order in which it happened.
  extracted_fumble_details = [None] * len(play_elements)
  # Positions (not texts) of the sentences that go back to the pipeline.
  # Tracking positions keeps repeated sentences apart: after a REVERSED ruling
  # the description usually repeats earlier sentences word for word.
  pipeline_positions = []
  # When traversing through each element, some elements will signal that
  # the next element is a detail exclusively found in fumble plays.
  automatic_fumble_detail_add = False

  for position, element in enumerate(play_elements):
    if automatic_fumble_detail_add:
      extracted_fumble_details[position] = element
      automatic_fumble_detail_add = False
      continue

    # Every sentence starts out as pipeline material, then is shaved off if necessary.
    pipeline_positions.append(position)

    # QB only fumbles
    # (e.g. '(14:21) J.Love to CHI 44 for -3 yards.')
    if len(re.findall(qb_fumble, element)) == 1:
      # A wanted element (QB only fumble) does NOT follow a sentence stating
      # that the ball has been fumbled; the first sentence has nothing before it.
      follows_fumble = position > 0 and 'FUMBLES' in play_elements[position - 1]
      if follows_fumble:
        continue
      pipeline_positions.pop()
      extracted_fumble_details[position] = element

    # Fumble and recovery
    # If the person who recovered the ball then goes on to run the ball after,
    # their yardage gained from that run will be automatically added to extracted_fumble_details
    if 'FUMBLES' in element:
      recoverer = re.findall(fumble_recoverer, element)
      if recoverer and position + 1 < len(play_elements):
        # Drop the team prefix: "NYG-P.Campbell" -> "P.Campbell"
        player_who_recovered_ball = recoverer[0][recoverer[0].find("-") + 1:]
        if player_who_recovered_ball in play_elements[position + 1]:
          automatic_fumble_detail_add = True
      # The sentence may already have been removed by the QB-fumble branch above.
      if pipeline_positions and pipeline_positions[-1] == position:
        pipeline_positions.pop()
      extracted_fumble_details[position] = element

    # Reversed
    # If play has been reversed, the only offensive stats recorded are the
    # sentences that follow the play reversal.
    if 'REVERSED' in element:
      for earlier_position in pipeline_positions:
        extracted_fumble_details[earlier_position] = play_elements[earlier_position]
      pipeline_positions.clear()

  push_back_to_pipeline = [play_elements[p] for p in pipeline_positions]
  return extracted_fumble_details, push_back_to_pipeline

In [None]:
# Run the fumble extraction on every pass play whose description mentions a fumble
for idx, value in df_2023_pass_week1['PlayOutcome'].items():
  # idx is an index label (from .items()), so the lookup is label-based
  play = week1_2023_plays['PlayDescription'].loc[idx]
  if 'FUMBLES' in play:
    fumble_details, main_play = extract_fumble_data(play)
    print(fumble_details)
    print(main_play)
    print(". ".join(main_play))
    print(value)
    print()

['(14:21) J.Love to CHI 44 for -3 yards', 'FUMBLES, and recovers at CHI 46', None]
['J.Love pass deep left to L.Musgrave to CHI 4 for 37 yards (T.Stevenson) [D.Walker].']
J.Love pass deep left to L.Musgrave to CHI 4 for 37 yards (T.Stevenson) [D.Walker].
37 Yard Pass

['(14:15) T.Lawrence pass short right to C.Ridley to JAX 47 for 14 yards (R.Thomas, E.Speed)', 'FUMBLES (E.Speed), RECOVERED by IND-E.Speed at IND 49', 'E.Speed ran ob at IND 49 for no gain', 'The Replay Official reviewed the ball was inbounds ruling, and the play was REVERSED', None, 'FUMBLES (E.Speed), ball out of bounds at IND 49', None, None]
['T.Lawrence pass short right to C.Ridley to JAX 47 for 14 yards (R.Thomas, E.Speed)', 'IND-K.Moore was injured during the play', 'IND-D.Flowers was injured during the play.']
T.Lawrence pass short right to C.Ridley to JAX 47 for 14 yards (R.Thomas, E.Speed). IND-K.Moore was injured during the play. IND-D.Flowers was injured during the play.
14 Yard Pass

[None, 'FUMBLES (B.Okere

In [None]:
# Print every fumbled pass play sentence by sentence, followed by its 'PlayOutcome'
for idx, value in df_2023_pass_week1['PlayOutcome'].items():
  # idx is an index label (from .items()), so the lookup is label-based
  play = week1_2023_plays['PlayDescription'].loc[idx]
  if 'FUMBLES' in play:
    print("index:" + str(idx))
    fumble_play_elements = play.split(". ")
    for i in fumble_play_elements:
      print(i)
    print(value)
    print()

index:213
(14:21) J.Love to CHI 44 for -3 yards
FUMBLES, and recovers at CHI 46
J.Love pass deep left to L.Musgrave to CHI 4 for 37 yards (T.Stevenson) [D.Walker].
37 Yard Pass

index:423
(14:15) T.Lawrence pass short right to C.Ridley to JAX 47 for 14 yards (R.Thomas, E.Speed)
FUMBLES (E.Speed), RECOVERED by IND-E.Speed at IND 49
E.Speed ran ob at IND 49 for no gain
The Replay Official reviewed the ball was inbounds ruling, and the play was REVERSED
T.Lawrence pass short right to C.Ridley to JAX 47 for 14 yards (R.Thomas, E.Speed)
FUMBLES (E.Speed), ball out of bounds at IND 49
IND-K.Moore was injured during the play
IND-D.Flowers was injured during the play.
14 Yard Pass

index:872
(11:26) (Shotgun) D.Prescott pass short right to T.Pollard to NYG 12 for 7 yards (B.Okereke)
FUMBLES (B.Okereke), recovered by DAL-T.Biadasz at NYG 4.
7 Yard Pass

index:961
(4:45) (Shotgun) D.Jones pass short left to M.Breida to NYG 43 for 5 yards (M.Bell)
FUMBLES (M.Bell), recovered by NYG-P.Campbell at 

##### Run Fumble Plays

In [None]:
##################
# DESIGNING AREA #
##################

# ADDING A NEW ROW AT A DESIRED INDEX WHILE CLEANING DATAFRAME:
# - I need this right now (probably will need for other things in the future)
#   for run plays that involve fumble recoveries for gain.
#   - When these plays come up, they are usually formatted as:
#     1. Rusher goes for x amount of yards
#     2. fumbled somehow
#     3. player recovers fumble and runs x amount of yards.
#     - Because this is all in 1 play, I would like to split it into 2
#       (1. and 3. having their own separate rows)

# PLAN:
# - Once signaled that a new row needs to be made:
#   1. Get index of play that needs new row (1. and 3. are grouped)
#   2. Create inputs for 2 separate rows by splitting single play
#   3. Change data of original play index to match 1.
#   4. Input new row directly after index of 1.
#      4.1 concat df_before_input, input, df_after_input
#      4.2 reset index of dataframe
#   5. Continue cleaning (CRUCIAL STEP)
#      - There are 3 separate methods in play here within my pipeline.
#        1. the main pipeline method that holds all methods that clean
#           specific plays within the dataframe input.
#           (INPUT: dataframe to be cleaned
#            OUTPUT: cleaned dataframe)
#            - Currently I have it designed that the original input dataframe
#              is the same dataframe that will be output.
#              - This might be a problem. Maybe it would be better to have
#                a separate dataframe returned instead of adjusting the
#                original that was input.
#        2.
#
#
#     - Issue here is that all plays after this will have a different
#       index. If the loop continues, the indexes will point to the
#       wrong plays.
#       - ~ SOLUTION:
#           - memorize the last index of play that needed the addition of a new row
#             - Relocate all specified type of plays within dataframe and have
#               the cleaning method start where it left off.

In [None]:
# Version 2

# NOTES:
# - I think "PlayOutcomes" is what determines the yardage gained on an intended play?
#   - This does not seem right to me.
#   - EXAMPLE:
#     - (9:54) Bre.Hall left end to BUF 22 for -1 yards (G.Rousseau)
#       FUMBLES (G.Rousseau), ball out of bounds at BUF 25.
#       - I would think that Bre.Hall would get docked -1 yards for his run.
#         - But I believe that he is actually docked -4
#           - 'PlayStart' = 2nd & 9 at BUF 21
#           - The play ends at BUF 25
#             - In my opinion and how I am going to track yardage is based on
#               possession of the ball. So I will track this as -1 yard not -4.

# PURPOSE:
# - Extract fumble details and push back data from plays that can be broken down by the
#   main play cleaning method.
# - When a sentence describes yardage after a fumble recovery (run_after_recovery
#   pattern), the play is split in two instead: the current row keeps everything
#   before that sentence and a new row is built for the rest.
# INPUT PARAMETERS:
# df_plays   - DataFrame - plays being cleaned. MUTATED when a play is split: the
#                          'PlayDescription' at play_index is shortened in place.
# play       - string    - 'PlayDescription' of current focused play that contains a fumble
# play_index - index label of the focused play within df_plays
# OUTPUT (always 4 values):
# - No split needed: (extracted_fumble_details, push_back_to_pipeline, None, None)
#   extracted_fumble_details - list - one slot per sentence; holds the sentence when it is a
#                                     fumble detail ('Aborted' / 'FUMBLES'), otherwise None
#                                     - The reason for this is to save space. It does not
#                                       make sense to have features for this data when
#                                       1/100 plays will contain a fumble.
#   push_back_to_pipeline    - list - All details of the fumbled play that can be broken
#                                     down by the main play cleaning method.
# - Split performed: (None, None, df_added_row, play_index)
#   df_added_row - DataFrame - single row, copy of the focused play whose 'PlayDescription'
#                              starts at the recovery-run sentence. It is NOT inserted into
#                              df_plays here; the caller has to do that.
# TODO:
# - Insert the new row right after play_index (concat before / new row / after, then
#   reindex) and have the cleaning method resume from the added row.
# - What happens if it's a fumble after a fumble? The new row keeps the rest of the
#   description so that a later recovery can trigger another split.
# NOTE(review): a description whose FIRST sentence matches run_after_recovery is split
#   at position 0 (original row left with an empty description) - confirm this is intended.

def extract_fumble_data_run(df_plays, play, play_index):
  play_elements = play.split(". ")
  extracted_fumble_details = [None] * len(play_elements)
  # Positions (not texts) of sentences handed back to the pipeline, so that
  # sentences repeated inside one description are kept apart.
  pipeline_positions = []

  for position, element in enumerate(play_elements):
    pipeline_positions.append(position)

    # - Sometimes this play is the only sentence within 'PlayDescription'
    #   - I will need it to be broken down in the main method for play details
    #     such as 'PlayStart' and 'Formation' (formation, quarterback || rusher).
    #   - It will be within fumble_details as well to signify that it was in fact
    #     a fumble.
    if 'Aborted' in element:
      extracted_fumble_details[position] = element
      continue

    if 'FUMBLES' in element:
      pipeline_positions.pop()
      extracted_fumble_details[position] = element

    # This checks for plays after a fumble recovery.
    # When this passes, the current play description
    # will be split into 2. Each given their own row
    # to be broken down separately.
    # 1. before fumble recovery
    # 2. after fumble recovery
    if re.findall(run_after_recovery, element):
      # Current row keeps everything right before the recovery-run sentence.
      df_plays.at[play_index, 'PlayDescription'] = ". ".join(play_elements[:position])

      # New row: same play (label-based lookup, matching the .at above), so
      # 'PlayOutcome' and every other feature are inherited from the copy.
      # - Keeping the original 'PlayOutcome' groups the new row with the same
      #   targeted plays and shows that it is still the same play.
      added_row = df_plays.loc[play_index].copy()
      added_row['PlayDescription'] = ". ".join(play_elements[position:])
      df_added_row = pd.DataFrame([added_row], columns=df_plays.columns)

      return None, None, df_added_row, play_index

  push_back_to_pipeline = [play_elements[p] for p in pipeline_positions]
  return extracted_fumble_details, push_back_to_pipeline, None, None

In [None]:
# Print every fumbled run play sentence by sentence
for idx, value in df_2023_run_week1['PlayOutcome'].items():
  # idx is an index label (from .items()), so the lookup is label-based
  play = week1_2023_plays['PlayDescription'].loc[idx]
  if 'FUMBLES' in play:
    fumble_play_elements = play.split(". ")
    print("index: " + str(idx))
    for i in fumble_play_elements:
      print(i)
    print()

index: 115
(9:54) Bre.Hall left end to BUF 22 for -1 yards (G.Rousseau)
FUMBLES (G.Rousseau), ball out of bounds at BUF 25.

index: 230
(2:08) S.Clifford FUMBLES (Aborted) at CHI 35, and recovers at CHI 35.

index: 756
(6:44) (Shotgun) J.Goff Aborted
F.Ragnow FUMBLES at KC 24, recovered by DET-J.Goff at KC 27
J.Goff to KC 27 for no gain (G.Karlaftis).

index: 826
(8:53) (Shotgun) D.Jones Aborted
J.Schmitz FUMBLES at DAL 18, recovered by NYG-D.Jones at DAL 27.

index: 933
(9:27) (Shotgun) D.Jones FUMBLES (Aborted) at NYG 30, and recovers at NYG 30
D.Jones to NYG 32 for 2 yards (M.Smith).

index: 1015
(6:33) (No Huddle, Shotgun) L.Jackson scrambles right end to HOU 20 for 6 yards (T.Thomas)
FUMBLES (T.Thomas), recovered by BAL-K.Zeitler at HOU 23
HOU-H.Ridgeway was injured during the play.

index: 1214
(1:39) J.Williams right tackle to TEN 9 for 11 yards (K.Byard, S.Murphy-Bunting)
FUMBLES (S.Murphy-Bunting), and recovers at TEN 9.

index: 1343
(3:02) T.Munford reported in as eligible
 J

In [None]:
# Maybe I should make a copy of the original df before changing things inside?
# - Although in a pipeline, the dataframe has to change. So maybe I'll make a copy
#   within the cleaning method?
# - Working on a copy here because extract_fumble_data_run rewrites
#   'PlayDescription' in the frame it is given whenever it splits a play.
week1_2023_plays_copy = week1_2023_plays.copy()

for idx, value in df_2023_run_week1['PlayOutcome'].items():
  # idx is an index label (from .items()), so the lookup is label-based
  play = week1_2023_plays_copy['PlayDescription'].loc[idx]
  if 'FUMBLES' in play:
    print("index: " + str(idx))
    fumble_details, initial_play, adjusted_df, index = extract_fumble_data_run(week1_2023_plays_copy, play, idx)

    # A returned index means the play was split into two rows; there are no
    # fumble details to show for it.
    if index is not None:
      print()
      continue

    print(fumble_details)
    print(initial_play)
    print()

index: 115
[None, 'FUMBLES (G.Rousseau), ball out of bounds at BUF 25.']
['(9:54) Bre.Hall left end to BUF 22 for -1 yards (G.Rousseau)']

index: 230
['(2:08) S.Clifford FUMBLES (Aborted) at CHI 35, and recovers at CHI 35.']
['(2:08) S.Clifford FUMBLES (Aborted) at CHI 35, and recovers at CHI 35.']

index: 756

index: 826
['(8:53) (Shotgun) D.Jones Aborted', 'J.Schmitz FUMBLES at DAL 18, recovered by NYG-D.Jones at DAL 27.']
['(8:53) (Shotgun) D.Jones Aborted']

index: 933

index: 1015
[None, 'FUMBLES (T.Thomas), recovered by BAL-K.Zeitler at HOU 23', None]
['(6:33) (No Huddle, Shotgun) L.Jackson scrambles right end to HOU 20 for 6 yards (T.Thomas)', 'HOU-H.Ridgeway was injured during the play.']

index: 1214
[None, 'FUMBLES (S.Murphy-Bunting), and recovers at TEN 9.']
['(1:39) J.Williams right tackle to TEN 9 for 11 yards (K.Byard, S.Murphy-Bunting)']

index: 1343
[None, ' J.Garoppolo FUMBLES (Aborted) at DEN 1, and recovers at DEN 1.']
['(3:02) T.Munford reported in as eligible', ' J

In [None]:
# Print the full description of every run play
for idx, value in df_2023_run_week1['PlayOutcome'].items():
  # idx is an index label (from .items()), so the lookup is label-based
  play = week1_2023_plays['PlayDescription'].loc[idx]
  print("index: " + str(idx))
  print(play)
  print()

index: 3
(14:01) J.Cook up the middle to BUF 40 for 3 yards (J.Johnson, J.Franklin-Myers).

index: 4
(13:24) (Shotgun) J.Cook up the middle to BUF 42 for 2 yards (Q.Williams; J.Franklin-Myers).

index: 8
(9:24) (Shotgun) J.Cook right tackle to BUF 23 for 5 yards (Qu.Williams).

index: 10
(8:02) (Shotgun) J.Allen scrambles up the middle to BUF 38 for 14 yards (D.Reed).

index: 12
(6:52) (No Huddle, Shotgun) J.Cook up the middle to BUF 49 for 8 yards (D.Reed).

index: 23
(10:35) (Shotgun) J.Cook left end to BUF 20 for -5 yards (Q.Williams).

index: 27
(8:40) (Shotgun) J.Allen scrambles left end pushed ob at NYJ 48 for 6 yards (Q.Jefferson; C.Mosley).

index: 28
(7:59) J.Cook right end ran ob at NYJ 36 for 12 yards (J.Sherwood).

index: 29
(7:32) (No Huddle, Shotgun) J.Cook up the middle to NYJ 37 for -1 yards (Qu.Williams, A.Gardner).

index: 32
(5:34) (Shotgun) D.Harris up the middle to NYJ 5 for 3 yards (Qu.Williams).

index: 36
(2:36) (Shotgun) J.Cook left tackle pushed ob at NYJ 31 f

In [None]:
# Print every play (of any type) whose ruling was REVERSED, sentence by
# sentence, followed by its 'PlayOutcome'
for idx, value in week1_2023_plays['PlayOutcome'].items():
  # idx is an index label (from .items()), so the lookup is label-based
  play = week1_2023_plays['PlayDescription'].loc[idx]
  if 'REVERSED' in play:
    print("index:" + str(idx))
    elements = play.split(". ")
    for i in elements:
      print(i)
    print()
    print(value)
    print()

index:423
(14:15) T.Lawrence pass short right to C.Ridley to JAX 47 for 14 yards (R.Thomas, E.Speed)
FUMBLES (E.Speed), RECOVERED by IND-E.Speed at IND 49
E.Speed ran ob at IND 49 for no gain
The Replay Official reviewed the ball was inbounds ruling, and the play was REVERSED
T.Lawrence pass short right to C.Ridley to JAX 47 for 14 yards (R.Thomas, E.Speed)
FUMBLES (E.Speed), ball out of bounds at IND 49
IND-K.Moore was injured during the play
IND-D.Flowers was injured during the play.

14 Yard Pass

index:1135
N.Folk kicks 63 yards from TEN 35 to NO 2
R.Shaheed to NO 21 for 19 yards (A.Hooker)
FUMBLES (A.Hooker), ball out of bounds at NO 24
PENALTY on NO-A.Prentice, Offensive Holding, 10 yards, enforced at NO 20
Tennessee challenged the ball was out of bounds ruling, and the play was REVERSED
N.Folk kicks 63 yards from TEN 35 to NO 2
R.Shaheed to NO 21 for 19 yards (A.Hooker)
FUMBLES (A.Hooker), RECOVERED by TEN-A.Hooker at NO 24
Penalty on NO-A.Prentice, Offensive Holding, declined.


### RUN PLAYS

In [None]:
# Direction keywords that end the "direction" phrase following the rusher's
# name. Matched as whole words so that e.g. 'end' does not match inside a
# player name such as "Henderson".
_RUSH_DIRECTION_PATTERN = re.compile(r'\b(?:guard|middle|tackle|end|kneels)\b')


def _split_after_marker(play, marker):
  """Cut a play description after the first sentence that contains `marker`.

  The description is split on ". ". Returns (leading, remainder) where
  `leading` is the list of sentences up to and including the first one that
  contains `marker`, and `remainder` is the rest re-joined with ". ".
  When no sentence contains `marker`, returns ([], play) unchanged.
  """
  sentences = play.split(". ")
  for position, sentence in enumerate(sentences):
    if marker in sentence:
      return sentences[:position + 1], ". ".join(sentences[position + 1:])
  return [], play


def clean_run_plays(df_plays, index_start = None):
  """Parse every run play in `df_plays` into the cleaned feature columns.

  PURPOSE:
  - A run play is any row whose 'PlayOutcome' contains 'Run'. For each one the
    'PlayDescription' text is parsed and written into 'PlayType',
    'TimeOnTheClock', 'ReverseDetails', 'FumbleDetails', 'Yardage',
    'Formation', 'Rusher', 'Direction', 'TackleBy1' and 'TackleBy2'.
  - Uses notebook-level names defined outside this cell: the regex patterns
    time_on_clock_pattern, yardage_gained, formation, rusher_pattern,
    defense_tackler_1_name_pattern, defense_tackler_2_name_pattern, and the
    helper extract_fumble_data_run.

  INPUT PARAMETERS:
  df_plays    - dataframe - plays to clean; written to in place. Row labels are
                            also used as positions when a row is inserted, so
                            a default 0..n-1 index is assumed.
  index_start - row label - when given, only run plays whose label is
                            >= index_start are processed (used to resume after
                            a row has been inserted). None processes all.

  OUTPUT:
  dataframe - the cleaned plays. This is `df_plays` itself unless a fumble
              required inserting a row, in which case it is a NEW dataframe
              containing the extra row(s); callers should use the returned
              frame rather than relying on in-place mutation.
  """

  # All run plays within df_plays (rows with a missing outcome are not run plays)
  df_run_plays = df_plays[df_plays['PlayOutcome'].str.contains('Run', na=False)]

  # Resume point: keep only run plays from index_start onward. A label
  # comparison is used so a start label that is not itself a run play does not
  # raise (list.index would raise ValueError).
  if index_start is not None:
    df_run_plays = df_run_plays[df_run_plays.index >= index_start]

  # Iterating through all run plays
  for idx in df_run_plays.index:
    # Read by label, consistent with the label-based writes below
    play = df_plays.at[idx, 'PlayDescription']

    ################
    # Play details #
    ################

    # Play Type
    # Should be different for aborted or fumble plays?
    df_plays.loc[idx, 'PlayType'] = 'Run'

    # TimeOnTheClock (first match with its first and last character dropped)
    clock_matches = re.findall(time_on_clock_pattern, play)
    if clock_matches:
      df_plays.loc[idx, 'TimeOnTheClock'] = clock_matches[0][1:-1]

    ############
    # REVERSES #
    ############

    # Everything up to and including the REVERSED sentence is stored as
    # ReverseDetails; only the text after it is parsed as the actual play.
    reverse_details, play = _split_after_marker(play, 'REVERSED')
    if reverse_details:
      df_plays.at[idx, 'ReverseDetails'] = reverse_details

    ############################
    # REPORTING IN AS ELIGIBLE #
    ############################

    # Not considered useful data: drop everything up to and including the
    # "reported in as eligible" sentence.
    _, play = _split_after_marker(play, 'reported in as eligible')

    ###########
    # FUMBLES #
    ###########

    if 'FUMBLES' in play:
      # NOTE(review): extract_fumble_data_run is defined elsewhere. From its
      # use here it appears to return either
      #   (fumble_details, remaining play sentences, <unused>, None) when no
      #     row has to be added, or
      #   (<unused>, <unused>, new_row, checkmarker_index) when the fumble
      #     needs an additional row inserted after checkmarker_index.
      # Confirm against its definition.
      fumble_details, play, new_row, checkmarker_index = extract_fumble_data_run(df_plays, play, idx)

      if checkmarker_index is not None:
        # Insert the new row right after checkmarker_index. This builds a new,
        # re-indexed dataframe, so cleaning restarts on it from
        # checkmarker_index and the rebuilt frame is handed back to the caller.
        # NOTE(review): the restart re-processes row checkmarker_index itself;
        # termination relies on extract_fumble_data_run not requesting another
        # insertion for that row - confirm.
        df_plays = pd.concat(
            [df_plays.iloc[:checkmarker_index+1], new_row, df_plays.iloc[checkmarker_index+1:]],
            ignore_index=True)
        return clean_run_plays(df_plays, checkmarker_index)

      df_plays.at[idx, 'FumbleDetails'] = fumble_details
      play = ". ".join(play)

    ###########
    # YARDAGE #
    ###########

    # Second whitespace-separated token of the first match; 0 when no match
    yardage = re.findall(yardage_gained, play)
    df_plays.loc[idx, 'Yardage'] = int(yardage[0].split()[1]) if yardage else 0

    ##########################
    # MISTAKES AND TURNOVERS #
    ##########################

    if 'PENALTY' in play:
      # Penalty parsing is not implemented yet; the remaining fields are
      # skipped for penalty plays.
      continue

    #############
    #  OFFENSE  #
    #############

    # Formation ('Aborted' is not a formation, so it is not recorded)
    formation_matches = re.findall(formation, play)
    if formation_matches and formation_matches[0][1:-1] != 'Aborted':
      df_plays.loc[idx, 'Formation'] = formation_matches[0][1:-1]

    # Rusher - the pattern may match several names; only the first is the rusher.
    # Skipped (instead of raising IndexError) when the pattern finds nothing.
    rusher_names = re.findall(rusher_pattern, play)
    if rusher_names:
      rusher_name = rusher_names[0][:-1]
      df_plays.loc[idx, 'Rusher'] = rusher_name

      # Direction - text from just after the rusher's name through the first
      # direction keyword that follows it.
      start = play.find(rusher_name) + len(rusher_name) + 1
      direction_match = _RUSH_DIRECTION_PATTERN.search(play, start)
      if direction_match:
        df_plays.loc[idx, 'Direction'] = play[start:direction_match.end()]

    #############
    #  DEFENSE  #
    #############

    tackler_1 = re.findall(defense_tackler_1_name_pattern, play) # tackler #1 (Could be solo or the one who initiated the hit)
    if tackler_1:
      df_plays.loc[idx, 'TackleBy1'] = tackler_1[0][1:]
    tackler_2 = re.findall(defense_tackler_2_name_pattern, play) # tackler #2 (equally contributed or assisted with tackle)
    if tackler_2:
      df_plays.loc[idx, 'TackleBy2'] = tackler_2[0][1:-1]

  return df_plays

## PIPELINE MAIN METHOD
- Single entry point that wraps every play-type-specific cleaning method.

In [None]:
def clean_dataframe_of_plays(df_all_plays):
  """Return a cleaned copy of `df_all_plays` with the parsed feature columns added.

  PURPOSE:
  - Simply accept a dataframe of plays and return a dataframe with those plays cleaned.

  INPUT PARAMETERS:
  df_all_plays         - dataframe - all plays in raw form from NFL_Scraper that user
                                     would like to clean. It is copied, not modified.

  OUTPUT:
  df_all_plays_cleaned - dataframe - all plays from 'df_all_plays' cleaned and data
                                     dispersed into individual new features.

  CURRENT DESIGN PLAN:
  1. clean_dataframe_of_plays
    1.1 Get indexes of a single category of play type within dataframe
        (e.g. pass, run, touchdown, punt, sack, ... )
    1.2 Use uniquely designed method for play type to clean within dataframe
    1.3 Repeat until all plays within dataframe have been cleaned.
       NOTE:
       - It is important to fully clean a play type before moving to the next
         because sometimes cleaning could involve adding a new row to the dataframe,
         causing a reset to the dataframes indexing.
         - If we were to separate all play types from the beginning, the indexes
           could shift around causing an index that might be pointing to a run play
           to now pointing at a pass play.
  """

  ###########################
  # NEW COLUMN DESCRIPTIONS #
  ###########################

  # PlayType           - The type of play (e.g. pass/run)
  # TimeOnTheClock     - The time that was on the clock when the play started
  # Formation          - Play formation
  # Passer             - Player that threw the ball (mostly the quarterback)
  # Rusher             - Player that ran the ball (mostly the runningback)
  # Receiver           - Player on the same team as the passer that caught the ball
  # PassType           - Whether the pass was a deep or short pass?
  # Direction          - Where the ball is going during the play
  # Yardage            - Total yardage gained on intended play (yardage gained from
  #                      penalties and fumble recoveries do not count)
  # TackleBy1          - Main tackler on the play (could be solo or could be with someone else)
  # TackleBy2          - Assisted tackler1
  # PressureBy         - Defender that applied pressure to the passer
  # FumbleDetails      - A list that has what happened after the fumble
  #                      - [forced fumble by, recovered by, yards gained, tackled by]
  # ReverseDetails     - A list having plays leading up to play reversal
  # InjuredPlayers     - Players that were injured during the play
  # PenaltyDescription - If there is a penalty, gives a description of it
  #                      - [who caused the penalty, what was the penalty, yards lost if penalty accepted]

  new_columns = ["PlayType", "TimeOnTheClock", "Formation", "Passer", "Rusher", "Receiver", "PassType", "Direction", "Yardage",
                "TackleBy1", "TackleBy2", "PressureBy",
                "FumbleDetails", "ReverseDetails",
                "InjuredPlayers", "PenaltyDescription"]

  # Numeric columns; stored as float so that missing values can stay NaN.
  int_columns = ["Yardage"]

  # Every other new column holds text (or a list), derived from new_columns so
  # the two lists cannot drift apart.
  string_columns = [column for column in new_columns if column not in int_columns]

  ########################################
  # RETURN DATAFRAME WITH ADDED FEATURES #
  ########################################

  df_all_plays_cleaned = df_all_plays.copy()
  df_all_plays_cleaned = df_all_plays_cleaned.reindex(columns=df_all_plays_cleaned.columns.tolist() + new_columns)
  # Note: astype(str) turns the NaN placeholders into the literal string 'nan'.
  df_all_plays_cleaned[string_columns] = df_all_plays_cleaned[string_columns].astype(str)
  df_all_plays_cleaned[int_columns] = df_all_plays_cleaned[int_columns].astype(float)

  ########################################
  # GETTING PLAY CATEGORIES AND CLEANING #
  ########################################

  # Cleaning a play type can insert rows, which cannot be done in place on the
  # frame passed in. If the cleaner hands back a frame, that is the up-to-date
  # one; a None return means the frame was only modified in place.
  run_cleaned = clean_run_plays(df_all_plays_cleaned)
  if run_cleaned is not None:
    df_all_plays_cleaned = run_cleaned

  return df_all_plays_cleaned

In [None]:
# Run the cleaning pipeline on a copy of the loaded plays so the original
# frame stays untouched; `df` is the cleaned result inspected in the cells below.
week1_2023_plays_copy = week1_2023_plays.copy()

df = clean_dataframe_of_plays(week1_2023_plays_copy)

Season                                                             2023
Week                                                             Week 1
Day                                                                 SUN
Date                                                              09/10
AwayTeam                                                        Cowboys
HomeTeam                                                         Giants
Quarter                                                     3RD QUARTER
DriveNumber                                                           2
TeamWithPossession                                                  NYG
IsScoringDrive                                                        0
PlayNumberInDrive                                                     4
IsScoringPlay                                                         0
PlayOutcome                                             Run for No Gain
PlayDescription       (8:49) (Shotgun) D.Jones up the middle to 

In [None]:
# Dimensions (rows, columns) of the plays frame before cleaning
week1_2023_plays.shape

(2600, 15)

In [None]:
# Dimensions (rows, columns) of the frame returned by clean_dataframe_of_plays
df.shape

(2600, 31)

In [None]:
# Spot-check the cleaned row at position 756
df.iloc[756]

Unnamed: 0,756
Season,2023
Week,Week 1
Day,THU
Date,09/07
AwayTeam,Lions
HomeTeam,Chiefs
Quarter,2ND QUARTER
DriveNumber,2
TeamWithPossession,DET
IsScoringDrive,0


In [None]:
# Cleaned run plays from games in which the Bills were the away team
bills_jets_run_plays = df[df['PlayType'].eq('Run') & df['AwayTeam'].eq('Bills')]

In [None]:
# Distinct 'Rusher' values among the selected run plays
bills_jets_run_plays['Rusher'].unique()

array(['J.Cook', 'J.Allen', 'D.Harris', 'D.Harty', 'L.Murray', 'Bre.Hall',
       'D.Cook', 'Z.Wilson', 'Mi.Carter'], dtype=object)

In [None]:
# Total 'Yardage' over the selected run plays whose rusher is Bre.Hall
bills_jets_run_plays.loc[bills_jets_run_plays['Rusher'] == 'Bre.Hall', 'Yardage'].sum()

130.0

In [None]:
# Distinct 'Yardage' values across the whole cleaned frame (NaN for rows where no Yardage was written)
df['Yardage'].unique()

array([nan,  3.,  2.,  5., 14.,  8., -5.,  6., 12., -1., 13.,  7.,  4.,
       26., -4., 83.,  1., 10.,  0.,  9., -3., -2., 11., -7., 22., 17.,
       19., 16.])