<a href="https://colab.research.google.com/github/Keoni808/NFL_Data_Cleaning/blob/main/NFL_Plays_Week1_2023.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

PURPOSE:
- Correctly clean a one-week sample of plays
  - Season 2023 -> Week 1

CONCERNS FOR LATER:
- Players with the same name
  - Goal for now: use the fewest indicators/features needed to tell apart players who share the same name.
    - Maybe there is a simpler way.

# MOUNTING AND IMPORTS

In [None]:
# Mount your Google Drive at '/content/drive' using the Colab 'drive' helper
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Used to access personal google cloud services
# - Authenticates the current Colab user, then prints a confirmation once the call returns
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

Authenticated


In [None]:
# Imports

# Data manipulation
import pandas as pd

# Regular expressions
import re

# Grab data from database
from google.cloud import bigquery

In [None]:
# # debugger (maybe use in the future)
# %pdb on

# LOADING DATA (BigQuery queries)

In [None]:
# Connect a BigQuery client to the 'nfl-data-430702' project
# - This client is used to submit every query in this notebook
client = bigquery.Client('nfl-data-430702')

## Season 2023 Week 1

In [None]:
# Grabbing all plays from the 2023 NFL season, Week 1
week1_2023_plays_query = """
                         SELECT *
                         FROM `nfl-data-430702.NFL_Scores.NFL-Plays-Week1_2023`
                         """

# Running a pseudo (dry run) query, which reports the number of bytes the real query will process
dry_run_config = bigquery.QueryJobConfig(dry_run=True)
dry_run_query = client.query(week1_2023_plays_query, job_config=dry_run_config)
print("This query will process {} bytes.".format(dry_run_query.total_bytes_processed))

# Running the real query (being mindful of the amount of data being grabbed)
# - 'maximum_bytes_billed' caps the job at 10**9 bytes (one gigabyte)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**9)
safe_config_query = client.query(week1_2023_plays_query, job_config=safe_config)

This query will process 570194 bytes.


In [None]:
# Putting the data obtained from the byte-capped query job ('safe_config_query') into a dataframe
week1_2023_plays = safe_config_query.to_dataframe()

In [None]:
# Quick preview of the first rows of the raw plays dataframe
week1_2023_plays.head()

Unnamed: 0,Season,Week,Day,Date,AwayTeam,HomeTeam,Quarter,DriveNumber,TeamWithPossession,IsScoringDrive,PlayNumberInDrive,IsScoringPlay,PlayOutcome,PlayDescription,PlayStart
0,2023,Week 1,MON,09/11,Bills,Jets,1ST QUARTER,1,BUF,0,1,0,Kickoff,G.Zuerlein kicks 65 yards from NYJ 35 to end z...,Kickoff from NYJ 35
1,2023,Week 1,MON,09/11,Bills,Jets,1ST QUARTER,1,BUF,0,2,0,7 Yard Pass,(15:00) (Shotgun) J.Allen pass short right to ...,1st & 10 at BUF 25
2,2023,Week 1,MON,09/11,Bills,Jets,1ST QUARTER,1,BUF,0,3,0,5 Yard Pass,"(14:34) (No Huddle, Shotgun) J.Allen pass shor...",2nd & 3 at BUF 32
3,2023,Week 1,MON,09/11,Bills,Jets,1ST QUARTER,1,BUF,0,4,0,3 Yard Run,(14:01) J.Cook up the middle to BUF 40 for 3 y...,1st & 10 at BUF 37
4,2023,Week 1,MON,09/11,Bills,Jets,1ST QUARTER,1,BUF,0,5,0,2 Yard Run,(13:24) (Shotgun) J.Cook up the middle to BUF ...,2nd & 7 at BUF 40


In [None]:
# Noting the original size of the raw, uncleaned dataframe
# - (rows, columns)
# - Useful later as the reference total when checking that every play was categorized
week1_2023_plays.shape

(2600, 15)

# CATEGORIZE PLAYS
- The goal here is to parse out the different values for 'PlayOutcome'
  - This is where I will separate different types of plays
    - ( pass / run / kickoff / etc. )

In [None]:
# Maybe try fuzzy string matching (fuzzywuzzy) on these in the future?

# All distinct 'PlayOutcome' values found in the Week 1 plays
# - From here we can categorize and clean plays accordingly
week1_2023_plays['PlayOutcome'].unique()

array(['Kickoff', '7 Yard Pass', '5 Yard Pass', '3 Yard Run',
       '2 Yard Run', 'Pass Incomplete', 'Punt', '-5 Yard Penalty',
       '5 Yard Run', '1 Yard Pass', '14 Yard Run', '3 Yard Pass',
       '8 Yard Run', '6 Yard Pass', '15 Yard Pass', '-9 Yard Sack',
       '4 Yard Pass', '13 Yard Pass', 'Field Goal', '-2 Yard Sack',
       'Interception', '-5 Yard Run', '18 Yard Pass', '8 Yard Pass',
       '6 Yard Run', '12 Yard Run', '-1 Yard Run', '26 Yard Pass',
       'Touchdown Bills', 'Extra Point Good', '13 Yard Run',
       '-3 Yard Sack', '7 Yard Run', '9 Yard Pass', '4 Yard Run',
       'Fumble', '-10 Yard Penalty', '10 Yard Pass', '26 Yard Run',
       '5 Yard Penalty', '-10 Yard Sack', '22 Yard Pass', '-4 Yard Run',
       '-12 Yard Sack', '83 Yard Run', '1 Yard Run', '2 Yard Pass',
       '10 Yard Run', 'Run for No Gain', '12 Yard Pass', '20 Yard Pass',
       '9 Yard Run', '-2 Yard Pass', 'Sack', '24 Yard Pass',
       '14 Yard Pass', 'Touchdown Jets', '-3 Yard Run', '-2 Yar

In [None]:
# NOTE:
# There are more play types that I have not handled yet for Week 1.

# Eyeballing all unique play outcomes in order to categorize them.
# - This approach does not feel very flexible because a play outcome can
#   arise that has not been seen yet.
# - There may be more in the future when working on a full season, let alone all seasons and future games

# Substring matching on 'PlayOutcome':
# - 'Pass' matches e.g. '7 Yard Pass' and 'Pass Incomplete'
# - 'Run'  matches e.g. '3 Yard Run' and 'Run for No Gain'
df_2023_pass_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Pass')]
df_2023_run_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Run')]

# Remaining play types (not categorized yet):
# df_2023_punt_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Punt')]
# df_2023_sack_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Sack')]
# df_2023_kickoff_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Kickoff')]
# df_2023_fumble_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Fumble')]
# df_2023_interception_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Interception')]
# df_2023_penalty_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Penalty')]
# df_2023_fieldgoal_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Field Goal')]
# df_2023_touchdown_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Touchdown')]
# df_2023_extrapoint_week1 = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Extra Point')]

# NOTE(review): the names below end in '_sb' while the dataframes above end in
#               '_week1' - rename before uncommenting, otherwise this raises a NameError.
# plays_list = [df_2023_pass_sb,
#               df_2023_run_sb,
#               df_2023_punt_sb,
#               df_2023_sack_sb,
#               df_2023_kickoff_sb,
#               df_2023_fumble_sb,
#               df_2023_interception_sb,
#               df_2023_penalty_sb,
#               df_2023_fieldgoal_sb,
#               df_2023_touchdown_sb,
#               df_2023_extrapoint_sb]

## SANITY CHECK (All Plays Accounted for)
- NOT COMPLETE
  - Still need to grab the other play types
    - Once all plays have been categorized, the sum of the category sizes will be compared to the size of the original dataframe of plays

In [None]:
# Empty for now.

# HELPER METHODS (personal use)
- For personal use, does not actually take part in cleaning dataset at all.

In [None]:
# PURPOSE:
# - Quick look at a section of plays
#   - Ideally the plays that the user wants to break down and clean.
# INPUT PARAMETERS:
# df_all_plays      - DataFrame - The original dataframe where the desired plays to view came from
# df_section_plays  - DataFrame - A section of the original dataframe the user wants to view
#                                 (its index labels must also exist in 'df_all_plays')
# RETURN:
# - Nothing is returned. Printing to the console, for every play in the section:
#   1. index of play
#   2. 'PlayDescription' feature of play (one sentence per line)
#   3. 'PlayOutcome' feature of play
def print_plays(df_all_plays, df_section_plays):
  for idx, outcome in df_section_plays['PlayOutcome'].items():
    # 'idx' is an index LABEL (that is what .items() yields), so the matching
    # description has to be looked up by label (.loc) and not by position (.iloc).
    # Both agree on a default 0..n-1 index, but .iloc grabs the wrong row (or
    # raises an IndexError) as soon as the index is anything else.
    description = df_all_plays['PlayDescription'].loc[idx]
    print("index:" + str(idx))
    for sentence in description.split(". "):
      print(sentence)
    print(outcome)
    print()

# PIPELINE
  - ORDER
    1. Regular expressions
      - Used to find common patterns within raw data
    2. Cleaning methods
      - Unique cleaning methods for each play type
        - Some methods may include helper methods
    3. Main pipeline method
      - Control flow of cleaning methods



## 1. REGULAR EXPRESSIONS

In [None]:
####################################################
# REGULAR EXPRESSIONS USED TO LOCATE SPECIFIC DATA #
####################################################

# NOTE:
# - Every pattern below is a raw string (r"..." / rf"...").
#   - Regex escapes such as '\(' or '\.' are not valid Python string escapes, so
#     writing them in a normal (non-raw) string triggers an invalid escape
#     sequence warning. Raw strings hand the backslashes to 're' untouched.
#   - The resulting pattern text is exactly the same as before.

################
# PLAY DETAILS #
################

time_on_clock_pattern = r'\((\d*:\d+)\)'                 # e.g. "(15:00)" -> "15:00"
formation = r'\(([A-Za-z]+ ?[A-Za-z]*,? ?[A-Za-z]*)\)'   # e.g. "(Shotgun)" or "(No Huddle, Shotgun)"
yardage_gained = r'for -?[0-9]+ yards?'                  # e.g. "for 7 yards" / "for -3 yards" / "for 1 yard"

#################
# NAMES OFFENSE #
#################

# Names look like "J.Allen"; hyphenated parts on either side of the "." are allowed.
name_pattern = r"(?:[A-Za-z]+-)*[A-Za-z]+\.[A-Za-z]+(?:-[A-Za-z]+)*"
passer_name_pattern = rf"({name_pattern}) pass" # All passers are exclusively followed by ' pass'
receiver_name_pattern = rf"to ({name_pattern})" # All receivers exclusively follow 'to '
rusher_pattern = rf"({name_pattern}) " # Runningbacks, like quarterbacks, are the first names in play descriptions

#################
# NAMES DEFENSE #
#################

defense_tackler_1_name_pattern = rf"\(({name_pattern})" # Will have a "(" in front of the name
defense_tackler_2_name_pattern = rf" ({name_pattern})\)" # Will have a ")" at the end of the name
# MIGHT NEED TO CHANGE:
# - I think it might be possible for multiple defenders to apply pressure to the passer.
defense_pressure_name_pattern = rf"\[({name_pattern})\]" # Surrounded by "[]" brackets

#######################
# PATTERNS ON FUMBLES #
#######################

# Passer fumbles are always the initial action on the play, so the sentence will
# have the time displayed before the action (and possibly the formation too).
# - That is why the pattern starts with a space and is anchored to the end of the sentence.
qb_fumble = rf" {name_pattern} to [A-Z]+ [0-9]+ for -?[0-9]+ yards$"
# Yardage after recovery (formatted almost exactly like a regular run play)
# - Anchored to the start of the sentence: the sentence begins with the player's name.
run_after_recovery = rf"^{name_pattern} to [A-Z]+ [0-9]+ for "

##############
#  INJURIES  #
##############

injury = rf"[A-Z]+-({name_pattern}) was injured during the play" # Returns the player(s) who got injured during the play

## 2. CLEANING METHODS

#### pass helper method (Fumbles)


In [None]:
# FOR PASSING ONLY RIGHT NOW
# - A possible goal down the road is to create a single method that can handle
#   all fumble situations, whether it be a running fumble or a passing fumble.

# PURPOSE:
# - Extract fumble data from fumbled plays.
#   - The goal is to strictly grab data that can only appear during fumbled plays,
#     while attempting to push all commonly formated play type data to main cleaning methods.

# NOTE:
# - It is common for a single fumbled play row to be divided into multiple rows.
#   - For example, an intended play has been fumbled and a player recovers the fumble for a touchdown.
#     - This will be split into 2 separate rows: (1) the intended play row and (2) the fumble recovery row.
#   - The concern here is making sure those rows within the main dataframe of
#     plays are tied together in some way, to signify that the multiple rows
#     are not different plays but all instances of the same.
#     - A solution here could be the features the multi play rows share.
#       - For example, (TimeOnTheClock, Week, Quarter, DriveNumber, PlayNumberInDriver, etc..)

#####################################################
# ROUGH DESIGN OF SINGLE ROW PLAY -> MULTI ROW PLAY #
#####################################################

# - SINGLE PLAY ROW TO SINGLE PLAY ROW(S) METHOD:
#   1. Split play into appropriate divisions (e.g. 1 row -> 3 rows)
#      a. (row 1) - passer fumble
#      b. (row 2) - passing play
#      c. (row 3) - recovery for yardage
#      NOTE:
#      - These are all instances that call for a split
#      - This will always be the chronological order
#        - Any row out of these can be missing depending on the play.
#   2. Clean each row individually
#      1. Transform data into individual single row dataframes
#      2. Run each row through appropriate cleaning method (e.g. passing, running, ...)
#   3. Organize rows chronologically
#      1. Create single dataframe containing all individual rows

# - REPLACING PLAY WITHIN MAIN DATAFRAME:
#   1. return single play multi row dataframe(?)
#      -> MAIN CLEANING METHOD:
#         1. replace original play row with new single play multi rows
#            1. identify index of original play
#            2. break main dataframe in 2 pieces
#               a. Dataframe 1 - dataframe before index (exclusive)
#               b. Dataframe 2 - dataframe after index (exclusive)
#            3. concat new dataframe (Dataframe 1 +
#                                     single play multi row dataframe +
#                                     Dataframe 2)
#         2. rerun main cleaning method (recursion)
#            - manually insert index after last added row to pick up where it left off
#            - exit case will be when the last passing type play has been cleaned

##########################
# EXAMPLE PLAY BREAKDOWN #
##########################

# PLAY (WITH NOTES):
# (14:21) J.Love to CHI 44 for -3 yards <- signal for an additional row needed (passer fumble: grabbing passer name and yardage)
# FUMBLES, and recovers at CHI 46 <- added to play feature 'FumbleDetails'
# J.Love pass deep left to L.Musgrave to CHI 4 for 37 yards (T.Stevenson) [D.Walker]. <- pass to main breakdown method (follows traditional passing play format)
# NOTE:
# - If the fumble was to be recovered and ran for yardage, that would also call for an additional row needed.
# EXAMPLE:
# (4:45) (Shotgun) D.Jones pass short left to M.Breida to NYG 43 for 5 yards (M.Bell)
# FUMBLES (M.Bell), recovered by NYG-P.Campbell at NYG 35
# P.Campbell to NYG 33 for -2 yards <- signal for an additional row needed (fumble recovery for yards: grabbing player who recovered and yardage)
# Officially, a pass for -3 yards.

def extract_fumble_data_pass(df_plays, play, play_index):
  """Pull the fumble-only information out of a fumbled passing play.

  INPUT PARAMETERS:
  df_plays   - dataframe - dataframe of plays
  play       -  string   - 'PlayDescription' text of the fumbled play
  play_index -  label    - index label of the fumbled play within 'df_plays'

  RETURN (TUPLE of 3):
  - When the play does NOT need to be split into several rows:
      (fumble details, sentences for the main cleaning method, empty dataframe)
      - fumble details: list with one slot per sentence of 'play'; the slot holds
        the sentence when it contains 'FUMBLES', otherwise None.
      - sentences for the main cleaning method: list of the remaining sentences.
  - When the play contains a passer fumble and/or a recovery for yardage:
      (None, None, dataframe of the rows the play was split into)
      - Row order: passer fumble row(s), intended pass row, recovery row(s).
      - Every row carries the same 'FumbleDetails' list and the original 'PlayOutcome'.
  """

  # Separating each sentence within play (each sentence represents a single action)
  play_elements = play.split(". ")
  # Collecting fumble data in the exact order in which it happened.
  extracted_fumble_details = [None] * len(play_elements)
  back_to_main_cleaning_method = []

  # lists to collect distinct actions that will become their own rows
  passer_fumble = []
  fumble_recovery = []

  # 'play_index' is an index LABEL (the caller gets it from .items() and it is also
  # used with .at), so the row is fetched by label with .loc instead of .iloc.
  original_row = df_plays.loc[play_index]
  original_outcome = df_plays.at[play_index, 'PlayOutcome']

  # Builds a single row dataframe for one sentence and cleans it as a run play.
  def _clean_sentence_as_run(sentence):
    new_row = original_row.copy()
    new_row['PlayDescription'] = sentence
    new_row = pd.DataFrame([new_row], columns=df_plays.columns)
    new_row['PlayOutcome'] = 'Run' # <- This is ugly. Without this, 'clean_run_plays' will not clean.
    cleaned_row = clean_run_plays(new_row)
    # Switching 'PlayOutcome' back to its shared value with the rest of the
    # grouped rows representing the play.
    cleaned_row['PlayOutcome'] = original_outcome
    return cleaned_row

  for position, sentence in enumerate(play_elements):
    # Assume the sentence is going back to the main cleaning method until one of
    # the checks below claims it. (A flag is used so a sentence can never be
    # removed twice from the list.)
    goes_back_to_main = True

    # Passer fumble -> becomes its own row, cleaned like a run play
    # - QUESTION: Should 'PlayType' remain as 'pass' or should it be something else..?
    #   - For now it will be 'run'.
    # - POTENTIAL ERROR:
    #   - qb_fumble I believe will not pick up REVERSED plays that initially start with a qb fumble.
    if len(re.findall(qb_fumble, sentence)) > 0:
      goes_back_to_main = False
      passer_fumble.append(_clean_sentence_as_run(sentence))

    # Fumble sentences to (fumble details), stored at the position they occurred
    if 'FUMBLES' in sentence:
      goes_back_to_main = False
      extracted_fumble_details[position] = sentence

    # Recovery for yardage -> becomes its own row, cleaned like a run play
    if len(re.findall(run_after_recovery, sentence)) > 0:
      goes_back_to_main = False
      fumble_recovery.append(_clean_sentence_as_run(sentence))

    if goes_back_to_main:
      back_to_main_cleaning_method.append(sentence)

  ##################################################
  # COMBINING ROWS FOR PLAYS THAT REQUIRE MULTIPLE #
  ##################################################

  # Check to see if additional rows are needed (i.e. if there are any elements within these 2 lists)
  if len(passer_fumble) + len(fumble_recovery) > 0:
    # Creating and cleaning row for intended play
    # - Cleaning all data that was going to be sent back to the main cleaning method
    main_play_row = original_row.copy()
    main_play_row['PlayDescription'] = '. '.join(back_to_main_cleaning_method)
    main_play_row = pd.DataFrame([main_play_row], columns=df_plays.columns)
    cleaned_main_play_row = clean_pass_plays(main_play_row)

    # Organize rows chronologically
    # 1. (row 1) - passer fumble
    # 2. (row 2) - passing play
    # 3. (row 3) - recovery for yardage
    multi_row_play = passer_fumble + [cleaned_main_play_row] + fumble_recovery

    # Add the single play's 'FumbleDetails' to each row
    # - 'multi_row_play' is a list full of single row dataframes, so each one has exactly one index.
    for single_row in multi_row_play:
      single_row.at[single_row.index[0], 'FumbleDetails'] = extracted_fumble_details

    # Combining each row, all pieces of a single play, into one dataframe
    # (one concat call; there are always at least 2 rows at this point)
    df_split_single_play = pd.concat(multi_row_play, ignore_index=True)
    return None, None, df_split_single_play

  # returning empty dataframe because there will be zero additional rows added
  return extracted_fumble_details, back_to_main_cleaning_method, pd.DataFrame()

### PASS PLAYS

In [None]:
# PURPOSE:
# - Clean all passing type plays within a given dataframe.
# INPUT PARAMETERS:
# df_plays    - dataframe - NFL plays (can include play types other than passing)
#                           NOTE: the new features are written onto this dataframe itself.
# index_start -  integer  - index of the pass play where the method will start
#                           cleaning in ascending order. None (default) cleans every pass play.
#                           - Must be the index of a row whose 'PlayOutcome' contains 'Pass',
#                             otherwise a ValueError is raised.
# RETURN:
# df_plays - dataframe - the same df_plays input but with all passing play types cleaned
#                        - When a fumbled play had to be split into several rows, a NEW
#                          dataframe (with a fresh 0..n-1 index) is returned instead.
#                        - Returned unchanged when there are no pass plays to clean.

def clean_pass_plays(df_plays, index_start = None):

  # Locating all passing type plays within dataframe
  df_pass_plays = df_plays[df_plays['PlayOutcome'].str.contains('Pass')]

  if index_start is not None:
    # All rows within 'df_pass_plays' still have their original indexes from 'df_plays'
    df_pass_plays = df_pass_plays.iloc[df_pass_plays.index.tolist().index(index_start):]

  for idx, play in df_pass_plays['PlayDescription'].items():

    ################
    # Play details #
    ################

    # Play Type
    df_plays.loc[idx, 'PlayType'] = 'Pass'

    # TimeOnTheClock
    TimeOnTheClock = re.findall(time_on_clock_pattern, play)
    if len(TimeOnTheClock) > 0:
      df_plays.loc[idx, 'TimeOnTheClock'] = TimeOnTheClock[0]

    ############
    # REVERSES #
    ############

    # In 'PlayDescription' all information before the "reversed" sentence is not needed.
    # - All information before is stored within 'ReverseDetails' and the remaining is cleaned.
    if 'REVERSED' in play:
      play_elements = play.split(". ")
      for position, sentence in enumerate(play_elements):
        if 'REVERSED' in sentence:
          df_plays.at[idx, 'ReverseDetails'] = play_elements[:position + 1]
          play = ". ".join(play_elements[position + 1:])
          break

    ############################
    # REPORTING IN AS ELIGIBLE #
    ############################

    # I do not think this contains any useful data so I am going to exclude it.
    if 'reported in as eligible' in play:
      play_elements = play.split(". ")
      for position, sentence in enumerate(play_elements):
        if 'reported in as eligible' in sentence:
          play = ". ".join(play_elements[position + 1:])
          break

    ###########
    # FUMBLES #
    ###########

    # Additional rows may be added after certain types of fumbled passing plays.
    # - The idea here is that, in those situations, the helping method 'extract_fumble_data_pass'
    #   will return a small dataframe of the rows that the single play split into.
    #   - When this small dataframe is returned, it will need to replace the original play
    #     within the main dataframe of plays and then continue on cleaning the rest of the passing plays.

    if 'FUMBLES' in play:
      fumble_details, play, df_added_rows = extract_fumble_data_pass(df_plays, play, idx)
      if not df_added_rows.empty:
        # 'idx' is an index label; translate it to a position before slicing with .iloc
        position_of_play = df_plays.index.get_loc(idx)
        df_before = df_plays.iloc[:position_of_play]
        df_after = df_plays.iloc[position_of_play + 1:]
        df_plays = pd.concat([df_before, df_added_rows, df_after], ignore_index=True)
        # After 'ignore_index=True' the index labels are the row positions again
        index_of_last_added_row = position_of_play + len(df_added_rows) - 1
        # Pick up where it left off: the first pass play AFTER the last added row.
        # - The added rows are already cleaned by the helper method. Restarting ON the
        #   last added row would clean it a second time as a pass play, which
        #   overwrites the 'PlayType' of a fumble recovery row that was cleaned as a run.
        remaining_pass_indexes = df_plays[df_plays['PlayOutcome'].str.contains('Pass')].index
        remaining_pass_indexes = remaining_pass_indexes[remaining_pass_indexes > index_of_last_added_row]
        if len(remaining_pass_indexes) == 0:
          # exit case: the last passing type play has been cleaned
          return df_plays
        return clean_pass_plays(df_plays, remaining_pass_indexes[0])

      df_plays.at[idx, 'FumbleDetails'] = fumble_details
      # 'play' came back as a list of sentences; rebuild the description text
      play = ". ".join(play)

    ###########
    # OFFENSE #
    ###########

    # NOTE:
    # - Incomplete passes will have 'PlayOutcome' as 'Pass Incomplete' as well
    #   as yardage value being 0.0

    # Yardage gained ("for -3 yards" -> -3)
    yardage = re.findall(yardage_gained, play)
    if len(yardage) > 0:
      df_plays.loc[idx, 'Yardage'] = int(yardage[0].split()[1])
    else:
      df_plays.loc[idx, 'Yardage'] = 0

    # Formation ('Aborted' shows up in the same "(...)" format but is not a formation)
    Formation = re.findall(formation, play)
    if len(Formation) > 0 and Formation[0] != 'Aborted':
      df_plays.loc[idx, 'Formation'] = Formation[0]

    # Passer (What about spikes?)
    passer_name = re.findall(passer_name_pattern, play)
    if len(passer_name) > 0:
      df_plays.loc[idx, 'Passer'] = passer_name[0]

    # Pass Type
    if 'deep' in play:
      df_plays.loc[idx, 'PassType'] = 'Deep'
    elif 'short' in play:
      df_plays.loc[idx, 'PassType'] = 'Short'

    # Pass Direction
    if 'left' in play:
      df_plays.loc[idx, 'Direction'] = 'Left'
    elif 'right' in play:
      df_plays.loc[idx, 'Direction'] = 'Right'
    elif 'middle' in play:
      df_plays.loc[idx, 'Direction'] = 'Middle'

    # Unique situation (offense spikes the ball)
    if 'spike' in play:
      df_plays.loc[idx, 'PassType'] = 'Spike'
      # The first name in the description is taken as the passer.
      # (Guarded so a description without any name does not raise an IndexError.)
      names_in_play = re.findall(name_pattern, play)
      if len(names_in_play) > 0:
        df_plays.loc[idx, 'Passer'] = names_in_play[0]

    # Receiver
    receiver_names = re.findall(receiver_name_pattern, play)
    if len(receiver_names) > 0:
      df_plays.loc[idx, 'Receiver'] = receiver_names[0]

    #############
    #  DEFENSE  #
    #############

    # Difference between ", " and "; " separating tacklers
    # ', ' - both defenders worked together to make the tackle
    # "; " - first defender initiated hit and second finished
    # - Should I mark the differences?

    tackler_1 = re.findall(defense_tackler_1_name_pattern, play) # tackler #1 (Could be solo or the one who initiated the hit)
    if len(tackler_1) > 0:
      df_plays.loc[idx, 'TackleBy1'] = tackler_1[0]

    tackler_2 = re.findall(defense_tackler_2_name_pattern, play) # tackler #2 (equally contributed or assisted with tackle)
    if len(tackler_2) > 0:
      df_plays.loc[idx, 'TackleBy2'] = tackler_2[0]

    pressure_by = re.findall(defense_pressure_name_pattern, play) # defender who applied pressure to the passer
    if len(pressure_by) > 0:
      df_plays.loc[idx, 'PressureBy'] = pressure_by[0]

    ##############
    #  INJURIES  #
    ##############

    injuries = re.findall(injury, play)
    if len(injuries) > 0:
      df_plays.at[idx, 'InjuredPlayers'] = injuries

    #############
    #  PENALTY  #
    #############

    if 'PENALTY' in play:
      penalties = [sentence for sentence in play.split(". ") if 'PENALTY' in sentence]
      df_plays.at[idx, 'PenaltyDescription'] = penalties

  # Every pass play has been cleaned (or there were none to clean).
  # - Always return the dataframe: the previous check against the last pass index
  #   raised an IndexError when the dataframe had no pass plays.
  return df_plays

### run helper method (Fumbles)
- Goal might be to combine both pass and run helper methods for fumbles

In [None]:
# Version 2

# PURPOSE:
# - Extract fumble details and push back data from fumbled plays that can be broken
#   down by the main play cleaning method.
# INPUT PARAMETERS:
# df_plays   - dataframe - dataframe of plays
# play       -  string   - 'PlayDescription' of play that contains a fumble
# play_index -  integer  - index (label) of the fumbled play within 'df_plays'
# RETURN (TUPLE of 3):
# extracted_fumble_details -   list    - one slot per sentence of the play; holds the sentence
#                                        when it contains 'Aborted' or 'FUMBLES', otherwise None.
#                                        These details are of less importance.
#                                        - The reason for this is to save space. It does not
#                                          make sense to have features for this data when
#                                          1/100 plays will contain a fumble.
# push_back_to_pipeline    -   list    - All details of the fumbled play that can be broken
#                                        down by the main play cleaning method.
# df_added_rows            - dataframe - The rows the single play has been split into
#                                        (row 1: running play, following rows: recovery for yardage).
#                                        - Only filled for yardage gained during fumble recoveries.
#                                          In that case the first two elements are returned as None.
#                                        - Otherwise an empty dataframe.

# NOTE (previously marked "ERROR FOUND"):
# - For fumbled plays that require the addition of rows, the bundle of rows
#   representing a single play need to have the same 'FumbleDetails' in them.
#   - The loop at the end of this method writes the same list onto every row.
#   PLAN FOR ADJUSTMENT:
#   - I am taking a new approach for cleaning fumbled pass type plays
#     and I am thinking of switching this approach for fumbled run type plays
#     to match. When the time comes, I will make adjustments then.

def extract_fumble_data_run(df_plays, play, play_index):

  # 'PlayDescription' is made up of a group of sentences, each containing individual actions of the play.
  play_elements = play.split(". ")
  extracted_fumble_details = [None] * len(play_elements)
  push_back_to_pipeline = []

  # To collect distinct actions that will become their own rows
  df_fumble_recovery = []

  # 'play_index' is an index LABEL (the caller gets it from .items()), so the row
  # is fetched by label with .loc instead of by position with .iloc.
  original_row = df_plays.loc[play_index]

  # iterating through each sentence within 'PlayDescription'
  for position, sentence in enumerate(play_elements):

    # Aborted sentences to both (fumble details & main cleaning method)
    if 'Aborted' in sentence:
      extracted_fumble_details[position] = sentence
      push_back_to_pipeline.append(sentence)
      continue

    # Assume the sentence goes back to the main cleaning method until claimed below
    goes_back_to_pipeline = True

    # Fumble sentences to (fumble details)
    if 'FUMBLES' in sentence:
      goes_back_to_pipeline = False
      extracted_fumble_details[position] = sentence

    # This checks for plays after a fumble recovery. When this occurs, the current play description
    # will be split into 2. Each given their own row to be broken down separately.
    # 1. before fumble recovery (includes fumbled action)
    # 2. after fumble recovery (excludes fumble action)
    # NOTE:
    # - Could potentially have multiple fumble recoveries in a single play.
    if len(re.findall(run_after_recovery, sentence)) > 0:
      goes_back_to_pipeline = False
      # creating and cleaning the added row
      recovery_for_yardage_row = original_row.copy()
      recovery_for_yardage_row['PlayDescription'] = sentence
      recovery_for_yardage_row = pd.DataFrame([recovery_for_yardage_row], columns=df_plays.columns)
      df_fumble_recovery.append(clean_run_plays(recovery_for_yardage_row))

    if goes_back_to_pipeline:
      push_back_to_pipeline.append(sentence)

  # Check to see if additional rows are to be added
  if len(df_fumble_recovery) > 0:
    # Creating and cleaning row for intended play
    main_play_row = original_row.copy()
    main_play_row['PlayDescription'] = '. '.join(push_back_to_pipeline)
    main_play_row = pd.DataFrame([main_play_row], columns=df_plays.columns)
    cleaned_main_play_row = clean_run_plays(main_play_row)

    # Organize rows chronologically
    # 1. (row 1) - running play
    # 2. (row 2) - recovery for yardage
    # - The CLEANED main row is used (same as the pass helper method), so the
    #   result does not depend on 'clean_run_plays' editing its input in place.
    multi_row_play = [cleaned_main_play_row] + df_fumble_recovery

    # Every row of the split play shares the same 'FumbleDetails'
    # - Each element is a single row dataframe, so it has exactly one index.
    for single_row in multi_row_play:
      single_row.at[single_row.index[0], 'FumbleDetails'] = extracted_fumble_details

    # One concat call (there are always at least 2 rows at this point)
    df_split_single_play = pd.concat(multi_row_play, ignore_index=True)

    return None, None, df_split_single_play

  # No additional rows: hand an empty dataframe back
  return extracted_fumble_details, push_back_to_pipeline, pd.DataFrame()

### RUN PLAYS

In [None]:
# Version 2

# PURPOSE:
# - Clean run type plays
# INPUT PARAMETERS:
# df_plays    - dataframe - dataframe of plays
# index_start -  integer  - the starting index of the associated input dataframe
#                           to begin cleaning. (Needs to be the index of a run play)
# RETURN:
# df_plays - dataframe - dataframe of plays that now has all useful run play
#                        data accessable and clean.

def clean_run_plays(df_plays, index_start = None):
  """Fill in the run-play feature columns of 'df_plays' from each 'PlayDescription'.

  A row counts as a run play when its 'PlayOutcome' contains 'Run'. The feature
  columns ('PlayType', 'TimeOnTheClock', 'Yardage', 'Formation', 'Rusher',
  'Direction', 'TackleBy1', 'TackleBy2', 'InjuredPlayers', 'PenaltyDescription',
  'FumbleDetails', 'ReverseDetails') are written with .loc/.at directly on
  'df_plays', so the dataframe passed in is modified in place.

  When extract_fumble_data_run reports that one play has to become several rows,
  a new dataframe is assembled with those rows spliced in (index renumbered from
  0) and cleaning restarts recursively at the last added row. In that case the
  returned dataframe is a different object with more rows than the input.

  INPUT PARAMETERS:
  df_plays    - dataframe - dataframe of plays
  index_start -  integer  - index label of the run play to start cleaning from.
                            None (default) cleans every run play.
                            ValueError is raised if the label is not a run play.
  RETURN:
  df_plays - dataframe - the plays with all run play data extracted. Returned
                         unchanged when there are no run plays to clean.
  """

  # All run plays within 'df_plays'.
  # na=False: a row whose 'PlayOutcome' is missing is simply not a run play
  # (without it the mask would contain NaN and the row selection would fail).
  df_run_plays = df_plays[df_plays['PlayOutcome'].str.contains('Run', na=False)]

  # Cut 'df_run_plays' to begin from 'index_start' to the last run play available in dataframe
  if index_start is not None:
    df_run_plays = df_run_plays.iloc[df_run_plays.index.tolist().index(index_start):]

  # Words that end the "direction" part of a run description.
  # They are matched as whole words so that a name containing one of them
  # (e.g. "end" inside "T.Hendrickson") is not mistaken for a direction.
  rushing_directions = ['guard', 'middle', 'tackle', 'end', 'kneels']
  direction_pattern = re.compile(r'\b(?:' + '|'.join(rushing_directions) + r')\b')

  # Iterating through every run play within 'df_run_plays'
  for idx, play in df_run_plays['PlayDescription'].items():

    ################
    # Play details #
    ################

    # Play Type
    df_plays.loc[idx, 'PlayType'] = 'Run'

    # TimeOnTheClock
    clock_matches = re.findall(time_on_clock_pattern, play)
    if len(clock_matches) > 0:
      df_plays.loc[idx, 'TimeOnTheClock'] = clock_matches[0]

    ############
    # REVERSES #
    ############

    # In 'PlayDescription' all information before the "reversed" sentence is not needed.
    # - Everything up to and including that sentence is stored within 'ReverseDetails'
    #   and only the remaining text is cleaned.
    if 'REVERSED' in play:
      play_elements = play.split(". ")
      for position, sentence in enumerate(play_elements):
        if 'REVERSED' in sentence:
          df_plays.at[idx, 'ReverseDetails'] = play_elements[:position + 1]
          play = ". ".join(play_elements[position + 1:])
          break

    ############################
    # REPORTING IN AS ELIGIBLE #
    ############################

    # The eligibility report (and anything before it) is dropped from the text to clean.
    if 'reported in as eligible' in play:
      play_elements = play.split(". ")
      for position, sentence in enumerate(play_elements):
        if 'reported in as eligible' in sentence:
          play = ". ".join(play_elements[position + 1:])
          break

    ###########
    # FUMBLES #
    ###########

    if 'FUMBLES' in play:
      fumble_details, play, df_added_rows = extract_fumble_data_run(df_plays, play, idx)
      if not df_added_rows.empty:
        # The single play is replaced by the rows returned for it.
        # NOTE(review): 'idx' is used as a position here, which assumes the
        # index of 'df_plays' is 0..n-1 - confirm for every caller.
        df_before = df_plays.iloc[:idx]
        df_after = df_plays.iloc[idx+1:]
        df_plays = pd.concat([df_before, df_added_rows, df_after], ignore_index=True)
        # Rows after the insertion have shifted, so restart on the new dataframe
        # from the last row that was just added.
        index_of_last_added_row = idx + len(df_added_rows) - 1
        return clean_run_plays(df_plays, index_of_last_added_row)

      df_plays.at[idx, 'FumbleDetails'] = fumble_details
      # 'play' came back as a list of sentences; rebuild the text that is left to clean
      play = ". ".join(play)

    # Yardage gained (0 when the description states no yardage)
    yardage = re.findall(yardage_gained, play)
    if len(yardage) > 0:
      df_plays.loc[idx, 'Yardage'] = int(yardage[0].split()[1])
    else:
      df_plays.loc[idx, 'Yardage'] = 0

    #############
    #  OFFENSE  #
    #############

    # Formation ('Aborted' is not a formation, so it is not recorded)
    formations = re.findall(formation, play)
    if len(formations) > 0 and formations[0] != 'Aborted':
      df_plays.loc[idx, 'Formation'] = formations[0]

    # Rusher
    rusher_names = re.findall(rusher_pattern, play) # May grab name(s) bc regular expression. (Only want rusher)
    if len(rusher_names) > 0:
      rusher_name = rusher_names[0]
      df_plays.loc[idx, 'Rusher'] = rusher_name

      # Direction: the text between the rusher's name and the first direction
      # word that follows it (e.g. "left end", "scrambles right end", "kneels").
      after_rusher = play[play.find(rusher_name) + len(rusher_name) + 1:]
      direction_match = direction_pattern.search(after_rusher)
      if direction_match is not None:
        df_plays.loc[idx, 'Direction'] = after_rusher[:direction_match.end()]
    # No rusher found: 'Rusher' and 'Direction' are left as they were, like the
    # other optional fields, instead of aborting the whole cleaning run.

    #############
    #  DEFENSE  #
    #############

    tackler_1 = re.findall(defense_tackler_1_name_pattern, play) # tackler #1 (Could be solo or the one who initiated the hit)
    if len(tackler_1) > 0:
      df_plays.loc[idx, 'TackleBy1'] = tackler_1[0]
    tackler_2 = re.findall(defense_tackler_2_name_pattern, play) # tackler #2 (equally contributed or assisted with tackle)
    if len(tackler_2) > 0:
      df_plays.loc[idx, 'TackleBy2'] = tackler_2[0]

    ##############
    #  INJURIES  #
    ##############

    injuries = re.findall(injury, play)
    if len(injuries) > 0:
      df_plays.at[idx, 'InjuredPlayers'] = injuries

    #############
    #  PENALTY  #
    #############

    # Every sentence mentioning a penalty is kept, as a list
    if 'PENALTY' in play:
      df_plays.at[idx, 'PenaltyDescription'] = [sentence for sentence in play.split(". ")
                                                if 'PENALTY' in sentence]

  # Every run play has been cleaned (or there were none to clean)
  return df_plays

## 3. PIPELINE MAIN METHOD

In [None]:
# PURPOSE:
# - Accept a dataframe of plays (dataframes formatted by NFL_Scrapers) and
#   return a cleaned dataframe of those plays.
# INPUT PARAMETERS:
# df_all_plays         - dataframe - all plays in raw form from NFL_Scraper that user
#                                    would like to clean.
# OUTPUT:
# df_all_plays_cleaned - dataframe - all plays from 'df_all_plays' cleaned and data
#                                    dispersed into individual new features.

# CURRENT DESIGN PLAN:
# 1. clean_dataframe_of_plays
#   1.1 Get indexes of a single category of play type within dataframe
#       (e.g. pass, run, touchdown, punt, sack, ... )
#   1.2 Use uniquely designed method for play type to clean within dataframe
#   1.3 Repeat until all plays within dataframe have been cleaned.
#      NOTE:
#      - It is important to fully clean a play type before moving to the next
#        because sometimes cleaning could involve adding a new row to the dataframe,
#        causing a reset to the dataframes indexing.
#        - If we were to separate all play types from the beginning, the indexes
#          could shift around causing an index that might be pointing to a run play
#          to now pointing at a pass play.

# NOTES:
# - I think "PlayOutcomes" is what determines the yardage gained on an intended play?
#   - This does not seem right to me.
#   - EXAMPLE:
#     - (9:54) Bre.Hall left end to BUF 22 for -1 yards (G.Rousseau)
#       FUMBLES (G.Rousseau), ball out of bounds at BUF 25.
#       - I would think that Bre.Hall would get docked -1 yards for his run.
#         - But I believe that he is actually docked -4
#           - 'PlayStart' = 2nd & 9 at BUF 21
#           - The play ends at BUF 25
#             - In my opinion and how I am going to track yardage is based on
#               possession of the ball. So I will track this as -1 yard not -4.

def clean_dataframe_of_plays(df_all_plays):
  """Return a cleaned copy of 'df_all_plays' with the extracted play features added.

  The input is copied, the feature columns described below are appended, and the
  copy is then run through clean_run_plays followed by clean_pass_plays.

  INPUT PARAMETERS:
  df_all_plays - dataframe - raw plays to clean
  RETURN:
  dataframe - whatever clean_pass_plays returns for the run-cleaned plays
  """

  #####################################
  # FEATURES ADDED TO THE DATAFRAME   #
  #####################################

  # PlayType           - The type of play (e.g. pass/run)
  # TimeOnTheClock     - The time that was on the clock when the play started
  # Formation          - Play formation
  # Passer             - Player that threw the ball (mostly the quarterback)
  # Rusher             - Player that ran the ball (mostly the runningback)
  # Receiver           - Player on the same team as the passer that caught the ball
  # PassType           - Whether the pass was a deep or short pass?
  # Direction          - Where the ball is going during the play
  # Yardage            - Yards gained during the play
  # TackleBy1          - Main tackler on the play (could be solo or could be with someone else)
  # TackleBy2          - Assisted tackler1
  # PressureBy         - Defender that applied pressure to the passer
  # FumbleDetails      - A list that has what happened after the fumble
  #                      - [forced fumble by, recovered by, yards gained, tackled by]
  # ReverseDetails     - A list having plays leading up to play reversal
  # InjuredPlayers     - Players that were injured during the play
  # PenaltyDescription - If there is a penalty, gives a description of it
  #                      - [who caused the penalty, what was the penalty, yards lost if penalty accepted]

  numeric_features = ["Yardage"]

  added_features = ["PlayType", "TimeOnTheClock", "Formation", "Passer", "Rusher", "Receiver", "PassType", "Direction", "Yardage",
                    "TackleBy1", "TackleBy2", "PressureBy",
                    "FumbleDetails", "ReverseDetails",
                    "InjuredPlayers", "PenaltyDescription"]

  # Every added feature that is not numeric is stored as text
  text_features = [feature for feature in added_features if feature not in numeric_features]

  ###########################################
  # COPY OF THE INPUT WITH THE NEW FEATURES #
  ###########################################

  plays_with_features = df_all_plays.copy()
  plays_with_features = plays_with_features.reindex(
      columns=plays_with_features.columns.tolist() + added_features)
  # The appended columns start out empty (NaN); as text they read 'nan'
  plays_with_features[text_features] = plays_with_features[text_features].astype(str)
  plays_with_features[numeric_features] = plays_with_features[numeric_features].astype(float)

  ##################################
  # CLEANING, ONE PLAY TYPE AT A TIME #
  ##################################

  # Run plays are cleaned completely before pass plays: cleaning can add rows
  # and renumber the index, so the pass cleaner must see the finished result.
  return clean_pass_plays(clean_run_plays(plays_with_features))

# TESTING AREA

In [None]:
# Run the whole cleaning pipeline on a copy of the raw Week 1 plays
week1_2023_plays_copy = week1_2023_plays.copy()

df_week1_plays_cleaned = clean_dataframe_of_plays(week1_2023_plays_copy)

In [None]:
# (rows, columns) of the cleaned dataframe
df_week1_plays_cleaned.shape

(2605, 31)

In [None]:
# Sum of 'Yardage' over every cleaned play whose passer is J.Goff
df_week1_plays_cleaned.loc[df_week1_plays_cleaned['Passer'] == 'J.Goff', 'Yardage'].sum()

241.0

# cleaned dataset observations

In [None]:
# Inspect a single cleaned play, selected by its position in the dataframe
df_week1_plays_cleaned.iloc[2299]

Unnamed: 0,2299
Season,2023
Week,Week 1
Day,SUN
Date,09/10
AwayTeam,49ers
HomeTeam,Steelers
Quarter,2ND QUARTER
DriveNumber,1
TeamWithPossession,SF
IsScoringDrive,1


In [None]:
# Original run plays (Raw data)
# Prints every raw run play that contains a fumble, one sentence per line

df_run_plays = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Run')]

for idx, play in df_run_plays['PlayDescription'].items():
  if 'FUMBLES' not in play:
    continue
  fumble_play_elements = play.split(". ")
  # index, then each sentence, then an empty line between plays
  print(idx, *fumble_play_elements, "", sep="\n")

115
(9:54) Bre.Hall left end to BUF 22 for -1 yards (G.Rousseau)
FUMBLES (G.Rousseau), ball out of bounds at BUF 25.

230
(2:08) S.Clifford FUMBLES (Aborted) at CHI 35, and recovers at CHI 35.

756
(6:44) (Shotgun) J.Goff Aborted
F.Ragnow FUMBLES at KC 24, recovered by DET-J.Goff at KC 27
J.Goff to KC 27 for no gain (G.Karlaftis).

826
(8:53) (Shotgun) D.Jones Aborted
J.Schmitz FUMBLES at DAL 18, recovered by NYG-D.Jones at DAL 27.

933
(9:27) (Shotgun) D.Jones FUMBLES (Aborted) at NYG 30, and recovers at NYG 30
D.Jones to NYG 32 for 2 yards (M.Smith).

1015
(6:33) (No Huddle, Shotgun) L.Jackson scrambles right end to HOU 20 for 6 yards (T.Thomas)
FUMBLES (T.Thomas), recovered by BAL-K.Zeitler at HOU 23
HOU-H.Ridgeway was injured during the play.

1214
(1:39) J.Williams right tackle to TEN 9 for 11 yards (K.Byard, S.Murphy-Bunting)
FUMBLES (S.Murphy-Bunting), and recovers at TEN 9.

1343
(3:02) T.Munford reported in as eligible
 J.Garoppolo FUMBLES (Aborted) at DEN 1, and recovers at D

In [None]:
# Original passing plays (raw data)
# Prints every raw pass play that contains a fumble, one sentence per line

df_original_pass_plays = week1_2023_plays[week1_2023_plays['PlayOutcome'].str.contains('Pass')]

for idx, play in df_original_pass_plays['PlayDescription'].items():
  if 'FUMBLES' not in play:
    continue
  fumble_play_elements = play.split(". ")
  # index, then each sentence, then an empty line between plays
  print(idx, *fumble_play_elements, "", sep="\n")

213
(14:21) J.Love to CHI 44 for -3 yards
FUMBLES, and recovers at CHI 46
J.Love pass deep left to L.Musgrave to CHI 4 for 37 yards (T.Stevenson) [D.Walker].

423
(14:15) T.Lawrence pass short right to C.Ridley to JAX 47 for 14 yards (R.Thomas, E.Speed)
FUMBLES (E.Speed), RECOVERED by IND-E.Speed at IND 49
E.Speed ran ob at IND 49 for no gain
The Replay Official reviewed the ball was inbounds ruling, and the play was REVERSED
T.Lawrence pass short right to C.Ridley to JAX 47 for 14 yards (R.Thomas, E.Speed)
FUMBLES (E.Speed), ball out of bounds at IND 49
IND-K.Moore was injured during the play
IND-D.Flowers was injured during the play.

872
(11:26) (Shotgun) D.Prescott pass short right to T.Pollard to NYG 12 for 7 yards (B.Okereke)
FUMBLES (B.Okereke), recovered by DAL-T.Biadasz at NYG 4.

961
(4:45) (Shotgun) D.Jones pass short left to M.Breida to NYG 43 for 5 yards (M.Bell)
FUMBLES (M.Bell), recovered by NYG-P.Campbell at NYG 35
P.Campbell to NYG 33 for -2 yards
Officially, a pass fo

In [None]:
# Run plays after cleaning: print the ones whose description still contains a fumble
df_week1_run_plays_cleaned = df_week1_plays_cleaned[df_week1_plays_cleaned['PlayOutcome'].str.contains('Run')]

for idx, play in df_week1_run_plays_cleaned['PlayDescription'].items():
  if 'FUMBLES' not in play:
    continue
  fumble_play_elements = play.split(". ")
  # index, then each sentence, then an empty line between plays
  print(idx, *fumble_play_elements, "", sep="\n")

115
(9:54) Bre.Hall left end to BUF 22 for -1 yards (G.Rousseau)
FUMBLES (G.Rousseau), ball out of bounds at BUF 25.

230
(2:08) S.Clifford FUMBLES (Aborted) at CHI 35, and recovers at CHI 35.

827
(8:53) (Shotgun) D.Jones Aborted
J.Schmitz FUMBLES at DAL 18, recovered by NYG-D.Jones at DAL 27.

934
(9:27) (Shotgun) D.Jones FUMBLES (Aborted) at NYG 30, and recovers at NYG 30

1018
(6:33) (No Huddle, Shotgun) L.Jackson scrambles right end to HOU 20 for 6 yards (T.Thomas)
FUMBLES (T.Thomas), recovered by BAL-K.Zeitler at HOU 23
HOU-H.Ridgeway was injured during the play.

1217
(1:39) J.Williams right tackle to TEN 9 for 11 yards (K.Byard, S.Murphy-Bunting)
FUMBLES (S.Murphy-Bunting), and recovers at TEN 9.

1346
(3:02) T.Munford reported in as eligible
 J.Garoppolo FUMBLES (Aborted) at DEN 1, and recovers at DEN 1.

1925
(13:56) (Shotgun) T.Tagovailoa FUMBLES (Aborted) at MIA 20, touched at MIA 20, and recovers at MIA 20.



In [None]:
# Every distinct passer found in Week 1.
# 'nan' is text, not a missing value: clean_dataframe_of_plays casts the new
# feature columns to str, so plays without a passer hold the string 'nan'.
df_week1_plays_cleaned['Passer'].unique()

array(['nan', 'J.Allen', 'A.Rodgers', 'Z.Wilson', 'J.Love', 'J.Fields',
       'A.Richardson', 'G.Minshew', 'T.Lawrence', 'J.Burrow',
       'J.Browning', 'D.Watson', 'P.Mahomes', 'J.Goff', 'D.Jones',
       'D.Prescott', 'T.Taylor', 'L.Jackson', 'C.Stroud', 'D.Carr',
       'R.Tannehill', 'J.Garoppolo', 'R.Wilson', 'D.Ridder', 'B.Young',
       'B.Mayfield', 'K.Cousins', 'J.Herbert', 'T.Tagovailoa', 'M.Jones',
       'J.Hurts', 'M.Stafford', 'G.Smith', 'B.Purdy', 'K.Pickett',
       'J.Dobbs', 'S.Howell'], dtype=object)

In [None]:
# Sum of 'Yardage' over every cleaned play whose passer is L.Jackson
df_week1_plays_cleaned.loc[df_week1_plays_cleaned['Passer'] == 'L.Jackson', 'Yardage'].sum()

169.0

In [None]:
# Locating offensive players involved for specific games
# (both teams of the game appear, since plays are selected by game, not by possession)

team = 'Cowboys'
position = 'Receiver'

in_team_game = (df_week1_plays_cleaned['HomeTeam'] == team) | (df_week1_plays_cleaned['AwayTeam'] == team)
df_week1_plays_cleaned.loc[in_team_game, position].unique()

array(['nan', 'S.Barkley', 'J.Ferguson', 'C.Lamb', 'K.Turpin', 'B.Cooks',
       'T.Pollard', 'M.Gallup', 'P.Hendershot', 'D.Slayton', 'D.Waller',
       'I.Hodgins', 'P.Campbell', 'J.Hyatt', 'D.Bellinger', 'S.Shepard',
       'L.Cager', 'M.Breida', 'G.Brightwell'], dtype=object)

In [None]:
# Same lookup for rushers; 'team' is the value set in the previous cell
position = 'Rusher'

in_team_game = (df_week1_plays_cleaned['HomeTeam'] == team) | (df_week1_plays_cleaned['AwayTeam'] == team)
df_week1_plays_cleaned.loc[in_team_game, position].unique()

array(['nan', 'S.Barkley', 'D.Jones', 'M.Breida', 'T.Pollard', 'R.Dowdle',
       'K.Turpin', 'D.Prescott', 'C.Rush', 'D.Vaughn', 'G.Brightwell',
       'P.Campbell'], dtype=object)

In [None]:
# Getting stats for specific player given play type
playtype = 'Pass'
player = 'C.Lamb'
position = 'Receiver'

# 'Yardage' of every play of the chosen type in which the player filled the chosen position.
# Stored under its own name so 'player' keeps holding the player's name after the cell runs.
player_yardage = df_week1_plays_cleaned.loc[(df_week1_plays_cleaned['PlayType'] == playtype) &
                                            (df_week1_plays_cleaned[position] == player), 'Yardage']

print("Total yards: {}".format(player_yardage.sum()))
print("Total carries/targets: {}".format(len(player_yardage)))

Total yards: 77.0
Total carries/targets: 4


In [None]:
# Defensive stats (how many plays each defender is listed under 'PressureBy'
# in the Cowboys game - this counts pressures, not tackles)

df_week1_plays_cleaned['PressureBy'].loc[(df_week1_plays_cleaned['HomeTeam'] == 'Cowboys') |
                                        (df_week1_plays_cleaned['AwayTeam'] == 'Cowboys')].value_counts()

Unnamed: 0_level_0,count
PressureBy,Unnamed: 1_level_1
,148
D.Lawrence,2
D.Fowler,2
J.Ward,1
D.Bland,1


In [None]:
# Injuries during week 1 2023
# Counts how often each 'InjuredPlayers' value occurs across all cleaned plays.

df_week1_plays_cleaned['InjuredPlayers'].value_counts()
# Lookup of a single injured player (disabled).
# NOTE(review): the run cleaner stores a list of names per play, so comparing with a plain
# name may not match any row - confirm before re-enabling.
# df_week1_plays_cleaned.loc[df_week1_plays_cleaned['InjuredPlayers'] == 'F.Cox']

Unnamed: 0_level_0,count
InjuredPlayers,Unnamed: 1_level_1
,2559
[F.Cox],1
[P.Turner],1
[C.Sterns],1
[J.Meyers],1
[J.Horn],1
[J.Bates],1
[D.White],1
[L.David],1
[M.Williams],1


In [None]:
# All plays of the game the Packers took part in (home or away)
played_by_packers = ((df_week1_plays_cleaned['HomeTeam'] == 'Packers') |
                     (df_week1_plays_cleaned['AwayTeam'] == 'Packers'))
packers_game = df_week1_plays_cleaned.loc[played_by_packers]

# Run plays of that game carried by A.Jones
is_a_jones_run = (packers_game['PlayType'] == 'Run') & (packers_game['Rusher'] == 'A.Jones')
packers_game.loc[is_a_jones_run]

Unnamed: 0,Season,Week,Day,Date,AwayTeam,HomeTeam,Quarter,DriveNumber,TeamWithPossession,IsScoringDrive,...,PassType,Direction,Yardage,TackleBy1,TackleBy2,PressureBy,FumbleDetails,ReverseDetails,InjuredPlayers,PenaltyDescription
153,2023,Week 1,SUN,09/10,Packers,Bears,1ST QUARTER,2,GB,1,...,,left tackle,-1.0,D.Walker,,,,,,
156,2023,Week 1,SUN,09/10,Packers,Bears,1ST QUARTER,2,GB,1,...,,right end,7.0,J.Brisker,T.Edwards,,,,,
157,2023,Week 1,SUN,09/10,Packers,Bears,1ST QUARTER,2,GB,1,...,,left guard,3.0,J.Sanborn,,,,,,
158,2023,Week 1,SUN,09/10,Packers,Bears,1ST QUARTER,2,GB,1,...,,right tackle,8.0,Y.Ngakoue,,,,,,
160,2023,Week 1,SUN,09/10,Packers,Bears,1ST QUARTER,2,GB,1,...,,right end,7.0,J.Brisker,,,,,[J.Brisker],
190,2023,Week 1,SUN,09/10,Packers,Bears,3RD QUARTER,1,GB,1,...,,left guard,9.0,T.Edmunds,,,,,,
191,2023,Week 1,SUN,09/10,Packers,Bears,3RD QUARTER,1,GB,1,...,,left tackle,0.0,T.Edwards,T.Edmunds,,,,,
193,2023,Week 1,SUN,09/10,Packers,Bears,3RD QUARTER,1,GB,1,...,,left end,7.0,T.Edwards,,,,,,
