# Explore the Transformed Play by Play Data

## Setup - Paths - Dependencies

In [105]:
from config import recent_play_by_play, recent_clean_db

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

## Load Play by Play data
# Read the play-by-play CSV whose path is supplied by the project config module.
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
pbp_raw_df = pd.read_csv(recent_play_by_play, low_memory=False)

# Output locations, both relative to the notebook's working directory:
# '../data' for the data folder and '../TEMP' for the temp folder.
data_folder, temp_folder = (os.path.join('..', name) for name in ('data', 'TEMP'))

# pbp_raw_df.head()


In [106]:
#### HOTFIX TO CLEAN

### Drop exhibition games from the dataset
# Two problem teams: St Anselm and Assumption. Any row whose Game_ID mentions
# either of them is dropped.
# na=False keeps rows with a missing Game_ID instead of producing a NaN mask
# (which cannot be used for boolean indexing).
is_exhibition = pbp_raw_df['Game_ID'].str.contains('St Anselm|Assumption', na=False)

# .copy() gives an independent frame, so the column assignments below (and in
# later cells) do not write into a slice of the originally loaded data.
pbp_raw_df = pbp_raw_df.loc[~is_exhibition].copy()

### Fix a few malformed Primary_team values (exact-match replacement)
primary_team_fixes = {
    'UCN': 'Connecticut',
    'UCN Gustafsson': 'Connecticut',
    'PRI de la': 'Princeton',
}
pbp_raw_df['Primary_team'] = pbp_raw_df['Primary_team'].replace(primary_team_fixes)



In [107]:
# Short alias for the play-by-play frame (same object, not a copy)
df = pbp_raw_df

# Split out the two event types of interest: faceoffs and goals
is_faceoff = df['Event_type'].eq('Faceoff')
is_goal = df['Event_type'].eq('Goal')
faceoff_events = df.loc[is_faceoff]
goal_events = df.loc[is_goal]

# Pair every faceoff with every goal from the same game and period.
# The join is many-to-many, so each row of merged_df is one (faceoff, goal) pair.
pair_columns = ['Game_ID', 'Period', 'Time', 'Primary_team']
merged_df = faceoff_events[pair_columns].merge(
    goal_events[pair_columns],
    on=['Game_ID', 'Period'],
    suffixes=('_faceoff', '_goal'),
)

# Time from the faceoff to the goal; keep only pairs where the goal comes later
merged_df = merged_df.assign(
    time_diff=merged_df['Time_goal'].sub(merged_df['Time_faceoff'])
)
merged_df = merged_df.loc[merged_df['time_diff'].gt(0)]

# Number of (faceoff, goal) pairs at most 10 and at most 5 time units apart
# (assumes Time is recorded in seconds -- TODO confirm)
goals_within_10s = merged_df['time_diff'].le(10).sum()
within_5s = merged_df['time_diff'].le(5)
goals_within_5s = within_5s.sum()

# Per scoring team, how many of those pairs fall inside the 5-unit window
teams_scoring_within_5s = merged_df.loc[within_5s, 'Primary_team_goal'].value_counts()

# Display results
# goals_within_10s, goals_within_5s, teams_scoring_within_5s


# Data Exploration

## Faceoff Danger
- Table of how many times each team has scored within 1 to 10 seconds of a faceoff

In [108]:
# Rebuild the faceoff/goal pairs: every faceoff joined to every goal that shares
# its Game_ID and Period.
pair_columns = ['Game_ID', 'Period', 'Time', 'Primary_team']
merged_df = faceoff_events[pair_columns].merge(
    goal_events[pair_columns],
    on=['Game_ID', 'Period'],
    suffixes=('_faceoff', '_goal'),
)

# Keep only pairs where the goal's Time is later than the faceoff's Time,
# then record the gap between the two.
merged_df = merged_df.loc[merged_df['Time_goal'].gt(merged_df['Time_faceoff'])]
merged_df = merged_df.assign(
    time_diff=merged_df['Time_goal'] - merged_df['Time_faceoff']
)

# One aggregation per window (1..10): count the pairs whose gap is <= the limit.
# These must stay lambdas: pandas renames duplicate lambdas internally but
# rejects several regular functions that share a name. `limit=sec` freezes the
# loop value for each lambda.
scored_window_aggs = {
    f'Scored_within_{sec}s': (lambda diffs, limit=sec: (diffs <= limit).sum())
    for sec in range(1, 11)
}
teams_goal_counts = (
    merged_df.groupby('Primary_team_goal')['time_diff']
    .agg(**scored_window_aggs)
    .reset_index()
)

# Total number of goal events per team, keyed like teams_goal_counts
total_goals_per_team = (
    goal_events['Primary_team']
    .value_counts()
    .rename_axis('Primary_team_goal')
    .reset_index(name='Total_Goals')
)

# Attach the totals to the per-window counts
teams_goal_counts = teams_goal_counts.merge(
    total_goals_per_team, on='Primary_team_goal', how='left'
)



In [109]:
# Preview the first rows of the per-team counts table (counts per window plus Total_Goals)
teams_goal_counts.head()

Unnamed: 0,Primary_team_goal,Scored_within_1s,Scored_within_2s,Scored_within_3s,Scored_within_4s,Scored_within_5s,Scored_within_6s,Scored_within_7s,Scored_within_8s,Scored_within_9s,Scored_within_10s,Total_Goals
0,Air Force,0,0,0,2,2,2,4,5,6,8,55
1,Alaska,0,1,1,3,4,6,6,6,8,9,54
2,Alaska Anchorage,0,0,0,0,0,1,1,2,2,2,55
3,American Intl,0,0,0,0,1,1,1,2,2,4,57
4,Arizona State,0,0,0,0,1,1,1,1,1,2,90


In [110]:
# Rename Primary_team_goal to Team.
# Done first and without inplace=True: rename() ignores labels that are not
# present, so re-running this cell after the column is already called 'Team'
# is a harmless no-op instead of a KeyError further down.
teams_goal_counts = teams_goal_counts.rename(columns={'Primary_team_goal': 'Team'})

# Share of each team's total goals that fall inside each 1..10 window
scored_pct_columns = {
    f'Scored_within_{sec}s_pct': teams_goal_counts[f'Scored_within_{sec}s'] / teams_goal_counts['Total_Goals']
    for sec in range(1, 11)
}
teams_goal_counts = teams_goal_counts.assign(**scored_pct_columns)

# Rearrange columns in an interleaved order: original value then its percentage
cols = ['Team', 'Total_Goals'] + [
    item for sec in range(1, 11)
    for item in (f'Scored_within_{sec}s', f'Scored_within_{sec}s_pct')
]
teams_goal_counts = teams_goal_counts[cols]

In [111]:
# teams_goal_counts.sample(10)

### Save Resulting Table as CSV

In [112]:
### Save to temp folder
# Create the folder first if it is missing: to_csv does not create parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs(temp_folder, exist_ok=True)
teams_goal_counts.to_csv(os.path.join(temp_folder, 'team_goal_counts.csv'), index=False)




## Goals Allowed Just after faceoffs

In [113]:
## Point df back at the full play-by-play data
# NOTE: this is an alias, not a copy -- the 'Secondary_team' column added below
# is therefore also visible through pbp_raw_df.
df = pbp_raw_df

# Fill the Secondary_team column with the opposing team.
# Game_ID is split on '-': the piece at index 3 is read as the away team and the
# piece at index 4 as the home team.
# NOTE(review): a team name that itself contains '-' would be split incorrectly
# -- confirm no such names occur in Game_ID.
game_id_parts = df['Game_ID'].str.split('-')
away_teams = game_id_parts.str[3]
home_teams = game_id_parts.str[4]

# Vectorised replacement for the former row-by-row iterrows()/df.at loop:
# when Primary_team is the home team the opponent is the away team; in every
# other case (including a Primary_team that matches neither) it is the home team.
# A missing or malformed Game_ID now yields NaN here instead of raising.
primary_is_home = df['Primary_team'] == home_teams
df['Secondary_team'] = home_teams.where(~primary_is_home, away_teams)

# Filter for relevant events: Faceoffs and Goals (now carrying Secondary_team)
faceoff_events = df[df['Event_type'] == 'Faceoff']
goal_events = df[df['Event_type'] == 'Goal']

# Check the first few rows to verify the new column
# df.head()

In [114]:

# Pair every faceoff with every goal from the same game and period, carrying
# both the scoring side (Primary_team) and the opposing side (Secondary_team).
pair_columns = ['Game_ID', 'Period', 'Time', 'Primary_team', 'Secondary_team']
merged_allowed_df = pd.merge(
    faceoff_events[pair_columns],
    goal_events[pair_columns],
    on=['Game_ID', 'Period'],
    suffixes=('_faceoff', '_goal')
)

# Time from the faceoff to the goal; keep only pairs where the goal comes later
merged_allowed_df['time_diff'] = merged_allowed_df['Time_goal'] - merged_allowed_df['Time_faceoff']
merged_allowed_df = merged_allowed_df[merged_allowed_df['time_diff'] > 0]

# One aggregation per window (1..10): count the pairs whose gap is <= the limit.
# Grouping on Secondary_team_goal attributes each goal to the team that did NOT
# score it, i.e. the team that allowed it.
# These must stay lambdas: pandas renames duplicate lambdas internally but
# rejects several regular functions that share a name. `limit=sec` freezes the
# loop value for each lambda.
allowed_window_aggs = {
    f'within_{sec}s': (lambda diffs, limit=sec: (diffs <= limit).sum())
    for sec in range(1, 11)
}
teams_goal_allowed = (
    merged_allowed_df.groupby('Secondary_team_goal')['time_diff']
    .agg(**allowed_window_aggs)
    .reset_index()
)

# teams_goal_allowed.head()



In [115]:
# Total goals allowed per team: count how often each team appears as the
# Secondary_team on a goal event, keyed like teams_goal_allowed.
total_goals_allowed_per_team = (
    goal_events['Secondary_team']
    .value_counts()
    .rename_axis('Secondary_team_goal')
    .reset_index(name='Total_Goals_Allowed')
)

# Attach the totals to the per-window allowed counts (left join keeps every
# team already present in teams_goal_allowed)
teams_goal_allowed = teams_goal_allowed.merge(
    total_goals_allowed_per_team,
    on='Secondary_team_goal',
    how='left',
)


### Merge Two Datatables

In [116]:
# Rename columns for clarity in the allowed goals DataFrame.
# Renaming by name (not by assigning a positional list to .columns) does not
# depend on column order or count, so re-running this cell after more columns
# have been added is a no-op instead of a length-mismatch error.
# 'Total_Goals_Allowed' already has its final name and is left untouched.
allowed_column_names = {'Secondary_team_goal': 'Team'}
allowed_column_names.update(
    {f'within_{sec}s': f'Allowed_within_{sec}s' for sec in range(1, 11)}
)
teams_goal_allowed = teams_goal_allowed.rename(columns=allowed_column_names)

# Check the data
teams_goal_allowed.sample(10)


Unnamed: 0,Team,Allowed_within_1s,Allowed_within_2s,Allowed_within_3s,Allowed_within_4s,Allowed_within_5s,Allowed_within_6s,Allowed_within_7s,Allowed_within_8s,Allowed_within_9s,Allowed_within_10s,Total_Goals_Allowed
40,Niagara,0,0,2,2,4,4,4,4,5,6,78
30,Mercyhurst,0,0,1,3,4,5,8,10,14,18,116
58,Stonehill,0,0,0,1,3,3,3,4,6,9,92
49,Providence,0,0,2,2,2,3,3,3,3,3,57
22,Harvard,0,0,1,1,2,3,5,6,7,7,53
3,American Intl,0,1,3,4,7,7,7,8,9,10,79
57,St. Thomas,0,0,1,2,4,5,7,8,9,10,72
44,Notre Dame,0,0,1,1,1,3,6,7,7,9,78
15,Colgate,0,0,0,1,1,3,4,5,5,6,74
50,Quinnipiac,0,0,1,1,1,1,1,2,2,6,57


In [117]:
# Share of each team's total goals allowed that fall inside each 1..10 window
allowed_pct_columns = {
    f'Allowed_within_{sec}s_pct': teams_goal_allowed[f'Allowed_within_{sec}s'].div(
        teams_goal_allowed['Total_Goals_Allowed']
    )
    for sec in range(1, 11)
}
teams_goal_allowed = teams_goal_allowed.assign(**allowed_pct_columns)

# Combine the "allowed" table with the "scored" table, one row per Team.
# Left join: every team in teams_goal_allowed is kept; teams with no match in
# teams_goal_counts get NaN in the Scored_* columns.
face_off_goal_success = teams_goal_allowed.merge(teams_goal_counts, on='Team', how='left')




In [118]:
## Save Final Table to Data Folder
# The file name carries today's date (YYYY-MM-DD), so each day's run writes its own file
today = pd.Timestamp.now().strftime('%Y-%m-%d')
output_name = f'face_off_goal_success_{today}.csv'
output_path = os.path.join(data_folder, output_name)
face_off_goal_success.to_csv(output_path, index=False)

# Examine table
face_off_goal_success.head(10)

# Check info
# face_off_goal_success.info()

Unnamed: 0,Team,Allowed_within_1s,Allowed_within_2s,Allowed_within_3s,Allowed_within_4s,Allowed_within_5s,Allowed_within_6s,Allowed_within_7s,Allowed_within_8s,Allowed_within_9s,...,Scored_within_6s,Scored_within_6s_pct,Scored_within_7s,Scored_within_7s_pct,Scored_within_8s,Scored_within_8s_pct,Scored_within_9s,Scored_within_9s_pct,Scored_within_10s,Scored_within_10s_pct
0,Air Force,0,2,2,2,2,2,3,3,5,...,2.0,0.036364,4.0,0.072727,5.0,0.090909,6.0,0.109091,8.0,0.145455
1,Alaska,0,0,0,1,2,2,2,2,2,...,6.0,0.111111,6.0,0.111111,6.0,0.111111,8.0,0.148148,9.0,0.166667
2,Alaska Anchorage,1,1,3,3,4,4,4,6,8,...,1.0,0.018182,1.0,0.018182,2.0,0.036364,2.0,0.036364,2.0,0.036364
3,American Intl,0,1,3,4,7,7,7,8,9,...,1.0,0.017544,1.0,0.017544,2.0,0.035088,2.0,0.035088,4.0,0.070175
4,Arizona State,0,0,0,1,1,1,2,3,3,...,1.0,0.011111,1.0,0.011111,1.0,0.011111,1.0,0.011111,2.0,0.022222
5,Army,0,0,0,1,1,2,2,2,4,...,4.0,0.054795,4.0,0.054795,5.0,0.068493,7.0,0.09589,9.0,0.123288
6,Augustana,0,1,2,3,3,3,3,3,3,...,0.0,0.0,0.0,0.0,2.0,0.027778,2.0,0.027778,2.0,0.027778
7,Bemidji State,0,0,0,3,3,3,4,7,8,...,4.0,0.064516,4.0,0.064516,5.0,0.080645,6.0,0.096774,6.0,0.096774
8,Bentley,0,0,0,0,1,1,1,1,2,...,1.0,0.014706,1.0,0.014706,2.0,0.029412,5.0,0.073529,7.0,0.102941
9,Boston College,0,1,2,3,3,3,3,4,5,...,2.0,0.025,3.0,0.0375,3.0,0.0375,4.0,0.05,4.0,0.05


### AFTER PP SUCCESS - Within 10 seconds of end of PP