# Explore the Transformed Play by Play Data

## Setup - Paths - Dependencies

In [29]:
from config import recent_play_by_play, recent_clean_db

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

## Load Play by Play data
pbp_raw_df = pd.read_csv(recent_play_by_play, low_memory=False)


data_folder = os.path.join('..', 'data') # data folder
temp_folder = os.path.join('..', 'TEMP') # temp folder

# pbp_raw_df.head()


In [30]:
# Rename to df for convenience
df = pbp_raw_df

# Filter for relevant events: Faceoffs and Goals
faceoff_events = df[df['Event_type'] == 'Faceoff']
goal_events = df[df['Event_type'] == 'Goal']

# Merge faceoff events with subsequent goal events within the same game and period
merged_df = pd.merge(
    faceoff_events[['Game_ID', 'Period', 'Time', 'Primary_team']],
    goal_events[['Game_ID', 'Period', 'Time', 'Primary_team']],
    on=['Game_ID', 'Period'],
    suffixes=('_faceoff', '_goal')
)

# Calculate the time difference between faceoff and goal
merged_df['time_diff'] = merged_df['Time_goal'] - merged_df['Time_faceoff']

# Filter only instances where the goal happens after the faceoff
merged_df = merged_df[merged_df['time_diff'] > 0]


# Count how often a goal is scored within 10 seconds and 5 seconds of a faceoff
goals_within_10s = (merged_df['time_diff'] <= 10).sum()
goals_within_5s = (merged_df['time_diff'] <= 5).sum()

# Count the number of times each team has scored a goal within 5 seconds of a faceoff
teams_scoring_within_5s = merged_df[merged_df['time_diff'] <= 5]['Primary_team_goal'].value_counts()

# Display results
# goals_within_10s, goals_within_5s, teams_scoring_within_5s


## Data Elporation

### Faceoff Danger
- Table of how many times a team has scored within 3,5, 7, and 10 seconds of a faceoff

In [31]:
# Recalculate the time difference properly
merged_df = pd.merge(
    faceoff_events[['Game_ID', 'Period', 'Time', 'Primary_team']],
    goal_events[['Game_ID', 'Period', 'Time', 'Primary_team']],
    on=['Game_ID', 'Period'],
    suffixes=('_faceoff', '_goal')
)

# Ensure that the goal occurs after the faceoff in time
merged_df = merged_df[merged_df['Time_goal'] > merged_df['Time_faceoff']]

# Calculate the correct time difference between faceoff and goal
merged_df['time_diff'] = merged_df['Time_goal'] - merged_df['Time_faceoff']

# Count goals per team in different time frames
teams_goal_counts = merged_df.groupby('Primary_team_goal')['time_diff'].agg(
    within_1s=lambda x: (x <= 1).sum(),
    within_2s=lambda x: (x <= 2).sum(),
    within_3s=lambda x: (x <= 3).sum(),
    within_4s=lambda x: (x <= 4).sum(),
    within_5s=lambda x: (x <= 5).sum(),
    within_6s=lambda x: (x <= 6).sum(),
    within_7s=lambda x: (x <= 7).sum(),
    within_8s=lambda x: (x <= 8).sum(),
    within_9s=lambda x: (x <= 9).sum(),
    within_10s=lambda x: (x <= 10).sum()
).reset_index()

# Add the total goals scored by each team
total_goals_per_team = goal_events['Primary_team'].value_counts().reset_index()
total_goals_per_team.columns = ['Primary_team_goal', 'Total_Goals']

# Merge with the team goal counts
teams_goal_counts = teams_goal_counts.merge(total_goals_per_team, on='Primary_team_goal', how='left')



In [32]:
teams_goal_counts.head()

Unnamed: 0,Primary_team_goal,within_1s,within_2s,within_3s,within_4s,within_5s,within_6s,within_7s,within_8s,within_9s,within_10s,Total_Goals
0,Air Force,0,0,0,2,2,2,4,5,6,8,55
1,Alaska,0,1,1,3,4,6,6,6,8,9,54
2,Alaska Anchorage,0,0,0,0,0,1,1,2,2,2,55
3,American Intl,0,0,0,0,1,1,1,2,2,4,57
4,Arizona State,0,0,0,0,1,1,1,1,1,2,90


In [33]:
# Calculate the percentage for each goal duration
for sec in range(1, 11):
    col = f'within_{sec}s'
    pct_col = f'{col}_pct'
    teams_goal_counts[pct_col] = teams_goal_counts[col] / teams_goal_counts['Total_Goals']

# Rearrange columns in an interleaved order: original value then its percentage
cols = ['Primary_team_goal', 'Total_Goals'] + [
    item for sec in range(1, 11)
    for item in (f'within_{sec}s', f'within_{sec}s_pct')
]

team_goal_counts = teams_goal_counts[cols]


In [34]:
team_goal_counts.sample(10)

Unnamed: 0,Primary_team_goal,Total_Goals,within_1s,within_1s_pct,within_2s,within_2s_pct,within_3s,within_3s_pct,within_4s,within_4s_pct,...,within_6s,within_6s_pct,within_7s,within_7s_pct,within_8s,within_8s_pct,within_9s,within_9s_pct,within_10s,within_10s_pct
59,St. Thomas,75,0,0.0,0,0.0,0,0.0,1,0.013333,...,2,0.026667,2,0.026667,3,0.04,4,0.053333,5,0.066667
16,Colgate,80,0,0.0,1,0.0125,1,0.0125,3,0.0375,...,7,0.0875,7,0.0875,7,0.0875,7,0.0875,7,0.0875
60,Stonehill,57,0,0.0,0,0.0,0,0.0,0,0.0,...,2,0.035088,2,0.035088,3,0.052632,3,0.052632,5,0.087719
36,Michigan Tech,67,0,0.0,0,0.0,0,0.0,0,0.0,...,2,0.029851,2,0.029851,3,0.044776,4,0.059701,6,0.089552
11,Boston University,86,0,0.0,1,0.011628,2,0.023256,3,0.034884,...,5,0.05814,6,0.069767,7,0.081395,9,0.104651,9,0.104651
4,Arizona State,90,0,0.0,0,0.0,0,0.0,0,0.0,...,1,0.011111,1,0.011111,1,0.011111,1,0.011111,2,0.022222
12,Bowling Green,63,0,0.0,1,0.015873,1,0.015873,2,0.031746,...,2,0.031746,3,0.047619,3,0.047619,3,0.047619,5,0.079365
47,Omaha,67,0,0.0,0,0.0,0,0.0,0,0.0,...,3,0.044776,4,0.059701,4,0.059701,5,0.074627,7,0.104478
0,Air Force,55,0,0.0,0,0.0,0,0.0,2,0.036364,...,2,0.036364,4,0.072727,5,0.090909,6,0.109091,8,0.145455
9,Bentley,68,0,0.0,1,0.014706,1,0.014706,1,0.014706,...,1,0.014706,1,0.014706,2,0.029412,5,0.073529,7,0.102941


## Save Resulting Table as CSV

In [35]:
### Save to data folder
team_goal_counts.to_csv(temp_folder + '/faceoff_goal_counts.csv', index=False)

###### Look at table sorted by column

In [36]:
# ##3 Sort by Most within 5 seconds
# teams_goal_counts = teams_goal_counts.sort_values(by='within_5s', ascending=False)
# teams_goal_counts.head(10)

# teams_goal_counts = teams_goal_counts.sort_values(by='within_3s', ascending=False)
# teams_goal_counts.head(10)

# within_7s = teams_goal_counts.sort_values(by='within_7s', ascending=False)
# within_7s.head(10)

# within_10s = teams_goal_counts.sort_values(by='within_10s', ascending=False)
# within_10s.head(10)

# # any within 1s?
# within_1s = teams_goal_counts.sort_values(by='within_1s', ascending=False)
# within_1s.head(10)
# within_2s = teams_goal_counts.sort_values(by='within_2s', ascending=False)
# within_2s.head(10)
within_3s = teams_goal_counts.sort_values(by='within_3s', ascending=False)
within_3s.head(10)
# within_4s = teams_goal_counts.sort_values(by='within_4s', ascending=False)
# within_4s.head(10)
# within_5s = teams_goal_counts.sort_values(by='within_5s', ascending=False)
# within_5s.head(10)
# within_6s = teams_goal_counts.sort_values(by='within_6s', ascending=False)
# within_6s.head(10)
# within_7s = teams_goal_counts.sort_values(by='within_7s', ascending=False)
# within_7s.head(10)
# within_8s = teams_goal_counts.sort_values(by='within_8s', ascending=False)
# within_8s.head(10)
# within_9s = teams_goal_counts.sort_values(by='within_9s', ascending=False)
# within_9s.head(10)
# within_10s = teams_goal_counts.sort_values(by='within_10s', ascending=False)
# within_10s.head(10)

Unnamed: 0,Primary_team_goal,within_1s,within_2s,within_3s,within_4s,within_5s,within_6s,within_7s,within_8s,within_9s,...,within_1s_pct,within_2s_pct,within_3s_pct,within_4s_pct,within_5s_pct,within_6s_pct,within_7s_pct,within_8s_pct,within_9s_pct,within_10s_pct
55,Sacred Heart,1,1,5,6,6,6,6,8,11,...,0.010989,0.010989,0.054945,0.065934,0.065934,0.065934,0.065934,0.087912,0.120879,0.120879
43,Northeastern,0,1,3,3,3,3,4,5,6,...,0.0,0.015385,0.046154,0.046154,0.046154,0.046154,0.061538,0.076923,0.092308,0.092308
27,Long Island,0,2,3,3,4,8,10,11,13,...,0.0,0.025641,0.038462,0.038462,0.051282,0.102564,0.128205,0.141026,0.166667,0.179487
35,Michigan State,0,1,2,5,6,8,8,11,14,...,0.0,0.010638,0.021277,0.053191,0.06383,0.085106,0.085106,0.117021,0.148936,0.191489
49,Princeton,0,0,2,3,3,4,4,4,5,...,0.0,0.0,0.045455,0.068182,0.068182,0.090909,0.090909,0.090909,0.113636,0.113636
57,St. Cloud State,0,0,2,2,2,3,4,7,9,...,0.0,0.0,0.037037,0.037037,0.037037,0.055556,0.074074,0.12963,0.166667,0.166667
32,Merrimack,1,1,2,2,4,4,4,5,7,...,0.016667,0.016667,0.033333,0.033333,0.066667,0.066667,0.066667,0.083333,0.116667,0.116667
24,Holy Cross,0,1,2,4,5,5,7,7,8,...,0.0,0.012821,0.025641,0.051282,0.064103,0.064103,0.089744,0.089744,0.102564,0.115385
34,Michigan,0,0,2,2,3,4,5,6,7,...,0.0,0.0,0.022989,0.022989,0.034483,0.045977,0.057471,0.068966,0.08046,0.08046
11,Boston University,0,1,2,3,5,5,6,7,9,...,0.0,0.011628,0.023256,0.034884,0.05814,0.05814,0.069767,0.081395,0.104651,0.104651


### Goals Allowed Just after faceoffs

In [37]:
## Reset df to a clean copy of the full pbp data
df = pbp_raw_df



# df.head() # data check


In [38]:
# Fill the Secondary team column with the opposite team 
# If the Primary team is 'Home', the Secondary team is 'Away' and vice versa based on Game_ID
# Away is after third '-' in game_id. home is after 4th
# check primary team against game id and put opposite in secondary

# Create a new column 'Secondary_team' and initialize it with None
df['Secondary_team'] = None
# Loop through each row in the DataFrame
for index, row in df.iterrows():
    # Extract the game ID
    game_id = row['Game_ID']
    # Determine the home and away teams based on the game ID
    away_team = game_id.split('-')[3]
    home_team = game_id.split('-')[4]
    # Assign the opposite team to the 'Secondary_team' column based on the 'Primary_team'
    if row['Primary_team'] == home_team:
        df.at[index, 'Secondary_team'] = away_team
    else:
        df.at[index, 'Secondary_team'] = home_team


# Filter for relevant events: Faceoffs and Goals
faceoff_events = df[df['Event_type'] == 'Faceoff']
goal_events = df[df['Event_type'] == 'Goal']

# Check the first few rows to verify the new column
# df.head()

In [39]:

# Merge faceoff events with subsequent goal events within the same game and period
merged_allowed_df = pd.merge(
    faceoff_events[['Game_ID', 'Period', 'Time', 'Primary_team', 'Secondary_team']],
    goal_events[['Game_ID', 'Period', 'Time', 'Primary_team', 'Secondary_team']],
    on=['Game_ID', 'Period'],
    suffixes=('_faceoff', '_goal')
)

# Calculate the time difference between faceoff and goal
merged_allowed_df['time_diff'] = merged_allowed_df['Time_goal'] - merged_allowed_df['Time_faceoff']

# Filter only instances where the goal happens after the faceoff
merged_allowed_df = merged_allowed_df[merged_allowed_df['time_diff'] > 0]

merged_allowed_df.head()
# Count goals per team in different time frames
teams_goal_counts = merged_allowed_df.groupby('Secondary_team_goal')['time_diff'].agg(
    within_1s=lambda x: (x <= 1).sum(),
    within_2s=lambda x: (x <= 2).sum(),
    within_3s=lambda x: (x <= 3).sum(),
    within_4s=lambda x: (x <= 4).sum(),
    within_5s=lambda x: (x <= 5).sum(),
    within_6s=lambda x: (x <= 6).sum(),
    within_7s=lambda x: (x <= 7).sum(),
    within_8s=lambda x: (x <= 8).sum(),
    within_9s=lambda x: (x <= 9).sum(),
    within_10s=lambda x: (x <= 10).sum()
).reset_index()

# Add the total goals scored by each team
total_goals_per_team = goal_events['Secondary_team'].value_counts().reset_index()
total_goals_per_team.columns = ['Secondary_team', 'Total_Goals']

# Merge with the team goal counts
 

teams_goal_counts.head()



Unnamed: 0,Secondary_team_goal,within_1s,within_2s,within_3s,within_4s,within_5s,within_6s,within_7s,within_8s,within_9s,within_10s
0,Air Force,0,2,2,2,2,2,3,3,5,9
1,Alaska,0,0,0,1,2,2,2,2,2,3
2,Alaska Anchorage,1,1,3,3,4,4,4,6,8,9
3,American Intl,0,1,3,4,7,7,7,8,9,10
4,Arizona State,0,0,0,1,1,1,2,3,3,3


### AFTER PP SUCCESS - Within 10 seconds of end of PP