# Empty Net Analysis
## Look at team performance when pulling their goalie and vs an empty net

### Challenge
- CHN data counts pulling the goalie during a delayed penalty and records that as time vs empty net
    - this skews the count of empty net as almost every team has at least a small portion of time that shows as having an empty net each game when the team did not actually pull their goalie in an end game situation
- build a function that looks at the scoring summary for a game and makes a guess at weither a team was in a situation to pull their goalie for the end game
    - Conditions:
        - Down 1, 2, 3 goals within the last 5 minutes of the 3rd period
            - How to identify, dictionary for each game where the score changes are tracked and if the conditions described above are met tag the game as a possible late_game_EN

#### Setup - Paths and Dependencies

In [41]:
###### SETUP ######

# Dependencies
# Basics
import os
import sys
import time
import sqlite3
import pandas as pd
from collections import defaultdict


## File Paths
folder_prefix = ''
# folder_prefix = '..'
data_folder = os.path.join(folder_prefix, '..', 'data/') # Data Folder Path
temp_folder = os.path.join(folder_prefix,'..', 'TEMP/',) # Temp Folder Path
TEMP_FOLDER = temp_folder # Temp Folder Path as used in legacy code
output_folder = os.path.join(temp_folder, 'team_comp_output/') # Output Folder Path

################ DATABASE PATH ####################
db_path = os.path.join(data_folder, 'db', '2025_Jan_16_CLEAN.db') # Database Path

#### SCHOOL INFO FILE PATH ####
school_info_path = os.path.join(data_folder, 'arena_school_info.csv') # School Info Path


###### Paths important for plotting - logos, etc. ######
### FILL IN IF NECESSARY ###

#### Connect to DB and extract ness tables

In [42]:
## Load the database
conn = sqlite3.connect(db_path, isolation_level=None)

######## SCORING / GOAL SUMMARY ########
## SQL query to fetch
def extract_goal_summary(conn):
    """
    Extracts and preprocesses the goal summary data from the database.
    """
    query = """
        WITH UniqueGoals AS (
        SELECT DISTINCT Game_ID, Team, Period, Time, PP
        FROM scoring_summary
    )
    SELECT * FROM UniqueGoals;
    """
    goal_df = pd.read_sql(query, conn)
    return goal_df

# Convert string time to continuous time value (float of minutes)
def convert_to_continuous_time(row):
    """
    Converts period-based time to a continuous format (0-65 minutes).
    """
    period_offsets = {'1st Period': 0, '2nd Period': 20, '3rd Period': 40, 'Overtime': 60}
    minutes, seconds = map(int, row['Time'].split(':'))
    offset = period_offsets.get(row['Period'], 0)
    return offset + minutes + seconds / 60.0

## Extract the goal summary data
goal_df = extract_goal_summary(conn)
# Create a continuous time column
goal_df['continuous_time'] = goal_df.apply(convert_to_continuous_time, axis=1)

#### Check the data table
# goal_df.head()

In [43]:
############ GOALTENDER STATS ############
def extract_goalie_stats(conn):
    """
    Extracts and preprocesses the goalie stats data from the database.
    """
    query = """
        SELECT * FROM goalie_stats;
    """
    goalie_df = pd.read_sql(query, conn)
    return goalie_df

# Extract the goalie stats data
goalie_df = extract_goalie_stats(conn)

# Check the data table
# goalie_df.head()

#### Identify Games with Late Game Empty Net Situations

In [44]:
def evaluate_late_game_with_shutouts(goal_df, goalie_df):
# def evaluate_late_game_with_goalie_data(goal_df, goalie_df):
    """
    Ensure all teams in each game are explicitly represented by incorporating goalie data.
    
    Args:
        goal_df (pd.DataFrame): DataFrame containing game scoring data.
        goalie_df (pd.DataFrame): DataFrame containing game goalie data.
        
    Returns:
        pd.DataFrame: New DataFrame with columns ['Game_ID', 'Team', 'EN_likely', 'total_goals_regulation'].
    """
    # Initialize the result list
    result = []
    
    # Extract all unique games and their teams from the goalie_df
    all_games_teams = goalie_df.groupby('Game_ID')['Team'].unique()
    
    for game_id, teams_in_game in all_games_teams.items():
        # Initialize scores and likelihood for all teams in this game
        team_scores = {team: 0 for team in teams_in_game}
        en_likely = {team: False for team in teams_in_game}
        
        # Filter data for this game and exclude overtime goals
        regulation_group = goal_df[
            (goal_df['Game_ID'] == game_id) & (goal_df['continuous_time'] <= 60)
        ]
        
        # Update scores based on regulation goals
        for _, row in regulation_group.iterrows():
            team = row['Team']
            team_scores[team] += 1
        
        # Evaluate the final state of the game (end of regulation)
        for trailing_team, trailing_score in team_scores.items():
            for other_team, other_score in team_scores.items():
                if trailing_team != other_team:
                    score_diff = trailing_score - other_score
                    if score_diff < 0 and abs(score_diff) <= 3:  # Trailing by 1-3 goals
                        en_likely[trailing_team] = True
        
        # Append results for all teams in the game
        for team in teams_in_game:
            result.append({
                'Game_ID': game_id,
                'Team': team,
                'EN_likely': en_likely[team],
                'total_goals_regulation': team_scores.get(team, 0)
            })
    
    # Convert results to a DataFrame
    return pd.DataFrame(result)

# Apply the updated function to classify empty net scenarios with shutout handling
final_empty_net_scenarios = evaluate_late_game_with_shutouts(goal_df, goalie_df)

# Check for teams with zero total_goals_regulation in the results table
shutout_teams = final_empty_net_scenarios[
    final_empty_net_scenarios['total_goals_regulation'] == 0
]

# Display results to verify if shutout teams are included
shutout_teams.head()

Unnamed: 0,Game_ID,Team,EN_likely,total_goals_regulation
12,2024-10-05-American Int'l-Maine,American Intl,False,0
47,2024-10-06-Penn State-Alaska,Alaska,False,0
51,2024-10-06-St. Cloud State-St. Thomas,St Thomas,True,0
63,2024-10-11-Boston College-Michigan State,Michigan State,True,0
70,2024-10-11-Long Island-Augustana,Long Island,False,0


In [45]:
#### Verification using Notre Dame as an example - I know they were shut out twice (as of 1-13-24) including diring their game on 1-5 vs Penn State


### Seearch for instaces of Notre Dame in the data
final_empty_net_scenarios[final_empty_net_scenarios['Team'] == 'Notre Dame']

## Search Game_IDs containing Notre Dame
notre_dame_game_ids = final_empty_net_scenarios[final_empty_net_scenarios['Team'] == 'Notre Dame']['Game_ID'].unique()
print(notre_dame_game_ids)
print(len(notre_dame_game_ids))



['2024-10-11-Notre Dame-St. Lawrence' '2024-10-12-Notre Dame-Clarkson'
 '2024-10-18-Alaska-Notre Dame' '2024-10-19-Alaska-Notre Dame'
 '2024-10-25-Long Island-Notre Dame' '2024-10-26-Long Island-Notre Dame'
 '2024-11-01-Wisconsin-Notre Dame' '2024-11-02-Wisconsin-Notre Dame'
 '2024-11-08-Notre Dame-Michigan' '2024-11-09-Notre Dame-Michigan'
 '2024-11-15-Notre Dame-Michigan State'
 '2024-11-16-Notre Dame-Michigan State' '2024-11-22-Minnesota-Notre Dame'
 '2024-11-23-Minnesota-Notre Dame' '2024-11-29-Harvard-Notre Dame'
 '2024-11-30-Notre Dame-Boston University'
 '2024-12-13-Notre Dame-Ohio State' '2024-12-14-Notre Dame-Ohio State'
 '2025-01-03-Notre Dame-Penn State' '2025-01-05-Penn State-Notre Dame'
 '2025-01-10-Michigan-Notre Dame' '2025-01-11-Michigan-Notre Dame']
22


#### Evaluate the Goalie_df to identify when goalies were pulled in late game (leaving out delayed penalty pulls)

In [46]:
### Count the instances where a team pulled their goalie in a late game scenario
def count_actual_late_game_pulls(goalie_df, en_likely_df):
    """
    Count instances of actual late-game goalie pulls using EN_likely data to filter out delayed penalties.
    
    Args:
        goalie_df (pd.DataFrame): DataFrame containing goalie data with EMPTY NET records.
        en_likely_df (pd.DataFrame): DataFrame containing EN_likely information for each team and game.
        
    Returns:
        pd.DataFrame: Summary of actual late-game goalie pulls.
    """
    # Merge goalie_df with EN_likely data
    merged_df = goalie_df.merge(
        en_likely_df[['Game_ID', 'Team', 'EN_likely']],
        left_on=['Game_ID', 'Team'],
        right_on=['Game_ID', 'Team'],
        how='left'
    )
    
    # Filter for actual late-game pulls
    # Opponent's EN_likely must be True, and "Goalie" must be "EMPTY NET"
    actual_pulls = merged_df[
        (merged_df['Goalie'] == 'EMPTY NET') & (merged_df['EN_likely'] == True)
    ]
    
    # Count occurrences by team and game
    pulls_count = actual_pulls.groupby(['Game_ID', 'Team']).size().reset_index(name='Pulls_Count')
    
    return pulls_count

# Apply the function to count actual late-game pulls
actual_late_game_pulls = count_actual_late_game_pulls(goalie_df, final_empty_net_scenarios)

## Display the results
actual_late_game_pulls.head()



def aggregate_goalie_pulls_with_opponents(pulls_df, en_likely_df):
    """
    Create an aggregated table showing how many times each team pulled their goalie and how many times
    their opponents pulled their goalie, using EN_likely data for opponent inference.
    
    Args:
        pulls_df (pd.DataFrame): DataFrame with actual late-game goalie pulls per team and game.
        en_likely_df (pd.DataFrame): DataFrame containing EN_likely information for each team and game.
        
    Returns:
        pd.DataFrame: Aggregated table with team-level counts.
    """
    # Count the number of pulls for each team
    team_pulls = pulls_df.groupby('Team')['Pulls_Count'].sum().reset_index()
    team_pulls.rename(columns={'Pulls_Count': 'Times_Pulled_Goalie'}, inplace=True)
    
    # Infer opponents using the EN_likely data
    # Create a mapping of Game_ID -> Teams
    game_teams = en_likely_df.groupby('Game_ID')['Team'].apply(list).to_dict()
    
    # Initialize opponent pull counts
    opponent_pulls = {team: 0 for team in team_pulls['Team']}
    
    for _, row in pulls_df.iterrows():
        game_id = row['Game_ID']
        team = row['Team']
        pull_count = row['Pulls_Count']
        
        # Find opponent(s) for the game
        opponents = [t for t in game_teams[game_id] if t != team]
        for opponent in opponents:
            if opponent in opponent_pulls:
                opponent_pulls[opponent] += pull_count  # Add pull count to opponent
    
    # Add opponent pull counts to the team table
    team_pulls['Opponent_Pulled_Goalie'] = team_pulls['Team'].map(opponent_pulls)
    
    return team_pulls

# Apply the updated function with enhanced opponent inference logic
aggregated_goalie_pulls_with_opponents = aggregate_goalie_pulls_with_opponents(
    actual_late_game_pulls, final_empty_net_scenarios
)

# Display the updated aggregated table for verification
aggregated_goalie_pulls_with_opponents.head()

# Check Michigan State for verification
aggregated_goalie_pulls_with_opponents[aggregated_goalie_pulls_with_opponents['Team'] == 'Michigan State']


Unnamed: 0,Team,Times_Pulled_Goalie,Opponent_Pulled_Goalie
34,Michigan State,1,10


#### Aggrigate the total amount of EMPTY NET time and add to the results table

In [47]:
### Helper Function to normalize the time format (datetime freaks out when minute values are 60 or greater)

def normalize_minutes_column(minutes_series):
    """
    Normalize the Minutes column to a consistent timedelta format.
    
    Args:
        minutes_series (pd.Series): Series containing time strings.
        
    Returns:
        pd.Series: Series with time converted to timedelta.
    """
    normalized_minutes = []
    
    for time_str in minutes_series:
        try:
            # Handle "mm:ss" or "hh:mm:ss" formats
            if ':' in time_str:
                parts = time_str.split(':')
                if len(parts) == 2:  # mm:ss
                    normalized_minutes.append(f"00:{time_str}")
                elif len(parts) == 3:  # hh:mm:ss
                    normalized_minutes.append(time_str)
                else:
                    normalized_minutes.append("00:00:00")
            else:
                normalized_minutes.append("00:00:00")
        except Exception:
            normalized_minutes.append("00:00:00")
    
    # Convert to timedelta
    return pd.to_timedelta(normalized_minutes, errors='coerce')

# Normalize the Minutes column in goalie_df
goalie_df['Minutes'] = normalize_minutes_column(goalie_df['Minutes'])

# Verify the conversion worked without errors
goalie_df['Minutes'].head()



0   0 days 01:00:39
1   0 days 01:00:39
2   0 days 01:00:00
3   0 days 01:00:00
4   0 days 01:00:00
Name: Minutes, dtype: timedelta64[ns]

In [48]:
def add_pulled_goalie_times(aggregated_df, goalie_df, en_likely_df):
    """
    Add total time teams have had their goalie pulled and the total time their opponents
    have had their goalie pulled to the aggregated table.
    
    Args:
        aggregated_df (pd.DataFrame): Aggregated table with pull counts for teams and opponents.
        goalie_df (pd.DataFrame): DataFrame containing goalie data with EMPTY NET records.
        en_likely_df (pd.DataFrame): DataFrame containing EN_likely information for each team and game.
        
    Returns:
        pd.DataFrame: Updated aggregated table with time data.
    """
    # Calculate total time with goalie pulled for each team
    goalie_df['Minutes'] = pd.to_timedelta(goalie_df['Minutes'])  # Convert time to timedelta
    empty_net_time = goalie_df[goalie_df['Goalie'] == 'EMPTY NET']
    team_times = empty_net_time.groupby('Team')['Minutes'].sum().reset_index()
    team_times.rename(columns={'Minutes': 'Total_Time_Pulled'}, inplace=True)
    
    # Merge team times into the aggregated table
    aggregated_df = aggregated_df.merge(team_times, on='Team', how='left')
    aggregated_df['Total_Time_Pulled'] = aggregated_df['Total_Time_Pulled'].fillna(pd.Timedelta(0))
    
    # Calculate opponent times
    game_teams = en_likely_df.groupby('Game_ID')['Team'].apply(list).to_dict()
    opponent_times = {team: pd.Timedelta(0) for team in aggregated_df['Team']}
    
    for _, row in empty_net_time.iterrows():
        game_id = row['Game_ID']
        team = row['Team']
        time = row['Minutes']
        
        # Find opponent(s) for the game
        opponents = [t for t in game_teams[game_id] if t != team]
        for opponent in opponents:
            if opponent in opponent_times:
                opponent_times[opponent] += time
    
    # Add opponent times to the aggregated table
    aggregated_df['Opponent_Time_Pulled'] = aggregated_df['Team'].map(opponent_times)
    
    return aggregated_df

# Apply the function to calculate pulled goalie times
aggregated_with_times = add_pulled_goalie_times(
    aggregated_goalie_pulls_with_opponents, goalie_df, final_empty_net_scenarios
)
# Simplify the time format for display (Total_Time_Pulled and Opponent_Time_Pulled columns)
aggregated_with_times['Total_Time_Pulled'] = aggregated_with_times['Total_Time_Pulled'].dt.total_seconds()
aggregated_with_times['Opponent_Time_Pulled'] = aggregated_with_times['Opponent_Time_Pulled'].dt.total_seconds()
# Rename Time columns to reflect they are in seconds
aggregated_with_times.rename(columns={
    'Total_Time_Pulled': 'Total_Time_Pulled_Seconds',
    'Opponent_Time_Pulled': 'Opponent_Time_Pulled_Seconds'
}, inplace=True)



# Display the updated aggregated table with time data
aggregated_with_times.head()


Unnamed: 0,Team,Times_Pulled_Goalie,Opponent_Pulled_Goalie,Total_Time_Pulled_Seconds,Opponent_Time_Pulled_Seconds
0,Air Force,6,9,1265.0,1094.0
1,Alaska,4,5,1053.0,800.0
2,Alaska Anchorage,9,4,1416.0,848.0
3,American Intl,9,4,996.0,527.0
4,Arizona State,5,7,884.0,1164.0


#### Tally Team Empty Net, Extra Attacker goals each team has scored and allowed

In [49]:
## Load the scoring summary data into a dataframe for further analysis
def extract_scoring_summary(conn):
    """
    Extracts and preprocesses the scoring summary data from the database.
    """
    query = """
        SELECT * FROM scoring_summary;
    """
    scoring_df = pd.read_sql(query, conn)
    return scoring_df

# Extract the scoring summary data
scoring_df = extract_scoring_summary(conn)

# Display the scoring summary data
# scoring_df.head()


##### Process the scoring_df to tally EN goals for/ against and EA goals for/against

In [50]:
def calculate_goals_conditions(scoring_df):
# def calculate_goals_conditions_fixed(scoring_df):
    """
    Calculate EN and EA goals for and against each team.
    
    Args:
        scoring_df (pd.DataFrame): DataFrame containing scoring data.
        
    Returns:
        pd.DataFrame: DataFrame with counts of EN and EA goals for and against each team.
    """
    # Normalize PP column: Split multi-condition codes (e.g., "PP,EN") and explode them
    scoring_df['PP'] = scoring_df['PP'].fillna('')
    scoring_df['PP_Split'] = scoring_df['PP'].str.split(',')
    scoring_exploded = scoring_df.explode('PP_Split')
    
    # Create Opponent column
    scoring_exploded['Opponent'] = scoring_exploded.apply(
        lambda row: row['Away_Team'] if row['Team'] == row['Home_Team'] else row['Home_Team'],
        axis=1
    )
    
    # Filter for EN (Empty Net) and EA (Extra Attacker) goals
    en_goals = scoring_exploded[scoring_exploded['PP_Split'] == 'EN']
    ea_goals = scoring_exploded[scoring_exploded['PP_Split'] == 'EA']
    
    # Calculate EN and EA goals for each team
    en_goals_for = en_goals.groupby('Team').size().reset_index(name='EN_goals_for')
    ea_goals_for = ea_goals.groupby('Team').size().reset_index(name='EA_goals_for')
    
    # Calculate EN and EA goals against
    en_goals_against = en_goals.groupby('Opponent').size().reset_index(name='EN_goals_against')
    ea_goals_against = ea_goals.groupby('Opponent').size().reset_index(name='EA_goals_against')
    
    # Combine all counts into a single table
    goals_summary = (
        en_goals_for
        .merge(ea_goals_for, on='Team', how='outer')
        .merge(en_goals_against, left_on='Team', right_on='Opponent', how='outer')
        .merge(ea_goals_against, left_on='Team', right_on='Opponent', how='outer')
        .fillna(0)
    )
    
    # Drop redundant opponent columns and ensure data types are integers
    goals_summary = goals_summary.drop(columns=['Opponent_x', 'Opponent_y'])
    goals_summary[['EN_goals_for', 'EA_goals_for', 'EN_goals_against', 'EA_goals_against']] = (
        goals_summary[['EN_goals_for', 'EA_goals_for', 'EN_goals_against', 'EA_goals_against']]
        .astype(int)
    )
    
    return goals_summary

# Calculate EN and EA goals for and against each team
goals_summary = calculate_goals_conditions(scoring_df)

# Display the resulting summary for verification
goals_summary.head()


Unnamed: 0,Team,EN_goals_for,EA_goals_for,EN_goals_against,EA_goals_against
0,Air Force,3,2,5,2
1,Alaska,2,3,5,2
2,Alaska Anchorage,6,2,2,3
3,American Intl,1,1,4,2
4,Arizona State,4,4,3,2


In [51]:
### TEMP CLEANING STEP TO REMOVE THE Team: 0 ROWS and try to figure out what is going on with that

def clean_up_goals_summary(goals_summary):
    """
    Clean up the goals summary by removing rows with invalid or missing teams.
    
    Args:
        goals_summary (pd.DataFrame): DataFrame containing goals summary data.
        
    Returns:
        pd.DataFrame: Cleaned DataFrame with valid team rows only.
    """
    # Remove rows where Team is missing or invalid
    cleaned_summary = goals_summary[goals_summary['Team'] != 0].copy()
    
    return cleaned_summary

# Clean up the goals summary
goals_summary_cleaned = clean_up_goals_summary(goals_summary)

# Display the cleaned summary for verification
goals_summary_cleaned.head()

Unnamed: 0,Team,EN_goals_for,EA_goals_for,EN_goals_against,EA_goals_against
0,Air Force,3,2,5,2
1,Alaska,2,3,5,2
2,Alaska Anchorage,6,2,2,3
3,American Intl,1,1,4,2
4,Arizona State,4,4,3,2


#### Merge the EN/EA goal data in with the rest of the results
- NOTE as of 1-13-24 still have not done verification of the EN/EA goals for/against tallys

In [52]:
## Merge the goals_summary_cleaned with the aggregated_with_times table to get a final table for analysis

def merge_goals_with_aggregated(goals_summary, aggregated_df):
    """
    Merge the goals summary data with the aggregated goalie pull data.
    
    Args:
        goals_summary (pd.DataFrame): DataFrame containing goals summary data.
        aggregated_df (pd.DataFrame): DataFrame containing aggregated goalie pull data.
        
    Returns:
        pd.DataFrame: Merged DataFrame with goalie pull and goal data.
    """
    # Merge the two tables
    merged_df = aggregated_df.merge(goals_summary, on='Team', how='left')
    merged_df = merged_df.fillna(0)  # Fill missing values with 0
    
    return merged_df

# Merge the goals summary with the aggregated table
final_analysis_table = merge_goals_with_aggregated(goals_summary_cleaned, aggregated_with_times)

# Display the final analysis table for verification
final_analysis_table.head(10)

Unnamed: 0,Team,Times_Pulled_Goalie,Opponent_Pulled_Goalie,Total_Time_Pulled_Seconds,Opponent_Time_Pulled_Seconds,EN_goals_for,EA_goals_for,EN_goals_against,EA_goals_against
0,Air Force,6,9,1265.0,1094.0,3.0,2.0,5.0,2.0
1,Alaska,4,5,1053.0,800.0,2.0,3.0,5.0,2.0
2,Alaska Anchorage,9,4,1416.0,848.0,6.0,2.0,2.0,3.0
3,American Intl,9,4,996.0,527.0,1.0,1.0,4.0,2.0
4,Arizona State,5,7,884.0,1164.0,4.0,4.0,3.0,2.0
5,Army,9,4,841.0,622.0,1.0,0.0,8.0,0.0
6,Augustana,5,7,737.0,1045.0,5.0,1.0,0.0,1.0
7,Bemidji State,7,5,1210.0,571.0,1.0,3.0,3.0,3.0
8,Bentley,5,11,640.0,1404.0,7.0,0.0,3.0,2.0
9,Boston College,3,10,418.0,726.0,6.0,2.0,2.0,1.0


##### Add RAW Count of Times_Pulled and Opp_Pulled
- not sure this info will be useful for anything except debugging and error checking

In [53]:
def add_raw_empty_net_counts(final_analysis_table, goalie_df):
    """
    Add raw counts of EMPTY NET occurrences for each team and their opponents.
    
    Args:
        final_analysis_table (pd.DataFrame): Aggregated table with analysis metrics.
        goalie_df (pd.DataFrame): DataFrame containing goalie data.
        
    Returns:
        pd.DataFrame: Updated table with raw counts of EMPTY NET occurrences.
    """
    # Count occurrences of EMPTY NET for each team
    empty_net_counts = goalie_df[goalie_df['Goalie'] == 'EMPTY NET'].groupby('Team').size().reset_index(name='Raw_Times_Pulled')
    
    # Map Game_ID to teams to infer opponents
    game_team_mapping = goalie_df.groupby('Game_ID')['Team'].apply(list).to_dict()
    
    # Calculate raw opponent pulls
    opponent_counts = {team: 0 for team in final_analysis_table['Team']}
    for _, row in goalie_df[goalie_df['Goalie'] == 'EMPTY NET'].iterrows():
        game_id = row['Game_ID']
        team = row['Team']
        opponents = [t for t in game_team_mapping.get(game_id, []) if t != team]
        for opponent in opponents:
            if opponent in opponent_counts:
                opponent_counts[opponent] += 1
    
    # Merge raw counts into the final analysis table
    final_analysis_table = final_analysis_table.merge(empty_net_counts, on='Team', how='left')
    final_analysis_table['Raw_Times_Pulled'] = final_analysis_table['Raw_Times_Pulled'].fillna(0).astype(int)
    final_analysis_table['Raw_Opponent_Pulled'] = final_analysis_table['Team'].map(opponent_counts).fillna(0).astype(int)
    
    return final_analysis_table

# Apply the function to add raw counts
final_analysis_table_with_raw_counts = add_raw_empty_net_counts(final_analysis_table, goalie_df)

## Display the final analysis table with raw counts for verification
final_analysis_table_with_raw_counts.head(10)

Unnamed: 0,Team,Times_Pulled_Goalie,Opponent_Pulled_Goalie,Total_Time_Pulled_Seconds,Opponent_Time_Pulled_Seconds,EN_goals_for,EA_goals_for,EN_goals_against,EA_goals_against,Raw_Times_Pulled,Raw_Opponent_Pulled
0,Air Force,6,9,1265.0,1094.0,3.0,2.0,5.0,2.0,12,14
1,Alaska,4,5,1053.0,800.0,2.0,3.0,5.0,2.0,10,13
2,Alaska Anchorage,9,4,1416.0,848.0,6.0,2.0,2.0,3.0,12,18
3,American Intl,9,4,996.0,527.0,1.0,1.0,4.0,2.0,12,9
4,Arizona State,5,7,884.0,1164.0,4.0,4.0,3.0,2.0,9,18
5,Army,9,4,841.0,622.0,1.0,0.0,8.0,0.0,14,17
6,Augustana,5,7,737.0,1045.0,5.0,1.0,0.0,1.0,11,18
7,Bemidji State,7,5,1210.0,571.0,1.0,3.0,3.0,3.0,11,15
8,Bentley,5,11,640.0,1404.0,7.0,0.0,3.0,2.0,11,17
9,Boston College,3,10,418.0,726.0,6.0,2.0,2.0,1.0,7,13


#### Use Linescore Table to create a team overall results profile

In [54]:
#### Extrtact the linescore table for further analysis

def extract_linescore(conn):
    """
    Extracts and preprocesses the linescore data from the database.
    """
    query = """
        SELECT * FROM linescore;
    """
    linescore_df = pd.read_sql(query, conn)
    return linescore_df

# Extract the linescore data
linescore_df = extract_linescore(conn)

# Display the linescore data
linescore_df.head()

Unnamed: 0,Team,goals1,goals2,goals3,goals4,goalsT,shots1,shots2,shots3,shots4,...,PPG,PPO,FOW,FOL,FOW%,goals5,goals6,shots5,shots6,Game_ID
0,Michigan State,1,0,0,1,2,6,7,25,1,...,0,3,15,30,33.333333,0,0,0,0,2024-10-04-Michigan State-Lake Superior
1,Lake Superior,1,0,0,0,1,10,11,5,0,...,0,4,30,15,66.666667,0,0,0,0,2024-10-04-Michigan State-Lake Superior
2,Minnesota State,0,2,3,0,5,4,11,8,0,...,0,3,21,40,34.42623,0,0,0,0,2024-10-04-Minnesota State-Michigan
3,Michigan,0,1,1,0,2,13,12,9,0,...,1,2,40,21,65.57377,0,0,0,0,2024-10-04-Minnesota State-Michigan
4,Arizona State,2,2,4,0,8,9,4,11,0,...,2,2,23,36,38.983051,0,0,0,0,2024-10-04-Arizona State-Air Force


In [55]:
def calculate_team_stats_fixed(linescore_df):
    """
    Calculate seasonal performance metrics for each team from linescore data.
    
    Args:
        linescore_df (pd.DataFrame): DataFrame containing linescore data.
        
    Returns:
        pd.DataFrame: DataFrame summarizing team performance metrics.
    """
    # Add home/away indicator based on Game_ID
    linescore_df['Home_Team'] = linescore_df['Game_ID'].str.split('-').str[2]
    linescore_df['Away_Team'] = linescore_df['Game_ID'].str.split('-').str[3]
    linescore_df['is_home'] = linescore_df['Team'] == linescore_df['Home_Team']
    
    # Aggregate team stats
    team_stats = linescore_df.groupby('Team').agg(
        total_goals_scored=('goalsT', 'sum'),
        total_shots=('shots1', 'sum'),
        total_powerplay_goals=('PPG', 'sum'),
        total_powerplay_opportunities=('PPO', 'sum'),
        total_faceoffs_won=('FOW', 'sum'),
        total_faceoffs_lost=('FOL', 'sum'),
        total_games=('Game_ID', 'count')
    ).reset_index()
    
    # Calculate goals conceded by summing opponents' goals for each game
    linescore_df['goals_conceded'] = linescore_df.apply(
        lambda row: linescore_df.loc[
            (linescore_df['Game_ID'] == row['Game_ID']) & (linescore_df['Team'] != row['Team']), 'goalsT'
        ].sum(),
        axis=1
    )
    total_goals_conceded = linescore_df.groupby('Team')['goals_conceded'].sum().reset_index(name='total_goals_conceded')
    
    # Merge goals conceded into the main team stats
    team_stats = team_stats.merge(total_goals_conceded, on='Team', how='left')
    
    # Calculate additional metrics
    team_stats['powerplay_efficiency'] = (
        team_stats['total_powerplay_goals'] / team_stats['total_powerplay_opportunities']
    ).fillna(0) * 100
    team_stats['faceoff_win_percentage'] = (
        team_stats['total_faceoffs_won'] / 
        (team_stats['total_faceoffs_won'] + team_stats['total_faceoffs_lost'])
    ).fillna(0) * 100

    return team_stats


# Apply the corrected function to calculate seasonal stats
team_seasonal_stats_fixed = calculate_team_stats_fixed(linescore_df)

# Display the corrected seasonal team stats
team_seasonal_stats_fixed.head()


Unnamed: 0,Team,total_goals_scored,total_shots,total_powerplay_goals,total_powerplay_opportunities,total_faceoffs_won,total_faceoffs_lost,total_games,total_goals_conceded,powerplay_efficiency,faceoff_win_percentage
0,Air Force,44,215,11,72,732,680,24,62,15.277778,51.84136
1,Alaska,41,165,7,68,512,578,20,54,10.294118,46.972477
2,Alaska Anchorage,57,162,12,75,673,819,25,82,16.0,45.107239
3,American Intl,51,205,13,61,626,672,21,66,21.311475,48.228043
4,Arizona State,65,211,14,62,557,577,20,49,22.580645,49.118166


In [56]:
def calculate_team_stats_with_periods(linescore_df):
    """
    Calculate seasonal performance metrics, including goals scored and conceded per period.
    
    Args:
        linescore_df (pd.DataFrame): DataFrame containing linescore data.
        
    Returns:
        pd.DataFrame: DataFrame summarizing team performance metrics, including period stats.
    """
    # Add home/away indicator based on Game_ID
    linescore_df['Home_Team'] = linescore_df['Game_ID'].str.split('-').str[2]
    linescore_df['Away_Team'] = linescore_df['Game_ID'].str.split('-').str[3]
    linescore_df['is_home'] = linescore_df['Team'] == linescore_df['Home_Team']
    
    # Aggregate team stats
    team_stats = linescore_df.groupby('Team').agg(
        total_goals_scored=('goalsT', 'sum'),
        total_shots=('shots1', 'sum'),
        total_powerplay_goals=('PPG', 'sum'),
        total_powerplay_opportunities=('PPO', 'sum'),
        total_faceoffs_won=('FOW', 'sum'),
        total_faceoffs_lost=('FOL', 'sum'),
        total_games=('Game_ID', 'count'),
        goals_scored_1=('goals1', 'sum'),
        goals_scored_2=('goals2', 'sum'),
        goals_scored_3=('goals3', 'sum'),
        goals_scored_4=('goals4', 'sum')
    ).reset_index()
    
    # Calculate goals conceded per period
    for period in ['1', '2', '3', '4']:
        period_col = f'goals{period}'
        linescore_df[f'goals_conceded_{period}'] = linescore_df.apply(
            lambda row: linescore_df.loc[
                (linescore_df['Game_ID'] == row['Game_ID']) & (linescore_df['Team'] != row['Team']),
                period_col
            ].sum(),
            axis=1
        )
    
    # Aggregate goals conceded stats
    goals_conceded = linescore_df.groupby('Team').agg(
        goals_conceded_1=('goals_conceded_1', 'sum'),
        goals_conceded_2=('goals_conceded_2', 'sum'),
        goals_conceded_3=('goals_conceded_3', 'sum'),
        goals_conceded_4=('goals_conceded_4', 'sum')
    ).reset_index()
    

    # Aggregate shots taken per period
    team_stats['shots_1'] = linescore_df.groupby('Team')['shots1'].sum().values
    team_stats['shots_2'] = linescore_df.groupby('Team')['shots2'].sum().values
    team_stats['shots_3'] = linescore_df.groupby('Team')['shots3'].sum().values
    team_stats['shots_4'] = linescore_df.groupby('Team')['shots4'].sum().values

    # Calculate shots conceded per period
    for period in ['1', '2', '3', '4']:
        shots_col = f'shots{period}'
        linescore_df[f'shots_conceded_{period}'] = linescore_df.apply(
            lambda row: linescore_df.loc[
                (linescore_df['Game_ID'] == row['Game_ID']) & (linescore_df['Team'] != row['Team']),
                shots_col
            ].sum(),
            axis=1
        )

    # Aggregate shots conceded stats
    shots_conceded = linescore_df.groupby('Team').agg(
        shots_conceded_1=('shots_conceded_1', 'sum'),
        shots_conceded_2=('shots_conceded_2', 'sum'),
        shots_conceded_3=('shots_conceded_3', 'sum'),
        shots_conceded_4=('shots_conceded_4', 'sum')
    ).reset_index()

    # Merge shots conceded stats into team stats
    team_stats = team_stats.merge(shots_conceded, on='Team', how='left')

    # Merge goals conceded stats into team stats
    team_stats = team_stats.merge(goals_conceded, on='Team', how='left')
    
    # Calculate additional metrics
    team_stats['powerplay_efficiency'] = (
        team_stats['total_powerplay_goals'] / team_stats['total_powerplay_opportunities']
    ).fillna(0) * 100
    team_stats['faceoff_win_percentage'] = (
        team_stats['total_faceoffs_won'] / 
        (team_stats['total_faceoffs_won'] + team_stats['total_faceoffs_lost'])
    ).fillna(0) * 100

    return team_stats


# Apply the extended function to calculate seasonal stats with periods
team_seasonal_stats_with_periods = calculate_team_stats_with_periods(linescore_df)

# Display the updated seasonal stats with period metrics
team_seasonal_stats_with_periods.head()


Unnamed: 0,Team,total_goals_scored,total_shots,total_powerplay_goals,total_powerplay_opportunities,total_faceoffs_won,total_faceoffs_lost,total_games,goals_scored_1,goals_scored_2,...,shots_conceded_1,shots_conceded_2,shots_conceded_3,shots_conceded_4,goals_conceded_1,goals_conceded_2,goals_conceded_3,goals_conceded_4,powerplay_efficiency,faceoff_win_percentage
0,Air Force,44,215,11,72,732,680,24,13,11,...,223,226,220,4,14,20,26,2,15.277778,51.84136
1,Alaska,41,165,7,68,512,578,20,12,13,...,165,213,176,18,15,21,16,2,10.294118,46.972477
2,Alaska Anchorage,57,162,12,75,673,819,25,16,20,...,235,216,232,16,23,33,23,3,16.0,45.107239
3,American Intl,51,205,13,61,626,672,21,10,17,...,235,204,226,8,21,20,23,2,21.311475,48.228043
4,Arizona State,65,211,14,62,557,577,20,14,18,...,161,201,203,11,11,14,21,3,22.580645,49.118166


#### Create table with Team Record

In [57]:
def calculate_team_records_corrected(linescore_df):
    """
    Calculate win, loss, and tie records for each team, including home and away records.
    
    Args:
        linescore_df (pd.DataFrame): DataFrame containing linescore data.
        
    Returns:
        pd.DataFrame: DataFrame summarizing team records for the season.
    """
    # Identify home and away teams from Game_ID
    linescore_df['Home_Team'] = linescore_df['Game_ID'].str.split('-').str[3]
    linescore_df['Away_Team'] = linescore_df['Game_ID'].str.split('-').str[2]
    
    # Determine if the team is playing at home or away
    linescore_df['is_home'] = linescore_df['Team'] == linescore_df['Home_Team']
    
    # Determine game outcomes (Win, Loss, Tie) by comparing goals
    linescore_df['opponent_goals'] = linescore_df.apply(
        lambda row: linescore_df.loc[
            (linescore_df['Game_ID'] == row['Game_ID']) & (linescore_df['Team'] != row['Team']), 'goalsT'
        ].values[0],
        axis=1
    )
    linescore_df['result'] = linescore_df.apply(
        lambda row: 'Win' if row['goalsT'] > row['opponent_goals']
        else ('Loss' if row['goalsT'] < row['opponent_goals'] else 'Tie'),
        axis=1
    )
    
    # Separate home and away records
    home_records = linescore_df[linescore_df['is_home']].groupby('Team')['result'].value_counts().unstack(fill_value=0)
    home_records = home_records.rename(columns=lambda x: f"home_{x.lower()}").reset_index()
    
    away_records = linescore_df[~linescore_df['is_home']].groupby('Team')['result'].value_counts().unstack(fill_value=0)
    away_records = away_records.rename(columns=lambda x: f"away_{x.lower()}").reset_index()
    
    # Overall records
    overall_records = linescore_df.groupby('Team')['result'].value_counts().unstack(fill_value=0)
    overall_records = overall_records.rename(columns=lambda x: f"overall_{x.lower()}").reset_index()
    
    # Merge all records into a single table
    records_summary = overall_records.merge(home_records, on='Team', how='left').merge(away_records, on='Team', how='left')
    
    # Fill NaN values for numeric columns only
    numeric_columns = records_summary.select_dtypes(include=['number']).columns
    records_summary[numeric_columns] = records_summary[numeric_columns].fillna(0).astype(int)
    
    return records_summary


# Apply the corrected function to calculate team records
team_records_corrected = calculate_team_records_corrected(linescore_df)

## Reorder columns for better readability ### Overall: W - L -T, Home: W - L - T, Away: W - L - T
team_records_final = team_records_corrected[['Team', 'overall_win', 'overall_loss', 'overall_tie', 
                'home_win', 'home_loss', 'home_tie', 
                'away_win', 'away_loss', 'away_tie']]


# Display the resulting team records for verification
team_records_final.tail()


result,Team,overall_win,overall_loss,overall_tie,home_win,home_loss,home_tie,away_win,away_loss,away_tie
59,Union,12,8,1,7,4,0,5,4,1
60,Vermont,8,10,3,5,4,1,3,6,2
61,Western Michigan,13,4,1,5,3,0,8,1,1
62,Wisconsin,9,12,1,4,5,0,5,7,1
63,Yale,4,10,1,3,4,1,1,6,0


In [58]:
### Look at Michigan State for verification
team_records_final[team_records_final['Team'] == 'Michigan State']

result,Team,overall_win,overall_loss,overall_tie,home_win,home_loss,home_tie,away_win,away_loss,away_tie
34,Michigan State,18,2,2,9,1,2,9,1,0


# HOTFIX - 1-16-25 
- home and away labels are fliped - need to address above

- for now just renaming columns

In [59]:
#### HOTFIX 1-16-25

## Rename the home and away recprd columns to correct the misslabeling issue from above
## home_win ->away_win, away_win -> home_win, etc.

team_records_final.rename(columns={
    'home_win': 'away_win',
    'home_loss': 'away_loss',
    'home_tie': 'away_tie',
    'away_win': 'home_win',
    'away_loss': 'home_loss',
    'away_tie': 'home_tie'
}, inplace=True)

# Reorder columns for better readability ### Overall: W - L -T, Home: W - L - T, Away: W - L - T
team_records_final = team_records_final[['Team', 'overall_win', 'overall_loss', 'overall_tie', 
                'home_win', 'home_loss', 'home_tie', 
                'away_win', 'away_loss', 'away_tie']]

In [60]:
### Look at Michigan State for verification
team_records_final[team_records_final['Team'] == 'Michigan State']

result,Team,overall_win,overall_loss,overall_tie,home_win,home_loss,home_tie,away_win,away_loss,away_tie
34,Michigan State,18,2,2,9,1,0,9,1,2


### END OF DATA TRANFORMATION - START ANALYSIS

In [61]:
# Relevent tables for analysis

# final_analysis_table_with_raw_counts

# team_seasonal_stats_with_periods

team_records_final

result,Team,overall_win,overall_loss,overall_tie,home_win,home_loss,home_tie,away_win,away_loss,away_tie
0,Air Force,11,12,1,9,6,0,2,6,1
1,Alaska,7,9,4,2,4,0,5,5,4
2,Alaska Anchorage,5,17,3,1,8,2,4,9,1
3,American Intl,7,13,1,7,13,1,0,0,0
4,Arizona State,11,8,1,6,3,1,5,5,0
...,...,...,...,...,...,...,...,...,...,...
59,Union,12,8,1,5,4,1,7,4,0
60,Vermont,8,10,3,3,6,2,5,4,1
61,Western Michigan,13,4,1,8,1,1,5,3,0
62,Wisconsin,9,12,1,5,7,1,4,5,0
