# Empty Net Analysis
## Look at team performance when pulling their goalie and vs an empty net

### Challenge
- CHN data counts pulling the goalie during a delayed penalty and records that as time vs. an empty net
    - this skews the empty-net counts: almost every team shows at least a small amount of empty-net time in each game, even when the team did not actually pull their goalie in an end-of-game situation
- build a function that looks at the scoring summary for a game and makes a guess at whether a team was in a situation to pull their goalie for the end game
    - Conditions:
        - Down 1, 2, 3 goals within the last 5 minutes of the 3rd period
            - How to identify: for each game, keep a dictionary that tracks the score changes; if the conditions described above are met, tag the game as a possible late_game_EN

#### Setup - Paths and Dependencies

In [1]:
###### SETUP ######

# Dependencies
# Basics
import os
import sys
import time
import sqlite3
import pandas as pd
from collections import defaultdict


## File Paths
# Every path below is built relative to folder_prefix.
folder_prefix = ''
# folder_prefix = '..'
data_folder = os.path.join(folder_prefix, '..', 'data/')  # data directory
# Temp directory; TEMP_FOLDER is the same path under the name used in legacy code
temp_folder = TEMP_FOLDER = os.path.join(folder_prefix, '..', 'TEMP/')
output_folder = os.path.join(temp_folder, 'team_comp_output/')  # output directory

################ DATABASE PATH ####################
db_path = os.path.join(data_folder, 'db', '2025_Jan_13_CLEAN.db')  # SQLite database file

#### SCHOOL INFO FILE PATH ####
school_info_path = os.path.join(data_folder, 'arena_school_info.csv')  # school info CSV


###### Paths important for plotting - logos, etc. ######
### FILL IN IF NECESSARY ###

#### Connect to DB and extract necessary tables

In [2]:
## Open the SQLite database located at db_path.
# isolation_level=None puts the sqlite3 connection in autocommit mode, i.e. the
# module does not open transactions implicitly.
conn = sqlite3.connect(db_path, isolation_level=None)

######## SCORING / GOAL SUMMARY ########
## SQL query to fetch
def extract_goal_summary(conn):
    """
    Load the de-duplicated scoring summary into a DataFrame.

    Args:
        conn: Open database connection that exposes a `scoring_summary` table.

    Returns:
        pd.DataFrame: One row per distinct (Game_ID, Team, Period, Time, PP)
        combination found in `scoring_summary`.
    """
    # DISTINCT collapses rows that repeat the same goal on the selected columns.
    unique_goals_sql = """
        WITH UniqueGoals AS (
        SELECT DISTINCT Game_ID, Team, Period, Time, PP
        FROM scoring_summary
    )
    SELECT * FROM UniqueGoals;
    """
    return pd.read_sql(unique_goals_sql, conn)

# Convert string time to continuous time value (float of minutes)
def convert_to_continuous_time(row):
    """
    Turn a goal's period label and 'MM:SS' clock string into minutes since puck drop.

    Args:
        row: Mapping-like record with a 'Time' string ('MM:SS') and a 'Period' label.

    Returns:
        float: Period offset plus minutes plus seconds / 60. Period labels that
        are not in the lookup table get an offset of 0.
    """
    minute_part, second_part = (int(piece) for piece in row['Time'].split(':'))
    # Minutes already played before each recognised period begins.
    period_start = {
        '1st Period': 0,
        '2nd Period': 20,
        '3rd Period': 40,
        'Overtime': 60,
    }.get(row['Period'], 0)
    return period_start + minute_part + second_part / 60.0

## Pull the de-duplicated goal summary out of the database
goal_df = extract_goal_summary(conn)
# Add a 'continuous_time' column: each goal's time as minutes since the start of
# the game, computed row by row (axis=1) with convert_to_continuous_time
goal_df['continuous_time'] = goal_df.apply(convert_to_continuous_time, axis=1)

#### Check the data table
# goal_df.head()

In [3]:
############ GOALTENDER STATS ############
def extract_goalie_stats(conn):
    """
    Load the full `goalie_stats` table into a DataFrame.

    Args:
        conn: Open database connection that exposes a `goalie_stats` table.

    Returns:
        pd.DataFrame: Every row and column of `goalie_stats`, unmodified.
    """
    all_goalie_rows_sql = """
        SELECT * FROM goalie_stats;
    """
    return pd.read_sql(all_goalie_rows_sql, conn)

# Load the complete goalie_stats table from the database into a DataFrame
goalie_df = extract_goalie_stats(conn)

# Check the data table
# goalie_df.head()

#### Identify Games with Late Game Empty Net Situations

In [4]:
######### TRY 4
def _flag_trailing_teams(team_scores, en_likely, max_deficit):
    """Mark every team that currently trails some opponent by 1..max_deficit goals."""
    for team, goals in team_scores.items():
        for other_team, other_goals in team_scores.items():
            if other_team != team and 1 <= other_goals - goals <= max_deficit:
                en_likely[team] = True


def evaluate_late_game_scenarios(goal_df, *, window_start=55.0, regulation_end=60.0, max_deficit=3):
    """
    Flag teams that were plausibly in a pull-the-goalie situation late in regulation.

    A team is flagged when it trails an opponent by 1..max_deficit goals at any
    point between `window_start` and `regulation_end`. The score is checked at
    the moment the window opens and again after every goal scored inside it.
    This covers the case where nothing happens in the window (the score at its
    start is also the final score), and it also catches teams that a
    final-score-only check misses: a team that tied the game late, or one that
    fell further behind after conceding empty-net goals.

    Overtime goals (continuous_time > regulation_end) are ignored.

    NOTE: the team list is taken from the goals themselves, so a team with no
    regulation goals (e.g. the losing side of a shutout) does not appear in the
    output, and a game with no regulation goals yields no rows.

    Args:
        goal_df (pd.DataFrame): Scoring data with 'Game_ID', 'Team' and
            'continuous_time' (minutes since the start of the game) columns.
        window_start (float): Minute at which the late-game window opens.
        regulation_end (float): Last minute that still counts as regulation.
        max_deficit (int): Largest goal deficit still treated as an
            empty-net situation.

    Returns:
        pd.DataFrame: New DataFrame with columns
        ['Game_ID', 'Team', 'EN_likely', 'total_goals_regulation'].
    """
    columns = ['Game_ID', 'Team', 'EN_likely', 'total_goals_regulation']
    result = []

    # Process each game independently
    for game_id, group in goal_df.groupby('Game_ID'):
        # Drop overtime goals and order the rest chronologically; a stable sort
        # keeps the input order for goals that share a timestamp.
        regulation_group = group[group['continuous_time'] <= regulation_end].sort_values(
            by='continuous_time', kind='mergesort'
        )

        teams = regulation_group['Team'].unique()
        team_scores = {team: 0 for team in teams}
        en_likely = {team: False for team in teams}

        in_window = regulation_group['continuous_time'] >= window_start

        # Score at the moment the late-game window opens
        for team in regulation_group.loc[~in_window, 'Team']:
            team_scores[team] += 1
        _flag_trailing_teams(team_scores, en_likely, max_deficit)

        # Re-check after every goal inside the window so deficits that are later
        # erased (tying goal) or widened (empty-net goals against) still count
        for team in regulation_group.loc[in_window, 'Team']:
            team_scores[team] += 1
            _flag_trailing_teams(team_scores, en_likely, max_deficit)

        # One output row per team that scored in regulation
        for team, total_goals in team_scores.items():
            result.append({
                'Game_ID': game_id,
                'Team': team,
                'EN_likely': en_likely[team],
                'total_goals_regulation': total_goals
            })

    # Passing the columns explicitly keeps the schema stable for empty input
    return pd.DataFrame(result, columns=columns)


############# TRY 3

# def evaluate_late_game_scenarios(goal_df):
#     """
#     Refine the logic to exclude overtime goals from affecting empty-net likelihood.
    
#     Args:
#         goal_df (pd.DataFrame): DataFrame containing game scoring data.
        
#     Returns:
#         pd.DataFrame: New DataFrame with columns ['Game_ID', 'Team', 'EN_likely', 'total_goals'].
#     """
#     # Initialize the result list
#     result = []
    
#     # Group by Game_ID to process each game independently
#     grouped = goal_df.groupby('Game_ID')
    
#     for game_id, group in grouped:
#         # Filter out overtime goals (continuous_time > 60)
#         regulation_group = group[group['continuous_time'] <= 60].sort_values(by='continuous_time')
        
#         # Track running scores for each team
#         team_scores = {team: 0 for team in regulation_group['Team'].unique()}
#         en_likely = {team: False for team in regulation_group['Team'].unique()}
        
#         # Update scores based on regulation goals
#         for _, row in regulation_group.iterrows():
#             team = row['Team']
#             team_scores[team] += 1
        
#         # Evaluate the last 5 minutes of regulation for empty-net conditions
#         for _, row in regulation_group.iterrows():
#             if row['continuous_time'] >= 55:  # Last 5 minutes of regulation
#                 for trailing_team, trailing_score in team_scores.items():
#                     for other_team, other_score in team_scores.items():
#                         if trailing_team != other_team:
#                             score_diff = trailing_score - other_score
#                             if score_diff < 0 and abs(score_diff) <= 3:  # Trailing by 1-3 goals
#                                 en_likely[trailing_team] = True
        
#         # Append results for both teams
#         for team, total_goals in team_scores.items():
#             result.append({
#                 'Game_ID': game_id,
#                 'Team': team,
#                 'EN_likely': en_likely[team],
#                 'total_goals_regulation': total_goals
#             })
    
#     # Convert results to a DataFrame
#     return pd.DataFrame(result)

# Classify every (game, team) pair in goal_df with the current (TRY 4) logic
final_empty_net_scenarios = evaluate_late_game_scenarios(goal_df)

# Preview the last 10 rows of the result
final_empty_net_scenarios.tail(10)

Unnamed: 0,Game_ID,Team,EN_likely,total_goals_regulation
1205,2025-01-11-Robert Morris-Holy Cross,Holy Cross,False,2
1206,2025-01-11-Robert Morris-Holy Cross,Robert Morris,False,2
1207,2025-01-11-St. Cloud State-Minnesota Duluth,Minnesota Duluth,False,5
1208,2025-01-11-St. Cloud State-Minnesota Duluth,St Cloud State,True,2
1209,2025-01-11-Union-St. Lawrence,Union,False,8
1210,2025-01-11-Union-St. Lawrence,St Lawrence,True,5
1211,2025-01-11-Vermont-Boston University,Vermont,True,4
1212,2025-01-11-Vermont-Boston University,Boston University,False,7
1213,2025-01-11-Yale-Harvard,Harvard,False,3
1214,2025-01-11-Yale-Harvard,Yale,True,1


In [5]:
### Search for instances of Notre Dame in the data
# NOTE(review): this bare expression is not the last statement in the cell, so
# its result is presumably not displayed - confirm whether it is still needed.
final_empty_net_scenarios[final_empty_net_scenarios['Team'] == 'Notre Dame']

## Collect the distinct Game_IDs of rows whose Team is Notre Dame
notre_dame_game_ids = final_empty_net_scenarios[final_empty_net_scenarios['Team'] == 'Notre Dame']['Game_ID'].unique()
print(notre_dame_game_ids)
print(len(notre_dame_game_ids))



['2024-10-11-Notre Dame-St. Lawrence' '2024-10-12-Notre Dame-Clarkson'
 '2024-10-18-Alaska-Notre Dame' '2024-10-25-Long Island-Notre Dame'
 '2024-10-26-Long Island-Notre Dame' '2024-11-01-Wisconsin-Notre Dame'
 '2024-11-02-Wisconsin-Notre Dame' '2024-11-08-Notre Dame-Michigan'
 '2024-11-09-Notre Dame-Michigan' '2024-11-15-Notre Dame-Michigan State'
 '2024-11-16-Notre Dame-Michigan State' '2024-11-22-Minnesota-Notre Dame'
 '2024-11-23-Minnesota-Notre Dame' '2024-11-29-Harvard-Notre Dame'
 '2024-11-30-Notre Dame-Boston University'
 '2024-12-13-Notre Dame-Ohio State' '2024-12-14-Notre Dame-Ohio State'
 '2025-01-03-Notre Dame-Penn State' '2025-01-10-Michigan-Notre Dame'
 '2025-01-11-Michigan-Notre Dame']
20


In [6]:
######### TRY 2 - UNDER COUNTING #########

# def classify_empty_net_scenarios(goal_df):
#     """
#     Create a table identifying games and teams where pulling the goalie would be logical.
    
#     Args:
#         goal_df (pd.DataFrame): DataFrame containing game scoring data.
        
#     Returns:
#         pd.DataFrame: New DataFrame with columns ['Game_ID', 'Team', 'EN_likely'].
#     """
#     # Initialize the result list
#     result = []
    
#     # Group by Game_ID to process each game independently
#     grouped = goal_df.groupby('Game_ID')
    
#     for game_id, group in grouped:
#         # Sort by continuous_time for accurate score tracking
#         group = group.sort_values(by='continuous_time')
        
#         # Track running scores for each team
#         team_scores = {team: 0 for team in group['Team'].unique()}
#         en_likely = {team: False for team in group['Team'].unique()}
        
#         for _, row in group.iterrows():
#             team = row['Team']
#             team_scores[team] += 1  # Increment score for scoring team
            
#             # Check for potential empty-net situation (last 5 minutes)
#             if row['continuous_time'] >= 55:
#                 for other_team in team_scores:
#                     if other_team != team:
#                         score_diff = team_scores[team] - team_scores[other_team]
#                         if score_diff < 0 and abs(score_diff) <= 3:  # Trailing by 1-3 goals
#                             en_likely[other_team] = True
        
#         # Append results for both teams
#         for team in team_scores.keys():
#             result.append({'Game_ID': game_id, 'Team': team, 'EN_likely': en_likely[team]})
    
#     # Convert results to a DataFrame
#     return pd.DataFrame(result)

# # Apply the function to classify empty net scenarios
# empty_net_scenarios = classify_empty_net_scenarios(goal_df)

# # Check the data table
# empty_net_scenarios.head(25)

# # Value counts for empty net scenarios
# # empty_net_scenarios['EN_likely'].value_counts()



In [7]:
### FIRST ATTEMPT AT IDENTIFYING EMPTY NET SCENARIOS ###
## NOT SOUND LOGIC

# def identify_late_game_empty_net(goal_df):
#     """
#     Identify games where a trailing team might pull their goalie in the last 5 minutes of regulation.
    
#     Args:
#         goal_df (pd.DataFrame): DataFrame containing game scoring data.
        
#     Returns:
#         pd.DataFrame: Updated DataFrame with a new column 'late_game_EN' indicating potential empty net scenarios.
#     """
#     # Create a copy to avoid modifying the original DataFrame
#     df = goal_df.copy()
    
#     # Initialize a column to mark potential late-game empty net scenarios
#     df['late_game_EN'] = False
    
#     # Sort data for accurate scoring tracking
#     df = df.sort_values(by=['Game_ID', 'continuous_time']).reset_index(drop=True)
    
#     # Dictionary to track running scores
#     game_scores = {}

#     for idx, row in df.iterrows():
#         game_id = row['Game_ID']
#         team = row['Team']
#         time = row['continuous_time']
        
#         # Initialize game score tracking if encountering a new game
#         if game_id not in game_scores:
#             game_scores[game_id] = {}

#         # Initialize team score tracking if encountering a new team in the game
#         if team not in game_scores[game_id]:
#             game_scores[game_id][team] = 0

#         # Update the team's score
#         game_scores[game_id][team] += 1

#         # Check late-game condition (last 5 minutes of 3rd period or overtime)
#         if time >= 55:  # 55 minutes onwards is the final 5 minutes of regulation
#             # Calculate the score difference for each team
#             scores = game_scores[game_id]
#             for other_team, other_score in scores.items():
#                 if other_team != team:  # Compare with opposing team
#                     score_diff = other_score - scores[team]
#                     if score_diff > 0 and score_diff <= 3:  # Team is trailing by 1-3 goals
#                         df.at[idx, 'late_game_EN'] = True

#     return df

# # Apply the function to the goal_df
# goal_df_with_late_game_en = identify_late_game_empty_net(goal_df)

# # Check the data table
# goal_df_with_late_game_en.head(40)
# # goal_df_with_late_game_en.info()

# # # Value Count of late game empty net scenarios
# # goal_df_with_late_game_en['late_game_EN'].value_counts()