# Empty Net Analysis
## Look at team performance when pulling their goalie and vs an empty net

### Challenge
- CHN data counts pulling the goalie during a delayed penalty and records that as time vs empty net
    - this skews the count of empty net as almost every team has at least a small portion of time that shows as having an empty net each game when the team did not actually pull their goalie in an end game situation
- build a function that looks at the scoring summary for a game and makes a guess at whether a team was in a situation to pull their goalie for the end game
    - Conditions:
        - Down 1, 2, 3 goals within the last 5 minutes of the 3rd period
            - How to identify: keep a dictionary for each game that tracks the score changes; if the conditions described above are met, tag the game as a possible late_game_EN

#### Setup - Paths and Dependencies

In [1]:
###### SETUP ######

# Dependencies
# Basics
import os
import sys
import time
import sqlite3
import pandas as pd
from collections import defaultdict


## File Paths
# Every path below is relative and is built from folder_prefix; the
# commented-out assignment is the alternative prefix.
folder_prefix = ''
# folder_prefix = '..'

# Data and temp folders both sit one level above the prefix
data_folder, temp_folder = (
    os.path.join(folder_prefix, '..', leaf) for leaf in ('data/', 'TEMP/')
)
TEMP_FOLDER = temp_folder  # Same temp path under the name used in legacy code
output_folder = os.path.join(temp_folder, 'team_comp_output/')  # Output folder inside the temp folder

################ DATABASE PATH ####################
# SQLite database file inside <data_folder>/db
db_path = os.path.join(data_folder, 'db', '2025_Jan_13_CLEAN.db')

#### SCHOOL INFO FILE PATH ####
# CSV file located directly in the data folder
school_info_path = os.path.join(data_folder, 'arena_school_info.csv')


###### Paths important for plotting - logos, etc. ######
### FILL IN IF NECESSARY ###

#### Connect to DB and extract necessary tables

In [2]:
## Load the database
# Open the SQLite file at db_path. Passing isolation_level=None puts the
# sqlite3 connection in autocommit mode, so the module never opens an
# implicit transaction. The connection is kept open at module level and is
# handed to the extract_* helpers below.
conn = sqlite3.connect(db_path, isolation_level=None)

######## SCORING / GOAL SUMMARY ########
## SQL query to fetch
def extract_goal_summary(conn):
    """
    Read the de-duplicated goal records from the ``scoring_summary`` table.

    Only the columns Game_ID, Team, Period, Time and PP are selected, and
    ``SELECT DISTINCT`` collapses rows that are identical across those five
    columns into a single row.

    Args:
        conn: Open database connection that pandas can run the query against.

    Returns:
        pd.DataFrame: One row per distinct (Game_ID, Team, Period, Time, PP).
    """
    distinct_goals_sql = """
        WITH UniqueGoals AS (
        SELECT DISTINCT Game_ID, Team, Period, Time, PP
        FROM scoring_summary
    )
    SELECT * FROM UniqueGoals;
    """
    return pd.read_sql(sql=distinct_goals_sql, con=conn)

# Convert string time to continuous time value (float of minutes)
def convert_to_continuous_time(row):
    """
    Turn a row's period label and "MM:SS" clock into one float of minutes.

    Each successive period adds 20 minutes: '1st Period' starts at 0,
    '2nd Period' at 20, '3rd Period' at 40 and 'Overtime' at 60. Any other
    period label falls back to an offset of 0.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'Time' string
            in "MM:SS" form and a 'Period' label.

    Returns:
        float: Period offset plus minutes plus seconds / 60.
    """
    # Parse the clock first so a malformed 'Time' fails before the period lookup
    clock_minutes, clock_seconds = [int(piece) for piece in row['Time'].split(':')]

    period_labels = ('1st Period', '2nd Period', '3rd Period', 'Overtime')
    minutes_before_period = dict(zip(period_labels, range(0, 80, 20)))
    period_start = minutes_before_period.get(row['Period'], 0)

    return period_start + clock_minutes + clock_seconds / 60.0

## Extract the goal summary data
# Pull the de-duplicated goal rows through the open database connection.
goal_df = extract_goal_summary(conn)
# Create a continuous time column
# axis=1 applies the converter row by row, so each goal's Period/Time pair
# becomes a single float of minutes on a game-long scale.
goal_df['continuous_time'] = goal_df.apply(convert_to_continuous_time, axis=1)

#### Check the data table
# goal_df.head()

In [3]:
############ GOALTENDER STATS ############
def extract_goalie_stats(conn):
    """
    Read every row and column of the ``goalie_stats`` table.

    Args:
        conn: Open database connection that pandas can run the query against.

    Returns:
        pd.DataFrame: The full contents of ``goalie_stats``, unmodified.
    """
    all_goalie_rows_sql = """
        SELECT * FROM goalie_stats;
    """
    return pd.read_sql(sql=all_goalie_rows_sql, con=conn)

# Extract the goalie stats data
# Loads the whole goalie_stats table; the late-game evaluation below groups it
# by Game_ID to learn which teams took part in each game.
goalie_df = extract_goalie_stats(conn)

# Check the data table
# goalie_df.head()

#### Identify Games with Late Game Empty Net Situations

In [4]:
def evaluate_late_game_with_shutouts(goal_df, goalie_df, max_deficit=3):
    """
    Flag, for every team in every game, whether it ended regulation trailing
    by a margin small enough that pulling the goalie was plausible.

    The list of games and participating teams is taken from ``goalie_df`` so
    that teams held scoreless (which never appear in ``goal_df``) still get a
    row. Goals with ``continuous_time`` greater than 60 are treated as
    overtime and ignored.

    NOTE(review): only the score at the end of regulation is examined. The
    score state during the final minutes of the 3rd period is not
    reconstructed, so a team that tied the game late, or that fell more than
    ``max_deficit`` behind after conceding late, is reported as
    ``EN_likely=False``.

    Args:
        goal_df (pd.DataFrame): Scoring data with columns 'Game_ID', 'Team'
            and 'continuous_time'.
        goalie_df (pd.DataFrame): Goalie data with columns 'Game_ID' and
            'Team'; defines which games and teams are reported.
        max_deficit (int): Largest end-of-regulation goal deficit that still
            counts as a likely empty-net situation. Defaults to 3.

    Returns:
        pd.DataFrame: Columns ['Game_ID', 'Team', 'EN_likely',
        'total_goals_regulation'], one row per team per game. The columns are
        present even when there are no games. A team that scored in a game
        but has no goalie row for it is appended after that game's goalie
        teams instead of raising ``KeyError``.
    """
    columns = ['Game_ID', 'Team', 'EN_likely', 'total_goals_regulation']

    # Count regulation goals per (game, team) in a single pass instead of
    # re-filtering the whole goal table once for every game.
    regulation_goals = goal_df[goal_df['continuous_time'] <= 60]
    goal_counts = regulation_goals.groupby(['Game_ID', 'Team'], sort=False).size()
    goals_by_game = {}
    for (game_id, team), count in goal_counts.items():
        goals_by_game.setdefault(game_id, {})[team] = int(count)

    # Extract all unique games and their teams from the goalie_df
    all_games_teams = goalie_df.groupby('Game_ID')['Team'].unique()

    result = []
    for game_id, teams_in_game in all_games_teams.items():
        # Start every goalie-listed team at zero so shut-out teams are kept,
        # then overlay the actual tallies. Scoring teams missing from
        # goalie_df are added here rather than triggering a KeyError.
        team_scores = {team: 0 for team in teams_in_game}
        team_scores.update(goals_by_game.get(game_id, {}))

        for team, score in team_scores.items():
            # Trailing some opponent by 1..max_deficit goals at the end of
            # regulation; a team's own score yields a difference of 0 and so
            # never satisfies the condition.
            trailing_closely = any(
                0 < other_score - score <= max_deficit
                for other_score in team_scores.values()
            )
            result.append({
                'Game_ID': game_id,
                'Team': team,
                'EN_likely': trailing_closely,
                'total_goals_regulation': score,
            })

    # Passing the column list keeps the schema stable for an empty result
    return pd.DataFrame(result, columns=columns)

# Classify every (game, team) pair, including teams held scoreless in regulation
final_empty_net_scenarios = evaluate_late_game_with_shutouts(goal_df, goalie_df)

# Keep only the rows where the team scored no regulation goals
shutout_teams = final_empty_net_scenarios.loc[
    final_empty_net_scenarios['total_goals_regulation'].eq(0)
]

# Preview the first rows to confirm that shut-out teams made it into the table
shutout_teams.head()

Unnamed: 0,Game_ID,Team,EN_likely,total_goals_regulation
12,2024-10-05-American Int'l-Maine,American Intl,False,0
47,2024-10-06-Penn State-Alaska,Alaska,False,0
51,2024-10-06-St. Cloud State-St. Thomas,St Thomas,True,0
63,2024-10-11-Boston College-Michigan State,Michigan State,True,0
70,2024-10-11-Long Island-Augustana,Long Island,False,0


In [6]:
### Search for instances of Notre Dame in the data
# Filter once and reuse the result; a bare filter expression in the middle of
# a cell is evaluated but never displayed.
notre_dame_rows = final_empty_net_scenarios[final_empty_net_scenarios['Team'] == 'Notre Dame']

## Unique Game_IDs of the games in which Notre Dame has a row
notre_dame_game_ids = notre_dame_rows['Game_ID'].unique()
print(notre_dame_game_ids)
print(len(notre_dame_game_ids))



['2024-10-11-Notre Dame-St. Lawrence' '2024-10-12-Notre Dame-Clarkson'
 '2024-10-18-Alaska-Notre Dame' '2024-10-19-Alaska-Notre Dame'
 '2024-10-25-Long Island-Notre Dame' '2024-10-26-Long Island-Notre Dame'
 '2024-11-01-Wisconsin-Notre Dame' '2024-11-02-Wisconsin-Notre Dame'
 '2024-11-08-Notre Dame-Michigan' '2024-11-09-Notre Dame-Michigan'
 '2024-11-15-Notre Dame-Michigan State'
 '2024-11-16-Notre Dame-Michigan State' '2024-11-22-Minnesota-Notre Dame'
 '2024-11-23-Minnesota-Notre Dame' '2024-11-29-Harvard-Notre Dame'
 '2024-11-30-Notre Dame-Boston University'
 '2024-12-13-Notre Dame-Ohio State' '2024-12-14-Notre Dame-Ohio State'
 '2025-01-03-Notre Dame-Penn State' '2025-01-05-Penn State-Notre Dame'
 '2025-01-10-Michigan-Notre Dame' '2025-01-11-Michigan-Notre Dame']
22
