## SQL Notebook for Current Year Database

In [47]:
## Dependencies
import os
import sys
import time

import numpy as np
import pandas as pd
import sqlite3

# Database Path
db_path = os.path.join('..', 'data', 'db', 'FEB_22_Current_YTD_Stats.db')

# Connect to the database
conn = sqlite3.connect(db_path)



In [48]:
# Get the schema of the scoring_summary table
schema_query = "PRAGMA table_info(scoring_summary);"
scoring_summary_schema = conn.execute(schema_query).fetchall()

# Display the schema of the scoring_summary table
scoring_summary_schema


[(0, 'Period', 'TEXT', 0, None, 0),
 (1, 'Team', 'TEXT', 0, None, 0),
 (2, 'PP', 'TEXT', 0, None, 0),
 (3, 'Player', 'TEXT', 0, None, 0),
 (4, 'Player_Goals', 'INTEGER', 0, None, 0),
 (5, 'Assist1', 'TEXT', 0, None, 0),
 (6, 'Assist2', 'TEXT', 0, None, 0),
 (7, 'Time', 'TEXT', 0, None, 0),
 (8, 'Game_ID', 'TEXT', 0, None, 0),
 (9, 'Away_Team', 'TEXT', 0, None, 0),
 (10, 'Home_Team', 'TEXT', 0, None, 0)]

In [49]:
# Reconnect to the database to perform the analysis
conn = sqlite3.connect(db_path)

# SQL to create a temporary view for empty net goals scored (EN goals for)
create_en_goals_for_view = """
CREATE TEMP VIEW IF NOT EXISTS en_goals_for AS
SELECT Team, COUNT(*) AS EN_Goals_For
FROM scoring_summary
WHERE PP = 'EN'
GROUP BY Team;
"""

## Doesn't work
# # SQL to create a temporary view for empty net goals allowed (EN goals against)
# create_en_goals_against_view = """
# CREATE TEMP VIEW IF NOT EXISTS en_goals_against AS
# SELECT Team, COUNT(*) AS EN_Goals_Against
# FROM scoring_summary
# WHERE PP = 'EN'
# GROUP BY Team;
# """

# Execute the SQL commands to create the views
conn.execute(create_en_goals_for_view)
conn.execute(create_en_goals_against_view)

# Now, combine these views to create a summary table for empty net goals for and against each team
en_goals_summary_query = """
SELECT 
    f.Team AS Team,
    f.EN_Goals_For,
    a.EN_Goals_Against
FROM 
    en_goals_for f
JOIN 
    en_goals_against a ON f.Team = a.Team;
"""

# Execute the query to get the empty net goals summary
en_goals_summary = conn.execute(en_goals_summary_query).fetchall()

# Since there might be a mistake in how we calculate "against" (it's actually the same calculation as "for" at the moment),
# let's just display the "for" results for now and then correct our approach.
en_goals_for_query = """
SELECT Team, COUNT(*) AS EN_Goals_For
FROM scoring_summary
WHERE PP = 'EN'
GROUP BY Team;
"""

en_goals_for = conn.execute(en_goals_for_query).fetchall()

# Close the database connection
conn.close()

# Sort the results by the number of empty net goals for
en_goals_for.sort(key=lambda x: x[1], reverse=True)

# Display the results for empty net goals "for", to verify our approach before correcting the "against" part
# en_goals_for


# Calculate time each team has lead, trailed and been tied

In [50]:
from collections import defaultdict

# Initialize a dictionary to hold the game states (leading, trailing, tied) durations for each team
team_durations = defaultdict(lambda: {'leading': 0, 'trailing': 0, 'tied': 0})

# Correct the calculation for Seconds_Remaining
def correct_time_to_seconds(time_str, period):
    minutes, seconds = map(int, time_str.split(':'))
    total_period_seconds = 5*60 if period == 'Overtime' else 20*60
    total_seconds = total_period_seconds - (minutes*60 + seconds)
    return total_seconds


# Correcting the mistake in the function to use the updated "Corrected_Seconds_Remaining" column
def calculate_game_state_durations_corrected(game_df):
    # Sort the dataframe by Period and Corrected_Seconds_Remaining descending to get chronological order of goals
    game_df = game_df.sort_values(by=['Period', 'Corrected_Seconds_Remaining'], ascending=[True, False])
    
    # Resetting the initial conditions with the corrected seconds remaining
    score = defaultdict(int)  # Tracks the score for each team
    last_state = {'team': None, 'state': 'tied', 'time': 20*60}  # Initial state at the start of the game
    
    # Iterate over each goal in the game
    for index, row in game_df.iterrows():
        scoring_team = row['Team']
        # Update scores
        score[scoring_team] += 1
        
        # Determine the current state after the goal
        state = 'tied' if len(set(score.values())) == 1 else 'leading'
        time_remaining = row['Corrected_Seconds_Remaining']
        
        # If state has changed, update the durations for the involved teams
        if state != last_state['state'] or scoring_team != last_state['team']:
            # Time spent in the previous state
            time_spent = last_state['time'] - time_remaining
            
            if last_state['state'] == 'leading':
                # The last team that scored was leading
                team_durations[last_state['team']]['leading'] += time_spent
                # The other team(s) were trailing
                for team, team_score in score.items():
                    if team != last_state['team']:
                        team_durations[team]['trailing'] += time_spent
            elif last_state['state'] == 'tied':
                # All teams were tied
                for team in score.keys():
                    team_durations[team]['tied'] += time_spent
            
            # Update last state
            last_state = {'team': scoring_team, 'state': state, 'time': time_remaining}
    
    # After the last goal, update durations until the end of the game (including potential overtime)
    final_time_spent = last_state['time']  # Time from last goal to the end of the game
    if last_state['state'] == 'leading':
        team_durations[last_state['team']]['leading'] += final_time_spent
        for team in score.keys():
            if team != last_state['team']:
                team_durations[team]['trailing'] += final_time_spent
    elif last_state['state'] == 'tied':
        for team in score.keys():
            team_durations[team]['tied'] += final_time_spent


In [51]:
## Connect to database and extract the scoring_summary table

# Connect to the database
conn = sqlite3.connect(db_path)

# SQL query to get the scoring summary for a game
game_query = """
SELECT *
FROM scoring_summary;
"""

# Execute the query to get the scoring summary
scoring_summary_df = pd.read_sql(game_query, conn)

# Apply the corrected conversion to the dataframe
scoring_summary_df['Corrected_Seconds_Remaining'] = scoring_summary_df.apply(
    lambda row: correct_time_to_seconds(row['Time'], row['Period']), axis=1)


# scoring_summary.head()

In [52]:


# Reset the team durations dictionary before recalculating with the corrected time
team_durations = defaultdict(lambda: {'leading': 0, 'trailing': 0, 'tied': 0})

# Recalculate game state durations with the corrected seconds remaining calculation
for game_id, game_df in scoring_summary_df.groupby('Game_ID'):
    calculate_game_state_durations_corrected(game_df)

# Convert the recalculated results to a DataFrame for easier viewing
team_durations_corrected_df = pd.DataFrame.from_dict(team_durations, orient='index')
# Convert the time back to minutes for easier interpretation
team_durations_corrected_df = team_durations_corrected_df.div(60)

# Calculate the total time for each team
team_durations_corrected_df['Total_Time'] = team_durations_corrected_df.sum(axis=1)
# Calculate the percentage of time spent in each state
team_durations_corrected_df['Leading_%'] = team_durations_corrected_df['leading'] / team_durations_corrected_df['Total_Time']
team_durations_corrected_df['Trailing_%'] = team_durations_corrected_df['trailing'] / team_durations_corrected_df['Total_Time']
team_durations_corrected_df['Tied_%'] = team_durations_corrected_df['tied'] / team_durations_corrected_df['Total_Time']
team_durations_corrected_df


Unnamed: 0,leading,trailing,tied,Total_Time,Leading_%,Trailing_%,Tied_%
Massachusetts,101.283333,85.733333,263.583333,450.600000,0.224774,0.190265,0.584961
American Int'l,93.066667,119.450000,295.450000,507.966667,0.183214,0.235153,0.581633
Wisconsin,111.550000,73.716667,326.600000,511.866667,0.217928,0.144015,0.638057
Boston College,142.466667,61.616667,320.333333,524.416667,0.271667,0.117496,0.610837
Quinnipiac,212.800000,46.750000,252.016667,511.566667,0.415977,0.091386,0.492637
...,...,...,...,...,...,...,...
Dartmouth,64.850000,72.616667,229.016667,366.483333,0.176952,0.198145,0.624903
Cornell,104.133333,59.850000,269.116667,433.100000,0.240437,0.138190,0.621373
Brown,93.283333,98.983333,146.183333,338.450000,0.275619,0.292461,0.431920
Yale,73.000000,110.533333,151.200000,334.733333,0.218084,0.330213,0.451703


In [53]:
## Sort by the % of time leading
# team_durations_corrected_df.sort_values(by='Leading_%', ascending=False)

# ## Sort by the % of time trailing
# team_durations_corrected_df.sort_values(by='Trailing_%', ascending=False)

# ## Sort by the % of time tied
# team_durations_corrected_df.sort_values(by='Tied_%', ascending=False)

## Close the database connection
conn.close()

# Display Results
team_durations_corrected_df.sort_values(by='Leading_%', ascending=False)

Unnamed: 0,leading,trailing,tied,Total_Time,Leading_%,Trailing_%,Tied_%
Denver,249.433333,-28.550000,262.750000,483.633333,0.515749,-0.059032,0.543283
Quinnipiac,212.800000,46.750000,252.016667,511.566667,0.415977,0.091386,0.492637
Miami,162.516667,109.550000,141.150000,413.216667,0.393296,0.265115,0.341588
Ohio State,147.850000,125.666667,129.666667,403.183333,0.366707,0.311686,0.321607
Lake Superior,134.016667,50.383333,208.083333,392.483333,0.341458,0.128371,0.530171
...,...,...,...,...,...,...,...
Augustana,39.483333,119.366667,307.883333,466.733333,0.084595,0.255749,0.659656
Rensselaer,34.866667,195.616667,268.283333,498.766667,0.069906,0.392201,0.537893
Bemidji State,33.733333,138.100000,333.100000,504.933333,0.066807,0.273501,0.659691
Western Michigan,30.466667,111.416667,326.666667,468.550000,0.065023,0.237790,0.697186
