## SQL Notebook for Current Year Database

In [1]:
## Dependencies
import os
import sys
import time

import numpy as np
import pandas as pd
import sqlite3

# Database Path
db_path = os.path.join('..', 'data', 'db', 'FEB_22_Current_YTD_Stats.db')

# Connect to the database
conn = sqlite3.connect(db_path)



# Team Empty Net and Extra Attacker Goals

In [11]:
import pandas as pd

# Load the dataset
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("SELECT * FROM scoring_summary;", conn)



# Ensure 'PP' column is treated as a string, converting NaN values to empty strings for easier processing
df['PP'] = df['PP'].fillna('')

# Initialize columns for EN and EA goals scored and given up
df['EN_scored'] = df['PP'].apply(lambda x: 'EN' in x).astype(int)
df['EA_scored'] = df['PP'].apply(lambda x: 'EA' in x).astype(int)

# Create a mapping of teams to goals given up
goals_given_up = {team: {'EN_given_up': 0, 'EA_given_up': 0} for team in df['Team'].unique()}

# Iterate over the rows to update the goals given up
for index, row in df.iterrows():
    conceding_team = row['Away_Team'] if row['Team'] == row['Home_Team'] else row['Home_Team']
    if 'EN' in row['PP']:
        goals_given_up[conceding_team]['EN_given_up'] += 1
    if 'EA' in row['PP']:
        goals_given_up[conceding_team]['EA_given_up'] += 1

# Convert the mapping to a DataFrame
goals_given_up_df = pd.DataFrame.from_dict(goals_given_up, orient='index').reset_index()
goals_given_up_df.rename(columns={'index': 'Team'}, inplace=True)

# Merge the scored goals with the given up goals
result_df = df.groupby('Team').agg({'EN_scored': 'sum', 'EA_scored': 'sum'}).reset_index()
result = pd.merge(result_df, goals_given_up_df, on='Team')

print(result)


                Team  EN_scored  EA_scored  EN_given_up  EA_given_up
0          Air Force          5          2           12            3
1             Alaska          3          2            6            4
2   Alaska Anchorage          2          3            2            4
3     American Int'l          7          2            5            4
4      Arizona State          7          0            3            1
..               ...        ...        ...          ...          ...
59             Union          4          4           10            2
60           Vermont          4          1            4            1
61  Western Michigan         12          4            0            0
62         Wisconsin          8          4            2            2
63              Yale          4          0            4            4

[64 rows x 5 columns]


In [14]:
# 3 sort by EN scored
result = result.sort_values(by='EN_scored', ascending=False)

# Calculate the EN Goals Scored to EA Goals Given Up Ration
result['EN_vs_EA_Ratio'] = result['EN_scored'] / result['EA_given_up']

# calulate EA Goals Scored to EN Goals Given Up Ratio
result['EA_vs_EN_Ratio'] = result['EA_scored'] / result['EN_given_up']

result.head(20)



Unnamed: 0,Team,EN_scored,EA_scored,EN_given_up,EA_given_up,EN_vs_EA_Ratio,EA_vs_EN_Ratio
23,Holy Cross,15,2,4,2,7.5,0.5
61,Western Michigan,12,4,0,0,inf,inf
34,Michigan State,9,2,4,5,1.8,0.5
14,Clarkson,8,1,6,4,2.0,0.166667
62,Wisconsin,8,4,2,2,4.0,2.0
51,RIT,7,4,3,2,3.5,1.333333
3,American Int'l,7,2,5,4,1.75,0.4
4,Arizona State,7,0,3,1,7.0,0.0
44,Notre Dame,7,2,4,2,3.5,0.5
18,Cornell,7,1,1,1,7.0,1.0


In [2]:
# Get the schema of the scoring_summary table
schema_query = "PRAGMA table_info(scoring_summary);"
scoring_summary_schema = conn.execute(schema_query).fetchall()

# Display the schema of the scoring_summary table
scoring_summary_schema


[(0, 'Period', 'TEXT', 0, None, 0),
 (1, 'Team', 'TEXT', 0, None, 0),
 (2, 'PP', 'TEXT', 0, None, 0),
 (3, 'Player', 'TEXT', 0, None, 0),
 (4, 'Player_Goals', 'INTEGER', 0, None, 0),
 (5, 'Assist1', 'TEXT', 0, None, 0),
 (6, 'Assist2', 'TEXT', 0, None, 0),
 (7, 'Time', 'TEXT', 0, None, 0),
 (8, 'Game_ID', 'TEXT', 0, None, 0),
 (9, 'Away_Team', 'TEXT', 0, None, 0),
 (10, 'Home_Team', 'TEXT', 0, None, 0)]

## Calculate time each team - 5 game States 3+, 1-2 Tied, ect

In [3]:
import pandas as pd
from collections import defaultdict

# Load the dataset
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("SELECT * FROM scoring_summary;", conn)

# Function to convert period time to total seconds remaining
def time_to_seconds(time_str, period):
    minutes, seconds = map(int, time_str.split(':'))
    if period == 'Overtime':
        return 5*60 - (minutes * 60 + seconds)
    else:
        return 20*60 - (minutes * 60 + seconds)

df['Seconds_Remaining'] = df.apply(lambda row: time_to_seconds(row['Time'], row['Period']), axis=1)

# Initialize a dictionary to hold the results
team_stats = defaultdict(lambda: defaultdict(lambda: {
    'leading by 3+': 0,
    'leading by 1-2': 0,
    'tied': 0,
    'trailing by 1-2': 0,
    'trailing by 3+': 0,
}))

# Determine the game state based on the score difference
def get_game_state(home_score, away_score):
    score_diff = home_score - away_score
    if score_diff == 0:
        return 'tied'
    elif score_diff == 1 or score_diff == 2:
        return 'leading by 1-2'
    elif score_diff >= 3:
        return 'leading by 3+'
    elif score_diff == -1 or score_diff == -2:
        return 'trailing by 1-2'
    else:  # score_diff <= -3
        return 'trailing by 3+'

# Process each game to calculate time spent in each game state
for game_id, game_df in df.groupby('Game_ID'):
    game_df_sorted = game_df.sort_values(by=['Period', 'Seconds_Remaining'], ascending=[True, True])

    scores = {'Home': 0, 'Away': 0}
    last_event_time = 0
    current_period = '1st Period'
    for _, row in game_df_sorted.iterrows():
        if row['Period'] != current_period:
            last_event_time = 0
            current_period = row['Period']

        team_type = 'Home' if row['Team'] == row['Home_Team'] else 'Away'
        scores[team_type] += 1
        
        time_spent = row['Seconds_Remaining'] - last_event_time
        if time_spent < 0:
            time_spent = -time_spent

        game_state = get_game_state(scores['Home'], scores['Away'])
        
        # Update stats based on game state and whether the team is home or away
        if team_type == 'Home':
            team_stats[row['Home_Team']]['home'][game_state] += time_spent
            team_stats[row['Away_Team']]['away'][get_game_state(scores['Away'], scores['Home'])] += time_spent
        else:
            team_stats[row['Away_Team']]['away'][game_state] += time_spent
            team_stats[row['Home_Team']]['home'][get_game_state(scores['Home'], scores['Away'])] += time_spent

        last_event_time = row['Seconds_Remaining']

    # Handle time at the end of the period/game
    final_seconds = 5*60 if current_period == 'Overtime' else 20*60
    final_time_spent = final_seconds - last_event_time
    final_game_state = get_game_state(scores['Home'], scores['Away'])
    if team_type == 'Home':
        team_stats[row['Home_Team']]['home'][final_game_state] += final_time_spent
        team_stats[row['Away_Team']]['away'][get_game_state(scores['Away'], scores['Home'])] += final_time_spent
    else:
        team_stats[row['Away_Team']]['away'][final_game_state] += final_time_spent
        team_stats[row['Home_Team']]['home'][get_game_state(scores['Home'], scores['Away'])] += final_time_spent

# Initialize overall stats
for team in team_stats.keys():
    team_stats[team]['overall'] = {state: 0 for state in team_stats[team]['home'].keys()}
    for context in ['home', 'away']:
        for state in team_stats[team][context].keys():
            team_stats[team]['overall'][state] += team_stats[team][context][state]

# Convert the results to a DataFrame
results_df = pd.DataFrame.from_dict({(team, context): stats
                                     for team, team_data in team_stats.items()
                                     for context, stats in team_data.items()},
                                    orient='index').reset_index()
results_df.columns = ['Team', 'Context'] + list(team_stats[next(iter(team_stats))]['home'].keys())

# Calculate the total time spent leading and trailing
results_df['Total_Lead'] = results_df['leading by 1-2'] + results_df['leading by 3+']
results_df['Total_Down'] = results_df['trailing by 1-2'] + results_df['trailing by 3+']

# Calulate the total time
results_df['Total_Time'] = results_df['Total_Lead'] + results_df['tied'] + results_df['Total_Down']

# Calculate percentage of time spent leading and trailing
results_df['Pct_Lead'] = (results_df['Total_Lead'] / results_df['Total_Time']) *100
results_df['Pct_Tied'] = (results_df['tied'] / results_df['Total_Time']) *100
results_df['Pct_Down'] = (results_df['Total_Down'] / results_df['Total_Time']) *100

# Rename the column names
results_df = results_df.rename(columns={'leading by 1-2': 'Lead_1-2',
                                        'leading by 3+': 'Lead_3+',
                                        'tied': 'Tied',
                                        'trailing by 1-2': 'Down_1-2',
                                        'trailing by 3+': 'Down_3+'})

# Convert all time columns to minutes from seconds
results_df['Total_Time'] = results_df['Total_Time'] / 60
results_df['Total_Lead'] = results_df['Total_Lead'] / 60
results_df['Tied'] = results_df['Tied'] / 60
results_df['Total_Down'] = results_df['Total_Down'] / 60
results_df['Lead_1-2'] = results_df['Lead_1-2'] / 60
results_df['Lead_3+'] = results_df['Lead_3+'] / 60
results_df['Down_1-2'] = results_df['Down_1-2'] / 60
results_df['Down_3+'] = results_df['Down_3+'] / 60


print(results_df)


               Team  Context    Lead_3+    Lead_1-2        Tied    Down_1-2  \
0     Massachusetts     home  75.516667  206.233333  128.433333  126.266667   
1     Massachusetts     away   0.000000   53.933333   67.633333  231.900000   
2     Massachusetts  overall  75.516667  260.166667  196.066667  358.166667   
3    American Int'l     away  36.616667   98.816667  111.966667  340.750000   
4    American Int'l     home  79.250000  208.466667  103.283333  118.716667   
..              ...      ...        ...         ...         ...         ...   
187           Brown     away  20.150000   20.550000   35.150000  207.883333   
188           Brown  overall  50.683333  169.450000  147.633333  319.600000   
189       Princeton     away  19.583333   72.933333   94.333333  286.666667   
190       Princeton     home  16.683333  101.983333   98.650000  143.600000   
191       Princeton  overall  36.266667  174.916667  192.983333  430.266667   

        Down_3+  Total_Lead  Total_Down   Total_Tim

In [5]:

# calculate the percentage of time spent in each game state
for state in ['Lead_3+', 'Lead_1-2', 'Tied', 'Down_1-2', 'Down_3+']:
    results_df[f'Pct_{state}'] = results_df[state] / results_df['Total_Time'] * 100


# show reults
results_df.head(10)

# 3 Output the results to a CSV file
results_df.to_csv('../TEMP/team_state_time.csv', index=False)



## Calculate time each team has lead, trailed and been tied
### This block is only 3 game stats (Lead, Tied, Trail)

In [None]:
# ## Working

# import pandas as pd
# from collections import defaultdict

# conn = sqlite3.connect(db_path)
# df = pd.read_sql_query("SELECT * FROM scoring_summary;", conn)

# # Function to convert period time to total seconds remaining
# def time_to_seconds(time_str, period):
#     minutes, seconds = map(int, time_str.split(':'))
#     if period == 'Overtime':
#         return 5*60 - (minutes * 60 + seconds)
#     else:
#         return 20*60 - (minutes * 60 + seconds)

# df['Seconds_Remaining'] = df.apply(lambda row: time_to_seconds(row['Time'], row['Period']), axis=1)

# # Initialize a dictionary to hold the results
# team_stats = defaultdict(lambda: defaultdict(lambda: {'leading': 0, 'tied': 0, 'trailing': 0}))

# # Process each game to calculate time spent in each game state
# for game_id, game_df in df.groupby('Game_ID'):
#     # Sort events by period and seconds remaining, ensuring correct chronological order
#     game_df_sorted = game_df.sort_values(by=['Period', 'Seconds_Remaining'], ascending=[True, True])

#     scores = {'Home': 0, 'Away': 0}
#     last_event_time = 0  # Initialize to start of game
#     current_period = '1st Period'  # Initialize to the first period
#     for _, row in game_df_sorted.iterrows():
#         if row['Period'] != current_period:  # New period
#             last_event_time = 0  # Reset time at the start of a new period
#             current_period = row['Period']

#         team_type = 'Home' if row['Team'] == row['Home_Team'] else 'Away'
#         scores[team_type] += 1  # Update score for the scoring team
        
#         # Calculate time spent since last event
#         time_spent = row['Seconds_Remaining'] - last_event_time
#         if time_spent < 0:
#             time_spent = -time_spent  # Correct negative time spent values
        
#         # Determine the current game state
#         if scores['Home'] > scores['Away']:
#             team_stats[row['Home_Team']]['home']['leading'] += time_spent
#             team_stats[row['Away_Team']]['away']['trailing'] += time_spent
#         elif scores['Home'] < scores['Away']:
#             team_stats[row['Away_Team']]['away']['leading'] += time_spent
#             team_stats[row['Home_Team']]['home']['trailing'] += time_spent
#         else:  # Tied
#             team_stats[row['Home_Team']]['home']['tied'] += time_spent
#             team_stats[row['Away_Team']]['away']['tied'] += time_spent

#         last_event_time = row['Seconds_Remaining']  # Update for next iteration

#     # Handle time at the end of the period/game
#     final_seconds = 5*60 if current_period == 'Overtime' else 20*60
#     final_time_spent = final_seconds - last_event_time
#     if scores['Home'] > scores['Away']:
#         team_stats[row['Home_Team']]['home']['leading'] += final_time_spent
#         team_stats[row['Away_Team']]['away']['trailing'] += final_time_spent
#     elif scores['Home'] < scores['Away']:
#         team_stats[row['Away_Team']]['away']['leading'] += final_time_spent
#         team_stats[row['Home_Team']]['home']['trailing'] += final_time_spent
#     else:  # Tied
#         team_stats[row['Home_Team']]['home']['tied'] += final_time_spent
#         team_stats[row['Away_Team']]['away']['tied'] += final_time_spent

# # Combine home and away stats to get overall stats
# for team in team_stats.keys():
#     team_stats[team]['overall'] = {'leading': 0, 'tied': 0, 'trailing': 0}
#     for context in ['home', 'away']:
#         for state in ['leading', 'tied', 'trailing']:
#             team_stats[team]['overall'][state] += team_stats[team][context][state]

# # Convert the results to a DataFrame
# results_df = pd.DataFrame.from_dict({(team, context): stats
#                                      for team, team_data in team_stats.items()
#                                      for context, stats in team_data.items()},
#                                     orient='index').reset_index()
# results_df.columns = ['Team', 'Context', 'Leading', 'Tied', 'Trailing']

# print(results_df)

In [None]:
# ## Convert the values in the DataFrame to minutes
# results_df[['Leading', 'Tied', 'Trailing']] /= 60

# # calculate percentage of time spent in each game state
# results_df['Total'] = results_df['Leading'] + results_df['Tied'] + results_df['Trailing']
# results_df['Leading %'] = results_df['Leading'] / results_df['Total'] * 100
# results_df['Tied %'] = results_df['Tied'] / results_df['Total'] * 100
# results_df['Trailing %'] = results_df['Trailing'] / results_df['Total'] * 100



In [None]:
results_df.head(10)

In [None]:
# # Sort the results by the percentage of time spent leading
# results_df.sort_values(by='Leading %', ascending=True, inplace=True)

# # Display the results
# results_df.head(10)

In [None]:
# Output the results to a CSV file
# results_df.to_csv('../TEMP/team_game_state_times_3_state.csv', index=False)