## SQL Notebook for Current Year Database

In [1]:
## Dependencies
import os
import sys
import time

import numpy as np
import pandas as pd
import sqlite3

# Database Path
db_path = os.path.join('..', 'data', 'db', 'FEB_22_Current_YTD_Stats.db')

# Connect to the database
conn = sqlite3.connect(db_path)



# Calculate Empty Net Time for and against for each team

## CHN Stats count goalies pulled during delayed penalties as empty nets
### haven't figured out a way to differentiate between times teams pulled goalies at end of game to make up a deff 

In [2]:
import pandas as pd


# # Query
query = 'SELECT * FROM goalie_stats'

# Load the data
goalie_stats_df_new = pd.read_sql_query(query, conn)

# Function to convert "Minutes" to seconds for easier calculations
def minutes_to_seconds(minute_str):
    if pd.isna(minute_str):
        return 0
    try:
        mins, secs = map(int, minute_str.split(':'))
        return mins * 60 + secs
    except ValueError:  # In case of any unexpected format
        return 0

goalie_stats_df_new['Seconds'] = goalie_stats_df_new['Minutes'].apply(minutes_to_seconds)

# Calculate each team's total number of games
total_games_per_team = goalie_stats_df_new.groupby('Team')['Game_ID'].nunique().reset_index(name='Total_Games')

# 3 Create a Column for the opponent's team for each goal. It will be the Team from Home_Team and Away_Team that is not the Team of the goalie
goalie_stats_df_new['Opponent'] = np.where(goalie_stats_df_new['Team'] == goalie_stats_df_new['Home_Team'], goalie_stats_df_new['Away_Team'], goalie_stats_df_new['Home_Team'])

goalie_stats_df_new.head(20)




Unnamed: 0,Team,Goalie,SV,GA,Minutes,Game_ID,Away_Team,Home_Team,Seconds,Opponent
0,Lake Superior,Ethan Langenegger,32,5,60:00,2023-10-07-Lake Superior-Michigan State,Lake Superior,Michigan State,3600,Michigan State
1,Michigan State,Trey Augustine,29,2,60:00,2023-10-07-Lake Superior-Michigan State,Lake Superior,Michigan State,3600,Lake Superior
2,Clarkson,Austin Roden,27,1,60:00,2023-10-07-Clarkson-Notre Dame,Clarkson,Notre Dame,3600,Notre Dame
3,Notre Dame,EMPTY NET,0,1,1:25,2023-10-07-Clarkson-Notre Dame,Clarkson,Notre Dame,85,Clarkson
4,Notre Dame,Ryan Bischel,22,2,58:35,2023-10-07-Clarkson-Notre Dame,Clarkson,Notre Dame,3515,Clarkson
5,RIT,EMPTY NET,0,0,0:27,2023-10-07-RIT-St. Lawrence,RIT,St. Lawrence,27,St. Lawrence
6,RIT,Tommy Scarfone,30,4,59:33,2023-10-07-RIT-St. Lawrence,RIT,St. Lawrence,3573,St. Lawrence
7,St. Lawrence,Ben Kraws,26,3,60:00,2023-10-07-RIT-St. Lawrence,RIT,St. Lawrence,3600,RIT
8,St. Thomas,Aaron Trotter,33,4,61:11,2023-10-07-St. Thomas-St. Cloud State,St. Thomas,St. Cloud State,3671,St. Cloud State
9,St. Cloud State,Dominic Basse,25,5,61:11,2023-10-07-St. Thomas-St. Cloud State,St. Thomas,St. Cloud State,3671,St. Thomas


In [3]:
# output CSV
goalie_stats_df_new.to_csv(os.path.join('..', 'TEMP', 'goalie_stats.csv'), index=False)



In [4]:
# # Conver Empty Net Time to Minutes
# final_summary['Empty_Net_Time_Min'] = final_summary['Empty_Net_Time_Sec'] / 60
# final_summary['Opponent_Empty_Net_Time_Min'] = final_summary['Opponent_Empty_Net_Time_Sec'] / 60

# # Calc Averages for Empty Net Time
# final_summary['Avg_Empty_Net_Time_Min'] = final_summary['Empty_Net_Time_Min'] / final_summary['Times_Team_Pulled_Goalie']
# final_summary['Avg_Opponent_Empty_Net_Time_Min'] = final_summary['Opponent_Empty_Net_Time_Min'] / final_summary['Times_Opponent_Pulled_Goalie']


# final_summary.head(20)

# Team Empty Net and Extra Attacker Goals

In [5]:
import pandas as pd

# Load the dataset
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("SELECT * FROM scoring_summary;", conn)



# Ensure 'PP' column is treated as a string, converting NaN values to empty strings for easier processing
df['PP'] = df['PP'].fillna('')

# Initialize columns for EN and EA goals scored and given up
df['EN_scored'] = df['PP'].apply(lambda x: 'EN' in x).astype(int)
df['EA_scored'] = df['PP'].apply(lambda x: 'EA' in x).astype(int)

# Create a mapping of teams to goals given up
goals_given_up = {team: {'EN_given_up': 0, 'EA_given_up': 0} for team in df['Team'].unique()}

# Iterate over the rows to update the goals given up
for index, row in df.iterrows():
    conceding_team = row['Away_Team'] if row['Team'] == row['Home_Team'] else row['Home_Team']
    if 'EN' in row['PP']:
        goals_given_up[conceding_team]['EN_given_up'] += 1
    if 'EA' in row['PP']:
        goals_given_up[conceding_team]['EA_given_up'] += 1

# Convert the mapping to a DataFrame
goals_given_up_df = pd.DataFrame.from_dict(goals_given_up, orient='index').reset_index()
goals_given_up_df.rename(columns={'index': 'Team'}, inplace=True)

# Merge the scored goals with the given up goals
result_df = df.groupby('Team').agg({'EN_scored': 'sum', 'EA_scored': 'sum'}).reset_index()
result = pd.merge(result_df, goals_given_up_df, on='Team')

print(result)


                Team  EN_scored  EA_scored  EN_given_up  EA_given_up
0          Air Force          5          2           12            3
1             Alaska          3          2            6            4
2   Alaska Anchorage          2          3            2            4
3     American Int'l          7          2            5            4
4      Arizona State          7          0            3            1
..               ...        ...        ...          ...          ...
59             Union          4          4           10            2
60           Vermont          4          1            4            1
61  Western Michigan         12          4            0            0
62         Wisconsin          8          4            2            2
63              Yale          4          0            4            4

[64 rows x 5 columns]


In [6]:
# 3 sort by EN scored
result = result.sort_values(by='EN_scored', ascending=False)

# Calculate the EN Goals Scored to EA Goals Given Up Ration
result['EN_vs_EA_Ratio'] = result['EN_scored'] / result['EA_given_up']

# calulate EA Goals Scored to EN Goals Given Up Ratio
result['EA_vs_EN_Ratio'] = result['EA_scored'] / result['EN_given_up']

result.head(20)



Unnamed: 0,Team,EN_scored,EA_scored,EN_given_up,EA_given_up,EN_vs_EA_Ratio,EA_vs_EN_Ratio
23,Holy Cross,15,2,4,2,7.5,0.5
61,Western Michigan,12,4,0,0,inf,inf
34,Michigan State,9,2,4,5,1.8,0.5
62,Wisconsin,8,4,2,2,4.0,2.0
14,Clarkson,8,1,6,4,2.0,0.166667
51,RIT,7,4,3,2,3.5,1.333333
3,American Int'l,7,2,5,4,1.75,0.4
4,Arizona State,7,0,3,1,7.0,0.0
44,Notre Dame,7,2,4,2,3.5,0.5
9,Boston College,7,1,2,2,3.5,0.5


In [7]:
# Get the schema of the scoring_summary table
schema_query = "PRAGMA table_info(scoring_summary);"
scoring_summary_schema = conn.execute(schema_query).fetchall()

# Display the schema of the scoring_summary table
scoring_summary_schema


[(0, 'Period', 'TEXT', 0, None, 0),
 (1, 'Team', 'TEXT', 0, None, 0),
 (2, 'PP', 'TEXT', 0, None, 0),
 (3, 'Player', 'TEXT', 0, None, 0),
 (4, 'Player_Goals', 'INTEGER', 0, None, 0),
 (5, 'Assist1', 'TEXT', 0, None, 0),
 (6, 'Assist2', 'TEXT', 0, None, 0),
 (7, 'Time', 'TEXT', 0, None, 0),
 (8, 'Game_ID', 'TEXT', 0, None, 0),
 (9, 'Away_Team', 'TEXT', 0, None, 0),
 (10, 'Home_Team', 'TEXT', 0, None, 0)]

# NEW VERSION FRIDAY AFTERNOON

In [8]:
# df = pd.read_sql_query("SELECT * FROM scoring_summary;", conn)

# df.head()

In [9]:
import pandas as pd
from collections import defaultdict

# Reload the example data
df = pd.read_sql_query("SELECT * FROM scoring_summary;", conn)

# Correcting and refining the approach based on the provided code and objectives


# Adjust the convert_to_continuous_time function to correctly map each period's time
def convert_to_continuous_time(row):
    period_to_minutes = {'1st Period': 0, '2nd Period': 20, '3rd Period': 40, 'Overtime': 60}
    minutes, seconds = map(int, row['Time'].split(':'))
    if row['Period'] != 'Overtime':
        continuous_time = period_to_minutes[row['Period']] + minutes + (seconds / 60)
    else:
        continuous_time = 60 + minutes + (seconds / 60)  # Overtime starts at 60 minutes
    return continuous_time

# Apply the conversion to create a continuous timeline column
df['Continuous_Time'] = df.apply(convert_to_continuous_time, axis=1)

# Initialize a structure for tracking game states and times with corrected logic
team_game_states_corrected = defaultdict(lambda: defaultdict(float))

# Process each game with refined logic
for game_id, game_df in df.groupby('Game_ID'):
    scores = defaultdict(int)  # Reset scores for each game
    game_df_sorted = game_df.sort_values(by='Continuous_Time')
    last_time = 0  # Reset time at the start of each game
    last_state = 'tied'  # Initial game state
    
    for i, row in game_df_sorted.iterrows():
        # Update scores based on who scored the goal
        scores[row['Team']] += 1
        
        # Calculate time spent since last update
        current_time = row['Continuous_Time']
        time_spent = current_time - last_time
        
        # Update the time spent in the last game state
        team_game_states_corrected[row['Home_Team']][last_state] += time_spent
        team_game_states_corrected[row['Away_Team']][last_state] += time_spent
        
        # Determine new game state
        new_state = get_game_state(scores[row['Home_Team']], scores[row['Away_Team']])
        
        # Prepare for next iteration
        last_state = new_state
        last_time = current_time
    
    # Final update for time from last goal to the end of the game or overtime
    final_time = 65 if 'Overtime' in game_df_sorted['Period'].values else 60
    final_time_spent = final_time - last_time
    team_game_states_corrected[row['Home_Team']][last_state] += final_time_spent
    team_game_states_corrected[row['Away_Team']][last_state] += final_time_spent

# Convert the corrected team game states into a DataFrame for analysis
final_results_corrected = []
for team, states in team_game_states_corrected.items():
    for state, time in states.items():
        final_results_corrected.append([team, state, time])  # Keeping time in minutes

corrected_final_df = pd.DataFrame(final_results_corrected, columns=['Team', 'Game_State', 'Time_Spent_Minutes'])

corrected_final_df.head()


NameError: name 'get_game_state' is not defined

In [None]:
## OUTPUT CSV
corrected_final_df.to_csv(os.path.join('..', 'TEMP', 'corrected_final.csv'), index=False)

In [10]:
import pandas as pd
from collections import defaultdict

# Reload the example data
df = pd.read_sql_query("SELECT * FROM scoring_summary;", conn)

# Define function to convert period and timestamp into a continuous timeline
def convert_to_continuous_time(row):
    period_to_minutes = {'1st Period': 0, '2nd Period': 20, '3rd Period': 40, 'Overtime': 60}
    minutes, seconds = map(int, row['Time'].split(':'))
    continuous_time = period_to_minutes[row['Period']] + (20 - minutes) + (seconds / 60)
    return continuous_time if row['Period'] != 'Overtime' else continuous_time + 5  # Adjust for overtime

# Apply the conversion to create a continuous timeline column
df['Continuous_Time'] = df.apply(convert_to_continuous_time, axis=1)

# Function to adjust the continuous time for overtime consideration
def adjust_overtime_time(row, last_goal_time):
    if 'Overtime' in row['Period']:
        return min(last_goal_time, 65)  # Cap at 65 or the time the goal was scored if earlier
    return row['Continuous_Time']

# Updated get_game_state function for granular game states
def get_game_state(home_score, away_score):
    score_diff = home_score - away_score
    if score_diff == 0:
        return 'tied'
    elif score_diff == 1:
        return 'leading by 1'
    elif score_diff == 2:
        return 'leading by 2'
    elif score_diff >= 3:
        return 'leading by 3+'
    elif score_diff == -1:
        return 'trailing by 1'
    elif score_diff == -2:
        return 'trailing by 2'
    elif score_diff <= -3:
        return 'trailing by 3+'

# Initialize a revised structure for tracking game states and times
team_game_states = defaultdict(lambda: defaultdict(float))

# Process each game with the refined logic
for game_id, game_df in df.groupby('Game_ID'):
    scores = {game_df.iloc[0]['Home_Team']: 0, game_df.iloc[0]['Away_Team']: 0}
    game_df['Continuous_Time'] = game_df.apply(lambda row: convert_to_continuous_time(row), axis=1)
    game_df_sorted = game_df.sort_values(by='Continuous_Time')
    
    last_state = 'tied'  # Initial game state
    last_time = 0  # Start of the game in continuous time
    for i, row in game_df_sorted.iterrows():
        current_time = adjust_overtime_time(row, game_df_sorted['Continuous_Time'].iloc[-1])
        time_spent = current_time - last_time
        
        # Update the time spent in the last state for both teams
        team_game_states[row['Home_Team']][last_state] += time_spent
        team_game_states[row['Away_Team']][last_state] += time_spent
        
        # Update scores and determine the new state
        if row['Team'] == row['Home_Team']:
            scores[row['Home_Team']] += 1
        else:
            scores[row['Away_Team']] += 1
        
        new_state = get_game_state(scores[row['Home_Team']], scores[row['Away_Team']])
        last_state = new_state
        last_time = current_time
    
    # Handle the final stretch after the last goal or to the end of the game
    final_time = adjust_overtime_time(row, 65) - last_time
    team_game_states[row['Home_Team']][last_state] += final_time
    team_game_states[row['Away_Team']][last_state] += final_time

# Convert the team game states dictionary into a DataFrame for analysis
final_results = []
for team, states in team_game_states.items():
    for state, time in states.items():
        final_results.append([team, state, time / 60])  # Convert seconds to minutes

final_df = pd.DataFrame(final_results, columns=['Team', 'Game_State', 'Time_Spent_Minutes'])

final_df.head()


Unnamed: 0,Team,Game_State,Time_Spent_Minutes
0,Massachusetts,tied,11.152222
1,Massachusetts,leading by 1,5.131667
2,Massachusetts,leading by 2,2.169167
3,Massachusetts,leading by 3+,1.020833
4,Massachusetts,trailing by 1,2.672778


In [None]:
final_df.tail(20)

In [None]:
# # Re-initialize and re-process everything in one go to avoid previous errors

# # Re-import pandas and defaultdict in case of kernel reset
# import pandas as pd
# from collections import defaultdict

# # Reload the example data
# df = pd.read_sql_query("SELECT * FROM scoring_summary;", conn)

# # Re-define the time_to_seconds function
# def time_to_seconds(time_str, period):
#     minutes, seconds = map(int, time_str.split(':'))
#     if period == 'Overtime':
#         return 5*60 - (minutes * 60 + seconds)
#     else:
#         return 20*60 - (minutes * 60 + seconds)

# df['Seconds_Remaining'] = df.apply(lambda row: time_to_seconds(row['Time'], row['Period']), axis=1)

# # Updated get_game_state function for granular game states
# def get_game_state(home_score, away_score):
#     score_diff = home_score - away_score
#     if score_diff == 0:
#         return 'tied'
#     elif score_diff == 1:
#         return 'leading by 1'
#     elif score_diff == 2:
#         return 'leading by 2'
#     elif score_diff >= 3:
#         return 'leading by 3+'
#     elif score_diff == -1:
#         return 'trailing by 1'
#     elif score_diff == -2:
#         return 'trailing by 2'
#     elif score_diff <= -3:
#         return 'trailing by 3+'

# # Re-initialize team_stats with the new game state bins
# team_stats = defaultdict(lambda: defaultdict(lambda: {
#     'leading by 1': 0,
#     'leading by 2': 0,
#     'leading by 3+': 0,
#     'tied': 0,
#     'trailing by 1': 0,
#     'trailing by 2': 0,
#     'trailing by 3+': 0,
# }))

# # Re-process each game with the updated logic
# for game_id, game_df in df.groupby('Game_ID'):
#     game_df_sorted = game_df.sort_values(by=['Period', 'Seconds_Remaining'], ascending=[True, True])

#     scores = {'Home': 0, 'Away': 0}
#     last_event_time = 0
#     current_period = '1st Period'
#     for _, row in game_df_sorted.iterrows():
#         if row['Period'] != current_period:
#             last_event_time = 0  # Reset last event time at the start of each period
#             current_period = row['Period']

#         team_type = 'Home' if row['Team'] == row['Home_Team'] else 'Away'
#         scores[team_type] += 1

#         time_spent = row['Seconds_Remaining'] - last_event_time
#         if time_spent < 0:
#             time_spent = -time_spent

#         game_state = get_game_state(scores['Home'], scores['Away'])

#         # Update stats based on game state and whether the team is home or away
#         if team_type == 'Home':
#             team_stats[row['Home_Team']]['home'][game_state] += time_spent
#             team_stats[row['Away_Team']]['away'][game_state] += time_spent
#         else:
#             team_stats[row['Away_Team']]['away'][game_state] += time_spent
#             team_stats[row['Home_Team']]['home'][game_state] += time_spent

#         last_event_time = row['Seconds_Remaining']

#     # Handle time at the end of the period/game
#     final_seconds = 5*60 if current_period == 'Overtime' else 20*60
#     final_time_spent = final_seconds - last_event_time
#     final_game_state = get_game_state(scores['Home'], scores['Away'])
#     team_stats[row['Home_Team']]['home'][final_game_state] += final_time_spent
#     team_stats[row['Away_Team']]['away'][final_game_state] += final_time_spent

# # Update overall stats
# for team in team_stats.keys():
#     team_stats[team]['overall'] = {state: 0 for state in team_stats[team]['home'].keys()}
#     for context in ['home', 'away']:
#         for state in team_stats[team][context].keys():
#             team_stats[team]['overall'][state] += team_stats[team][context][state]

# # Convert the results to a DataFrame
# results_df = pd.DataFrame.from_dict({(team, context): stats
#                                      for team, team_data in team_stats.items()
#                                      for context, stats in team_data.items()},
#                                     orient='index').reset_index()
# results_df.columns = ['Team', 'Context'] + list(team_stats[next(iter(team_stats))]['home'].keys())

# # Convert time columns to minutes
# for column in ['leading by 1', 'leading by 2', 'leading by 3+', 'tied', 'trailing by 1', 'trailing by 2', 'trailing by 3+']:
#     results_df[column] /= 60

# # Display the results
# results_df.head(20)


In [None]:
## OUTPUT CSV FOR INSPECTION
results_df.to_csv(os.path.join('..', 'TEMP', 'TEST_team_stats.csv'), index=False)

## Add an opponent column to each row

In [None]:
import pandas as pd
from collections import defaultdict

# Load the dataset
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("SELECT * FROM scoring_summary;", conn)

In [None]:
## Create an opponent column (Opponent = Home_Team if Team == Away_Team else Away_Team)
df['Opponent'] = df.apply(lambda x: x['Home_Team'] if x['Team'] == x['Away_Team'] else x['Away_Team'], axis=1)


In [None]:
# Function to convert period time to total seconds remaining
def time_to_seconds(time_str, period):
    minutes, seconds = map(int, time_str.split(':'))
    if period == 'Overtime':
        return 5*60 - (minutes * 60 + seconds)
    else:
        return 20*60 - (minutes * 60 + seconds)
    

df['Seconds_Remaining'] = df.apply(lambda row: time_to_seconds(row['Time'], row['Period']), axis=1)

# Initialize a dictionary to hold the results
team_stats = defaultdict(lambda: defaultdict(lambda: {
    'Lead_3+': 0,
    'Lead_2': 0,
    'Lead_1': 0,
    'tied': 0,
    'Down_1': 0,
    'Down_2': 0,
    'Down_3+': 0,
}))


# Determine the game state based on the score difference
def get_game_state(team_score, opp_score):
    score_diff = team_score - opp_score
    if score_diff == 0:
        return 'tied'
    elif score_diff == 1:
        return 'Lead_1'
    elif score_diff == 2:
        return 'Lead_2'
    elif score_diff >= 3:
        return 'Lead_3+'
    elif score_diff == -1:
        return 'Down_1'
    elif score_diff == -2:
        return 'Down_2'
    elif score_diff <= -3:
        return 'Down_3+'
    else:
        return 'unknown'





In [None]:
# team_stats.head(20)

## Calculate time for each team - 5 game States (3+, 1-2 Tied, -1 or 2, -3+0

In [None]:
### OLDER VERSION

# # Function to convert period time to total seconds remaining
# def time_to_seconds(time_str, period):
#     minutes, seconds = map(int, time_str.split(':'))
#     if period == 'Overtime':
#         return 5*60 - (minutes * 60 + seconds)
#     else:
#         return 20*60 - (minutes * 60 + seconds)

# df['Seconds_Remaining'] = df.apply(lambda row: time_to_seconds(row['Time'], row['Period']), axis=1)

# # Initialize a dictionary to hold the results
# team_stats = defaultdict(lambda: defaultdict(lambda: {
#     'leading by 3+': 0,
#     'leading by 1-2': 0,
#     'tied': 0,
#     'trailing by 1-2': 0,
#     'trailing by 3+': 0,
# }))

# # Determine the game state based on the score difference
# def get_game_state(home_score, away_score):
#     score_diff = home_score - away_score
#     if score_diff == 0:
#         return 'tied'
#     elif score_diff == 1 or score_diff == 2:
#         return 'leading by 1-2'
#     elif score_diff >= 3:
#         return 'leading by 3+'
#     elif score_diff == -1 or score_diff == -2:
#         return 'trailing by 1-2'
#     elif score_diff <= -3:  # score_diff <= -3
#         return 'trailing by 3+'
#     else:
#         return 'unknown'

# # Process each game to calculate time spent in each game state
# for game_id, game_df in df.groupby('Game_ID'):
#     game_df_sorted = game_df.sort_values(by=['Period', 'Seconds_Remaining'], ascending=[True, True])

#     scores = {'Home': 0, 'Away': 0}
#     last_event_time = 0
#     current_period = '1st Period'
#     for _, row in game_df_sorted.iterrows():
#         if row['Period'] != current_period:
#             last_event_time = 0
#             current_period = row['Period']

#         team_type = 'Home' if row['Team'] == row['Home_Team'] else 'Away'
#         scores[team_type] += 1
        
#         time_spent = row['Seconds_Remaining'] - last_event_time
#         if time_spent < 0:
#             time_spent = -time_spent

#         game_state = get_game_state(scores['Home'], scores['Away'])
        
#         # Update stats based on game state and whether the team is home or away
#         if team_type == 'Home':
#             team_stats[row['Home_Team']]['home'][game_state] += time_spent
#             team_stats[row['Away_Team']]['away'][get_game_state(scores['Away'], scores['Home'])] += time_spent
#         else:
#             team_stats[row['Away_Team']]['away'][game_state] += time_spent
#             team_stats[row['Home_Team']]['home'][get_game_state(scores['Home'], scores['Away'])] += time_spent

#         last_event_time = row['Seconds_Remaining']

#     # Handle time at the end of the period/game
#     final_seconds = 5*60 if current_period == 'Overtime' else 20*60
#     final_time_spent = final_seconds - last_event_time
#     final_game_state = get_game_state(scores['Home'], scores['Away'])
#     if team_type == 'Home':
#         team_stats[row['Home_Team']]['home'][final_game_state] += final_time_spent
#         team_stats[row['Away_Team']]['away'][get_game_state(scores['Away'], scores['Home'])] += final_time_spent
#     else:
#         team_stats[row['Away_Team']]['away'][final_game_state] += final_time_spent
#         team_stats[row['Home_Team']]['home'][get_game_state(scores['Home'], scores['Away'])] += final_time_spent

# # Initialize overall stats
# for team in team_stats.keys():
#     team_stats[team]['overall'] = {state: 0 for state in team_stats[team]['home'].keys()}
#     for context in ['home', 'away']:
#         for state in team_stats[team][context].keys():
#             team_stats[team]['overall'][state] += team_stats[team][context][state]

# # Convert the results to a DataFrame
# results_df = pd.DataFrame.from_dict({(team, context): stats
#                                      for team, team_data in team_stats.items()
#                                      for context, stats in team_data.items()},
#                                     orient='index').reset_index()
# results_df.columns = ['Team', 'Context'] + list(team_stats[next(iter(team_stats))]['home'].keys())

# # Calculate the total time spent leading and trailing
# results_df['Total_Lead'] = results_df['leading by 1-2'] + results_df['leading by 3+']
# results_df['Total_Down'] = results_df['trailing by 1-2'] + results_df['trailing by 3+']

# # Calulate the total time
# results_df['Total_Time'] = results_df['Total_Lead'] + results_df['tied'] + results_df['Total_Down']

# # Calculate percentage of time spent leading and trailing
# results_df['Pct_Lead'] = (results_df['Total_Lead'] / results_df['Total_Time']) *100
# results_df['Pct_Tied'] = (results_df['tied'] / results_df['Total_Time']) *100
# results_df['Pct_Down'] = (results_df['Total_Down'] / results_df['Total_Time']) *100

# # Rename the column names
# results_df = results_df.rename(columns={'leading by 1-2': 'Lead_1-2',
#                                         'leading by 3+': 'Lead_3+',
#                                         'tied': 'Tied',
#                                         'trailing by 1-2': 'Down_1-2',
#                                         'trailing by 3+': 'Down_3+'})

# # Convert all time columns to minutes from seconds
# results_df['Total_Time'] = results_df['Total_Time'] / 60
# results_df['Total_Lead'] = results_df['Total_Lead'] / 60
# results_df['Tied'] = results_df['Tied'] / 60
# results_df['Total_Down'] = results_df['Total_Down'] / 60
# results_df['Lead_1-2'] = results_df['Lead_1-2'] / 60
# results_df['Lead_3+'] = results_df['Lead_3+'] / 60
# results_df['Down_1-2'] = results_df['Down_1-2'] / 60
# results_df['Down_3+'] = results_df['Down_3+'] / 60


# print(results_df)


In [None]:

# calculate the percentage of time spent in each game state
for state in ['Lead_3+', 'Lead_1-2', 'Tied', 'Down_1-2', 'Down_3+']:
    results_df[f'Pct_{state}'] = results_df[state] / results_df['Total_Time'] * 100


# show reults
results_df.head(10)

tag = 'v3'

# 3 Output the results to a CSV file
results_df.to_csv(f'../TEMP/{tag}team_state_time.csv', index=False)



## Calculate time each team has lead, trailed and been tied
### This block is only 3 game stats (Lead, Tied, Trail)

In [None]:
# ## Working

# import pandas as pd
# from collections import defaultdict

# conn = sqlite3.connect(db_path)
# df = pd.read_sql_query("SELECT * FROM scoring_summary;", conn)

# # Function to convert period time to total seconds remaining
# def time_to_seconds(time_str, period):
#     minutes, seconds = map(int, time_str.split(':'))
#     if period == 'Overtime':
#         return 5*60 - (minutes * 60 + seconds)
#     else:
#         return 20*60 - (minutes * 60 + seconds)

# df['Seconds_Remaining'] = df.apply(lambda row: time_to_seconds(row['Time'], row['Period']), axis=1)

# # Initialize a dictionary to hold the results
# team_stats = defaultdict(lambda: defaultdict(lambda: {'leading': 0, 'tied': 0, 'trailing': 0}))

# # Process each game to calculate time spent in each game state
# for game_id, game_df in df.groupby('Game_ID'):
#     # Sort events by period and seconds remaining, ensuring correct chronological order
#     game_df_sorted = game_df.sort_values(by=['Period', 'Seconds_Remaining'], ascending=[True, True])

#     scores = {'Home': 0, 'Away': 0}
#     last_event_time = 0  # Initialize to start of game
#     current_period = '1st Period'  # Initialize to the first period
#     for _, row in game_df_sorted.iterrows():
#         if row['Period'] != current_period:  # New period
#             last_event_time = 0  # Reset time at the start of a new period
#             current_period = row['Period']

#         team_type = 'Home' if row['Team'] == row['Home_Team'] else 'Away'
#         scores[team_type] += 1  # Update score for the scoring team
        
#         # Calculate time spent since last event
#         time_spent = row['Seconds_Remaining'] - last_event_time
#         if time_spent < 0:
#             time_spent = -time_spent  # Correct negative time spent values
        
#         # Determine the current game state
#         if scores['Home'] > scores['Away']:
#             team_stats[row['Home_Team']]['home']['leading'] += time_spent
#             team_stats[row['Away_Team']]['away']['trailing'] += time_spent
#         elif scores['Home'] < scores['Away']:
#             team_stats[row['Away_Team']]['away']['leading'] += time_spent
#             team_stats[row['Home_Team']]['home']['trailing'] += time_spent
#         else:  # Tied
#             team_stats[row['Home_Team']]['home']['tied'] += time_spent
#             team_stats[row['Away_Team']]['away']['tied'] += time_spent

#         last_event_time = row['Seconds_Remaining']  # Update for next iteration

#     # Handle time at the end of the period/game
#     final_seconds = 5*60 if current_period == 'Overtime' else 20*60
#     final_time_spent = final_seconds - last_event_time
#     if scores['Home'] > scores['Away']:
#         team_stats[row['Home_Team']]['home']['leading'] += final_time_spent
#         team_stats[row['Away_Team']]['away']['trailing'] += final_time_spent
#     elif scores['Home'] < scores['Away']:
#         team_stats[row['Away_Team']]['away']['leading'] += final_time_spent
#         team_stats[row['Home_Team']]['home']['trailing'] += final_time_spent
#     else:  # Tied
#         team_stats[row['Home_Team']]['home']['tied'] += final_time_spent
#         team_stats[row['Away_Team']]['away']['tied'] += final_time_spent

# # Combine home and away stats to get overall stats
# for team in team_stats.keys():
#     team_stats[team]['overall'] = {'leading': 0, 'tied': 0, 'trailing': 0}
#     for context in ['home', 'away']:
#         for state in ['leading', 'tied', 'trailing']:
#             team_stats[team]['overall'][state] += team_stats[team][context][state]

# # Convert the results to a DataFrame
# results_df = pd.DataFrame.from_dict({(team, context): stats
#                                      for team, team_data in team_stats.items()
#                                      for context, stats in team_data.items()},
#                                     orient='index').reset_index()
# results_df.columns = ['Team', 'Context', 'Leading', 'Tied', 'Trailing']

# print(results_df)

In [None]:
# ## Convert the values in the DataFrame to minutes
# results_df[['Leading', 'Tied', 'Trailing']] /= 60

# # calculate percentage of time spent in each game state
# results_df['Total'] = results_df['Leading'] + results_df['Tied'] + results_df['Trailing']
# results_df['Leading %'] = results_df['Leading'] / results_df['Total'] * 100
# results_df['Tied %'] = results_df['Tied'] / results_df['Total'] * 100
# results_df['Trailing %'] = results_df['Trailing'] / results_df['Total'] * 100



In [None]:
results_df.head(10)

In [None]:
# # Sort the results by the percentage of time spent leading
# results_df.sort_values(by='Leading %', ascending=True, inplace=True)

# # Display the results
# results_df.head(10)

In [None]:
# Output the results to a CSV file
# results_df.to_csv('../TEMP/team_game_state_times_3_state.csv', index=False)