## Cacluate The amount of times each team has spent tied, leading, trailing, ect 

In [12]:
import pandas as pd

# path to single game file
# path = '../TEMP/single_game_scoring.csv'
# path = '../TEMP/ss_example_3.csv'

def calculate_game_states_dataframe_ot_tie_corrected_v2(file_path):
    data = pd.read_csv(file_path)

    def period_elapsed_to_seconds(period, time_str):
        minutes, seconds = map(int, time_str.split(':'))
        elapsed_seconds = minutes * 60 + seconds
        
        if '1st' in period:
            return elapsed_seconds
        elif '2nd' in period:
            return 20 * 60 + elapsed_seconds
        elif '3rd' in period:
            return 40 * 60 + elapsed_seconds
        else:  # Overtime
            return 60 * 60 + elapsed_seconds

    data['Total_Seconds'] = data.apply(lambda row: period_elapsed_to_seconds(row['Period'], row['Time']), axis=1)

    teams = list(set(data['Home_Team'].unique().tolist() + data['Away_Team'].unique().tolist()))
    team_states = {team: {'Tied': 0, 'Lead by 1': 0, 'Lead by 2': 0, 'Lead by 3+': 0,
                          'Down by 1': 0, 'Down by 2': 0, 'Down by 3+': 0} for team in teams}
    
    current_score = {teams[0]: 0, teams[1]: 0}
    last_timestamp = 0
    game_end = 3 * 20 * 60  # Default game end, will adjust if OT is detected
    overtime_played = False

    first_goal_time = data.sort_values(by='Total_Seconds').iloc[0]['Total_Seconds'] if not data.empty else 0
    team_states[teams[0]]['Tied'] = first_goal_time
    team_states[teams[1]]['Tied'] = first_goal_time
    last_timestamp = first_goal_time

    for index, row in data.sort_values(by='Total_Seconds').iterrows():
        current_score[row['Team']] += 1
        lead_diff = current_score[teams[0]] - current_score[teams[1]]
        time_elapsed = row['Total_Seconds'] - last_timestamp

        if 'Overtime' in row['Period']:
            overtime_played = True
            game_end = row['Total_Seconds']  # Adjust game end to when OT goal is scored

        if lead_diff == 0:
            for team in teams:
                team_states[team]['Tied'] += time_elapsed
        else:
            leading_team, trailing_team = (teams[0], teams[1]) if lead_diff > 0 else (teams[1], teams[0])
            lead_abs = abs(lead_diff)
            update_states = lambda lead: (f'Lead by {min(lead, 3)}+' if lead >= 3 else f'Lead by {lead}',
                                          f'Down by {min(lead, 3)}+' if lead >= 3 else f'Down by {lead}')
            state_lead, state_trail = update_states(lead_abs)
            team_states[leading_team][state_lead] += time_elapsed
            team_states[trailing_team][state_trail] += time_elapsed
        
        last_timestamp = row['Total_Seconds']

    # Determine if game ended in a tie without explicit OT goal and adjust game_end correctly
    lead_diff_final = current_score[teams[0]] - current_score[teams[1]]
    if lead_diff_final == 0 and not overtime_played:
        # Adjust for games that end in a tie without 'OT' period goals
        overtime_played = True
        game_end = 65 * 60  # Adjust game end to include 5 minutes of OT

    final_time_elapsed = game_end - last_timestamp
    if lead_diff_final == 0 and not data.empty:
        for team in teams:
            team_states[team]['Tied'] += final_time_elapsed

    team_states_df = pd.DataFrame.from_dict(team_states, orient='index').reset_index().rename(columns={'index': 'Team'})
    
    return team_states_df

# Re-generate the DataFrame with the corrected function for the new file
# game_states_df_ot_tie_corrected_v2 = calculate_game_states_dataframe_ot_tie_corrected_v2(path)
# game_states_df_ot_tie_corrected_v2


# Start Here

In [13]:
### Path to Target Database file
import os
import sqlite3
import pandas as pd

# path to database file using os module
# db_path = os.path.join('..', 'data', 'db', 'CHN_YTD_Stats.db') # Path to most recent cleaned DB File
db_path = os.path.join('..', 'data', 'db', '2022_Game_Stats_Cleaned.db') # Path to most recent cleaned DB File
conn = sqlite3.connect(db_path)

# verify the connection
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")

<sqlite3.Cursor at 0x1c9c04184c0>

In [14]:
## HOTFIX FOR OLD DB FILE - Add Home_Team and Away_Team to  scoring summary table in the db by spliting the Game_ID
## Home split '-' -1
## Away split '-' -2

# # example # Step 2: Create new columns for Home and Away Teams by parsing Game_ID
# df_game_details['Away_Team'] = df_game_details['Game_ID'].apply(lambda x: x.split('-')[3])
# df_game_details['Home_Team'] = df_game_details['Game_ID'].apply(lambda x: x.split('-')[4])



In [15]:
# season_file_path = '../TEMP/scoring_summary_old.csv'
# season_file_path = '../TEMP/Season_scoring.csv'

# extract scoring table from database
scoring_query = "SELECT * FROM scoring_summary"
scoring_df = pd.read_sql_query(scoring_query, conn)

## HOTFIX FOR OLD DB FILEs - Add Home_Team and Away_Team to  scoring summary table in the db by spliting the Game_ID
scoring_df['Away_Team'] = scoring_df['Game_ID'].apply(lambda x: x.split('-')[3])
scoring_df['Home_Team'] = scoring_df['Game_ID'].apply(lambda x: x.split('-')[4])


from tqdm import tqdm  # For progress tracking




# Function to process each game in the dataset and aggregate game states
def process_season_game_states(season_data):
    # season_data = pd.read_csv(file_path)
    unique_games = season_data['Game_ID'].unique()
    
    # Initialize an empty DataFrame for aggregating results
    season_aggregate_states = pd.DataFrame()
    
    for game_id in tqdm(unique_games, desc="Processing Games"):
        game_data = season_data[season_data['Game_ID'] == game_id]
        
        # Save individual game data to a temporary CSV to use with the existing function
        temp_game_path = '../TEMP/temp_game.csv'
        game_data.to_csv(temp_game_path, index=False)
        
        # Calculate game states for the current game
        game_states_df = calculate_game_states_dataframe_ot_tie_corrected_v2(temp_game_path)
        
        # Aggregate the results
        if season_aggregate_states.empty:
            season_aggregate_states = game_states_df
        else:
            season_aggregate_states = season_aggregate_states.merge(game_states_df, on='Team', how='outer')
            for state in ['Tied', 'Lead by 1', 'Lead by 2', 'Lead by 3+', 'Down by 1', 'Down by 2', 'Down by 3+']:
                season_aggregate_states[state] = season_aggregate_states[f'{state}_x'].fillna(0) + season_aggregate_states[f'{state}_y'].fillna(0)
                season_aggregate_states.drop([f'{state}_x', f'{state}_y'], axis=1, inplace=True)
    
    return season_aggregate_states

# Process the season dataset
season_aggregate_states_df = process_season_game_states(scoring_df)

season_aggregate_states_df

Processing Games:   0%|          | 1/1109 [00:00<00:15, 71.41it/s]


KeyError: 'Lowell'

In [None]:
# Sort by Down by 3+ time to find the teams that were trailing by 3+ goals the most
# season_aggregate_stats_df.sort_values(by='Down by 3+', ascending=True).head(10)
season_aggregate_stats_df = season_aggregate_states_df

In [None]:
### Calculate the percentages for the different game states
# Calculate the total game time for each team
season_aggregate_stats_df['Total_Time'] = season_aggregate_stats_df['Tied'] + season_aggregate_stats_df['Lead by 1'] + season_aggregate_stats_df['Lead by 2'] + season_aggregate_stats_df['Lead by 3+'] + season_aggregate_stats_df['Down by 1'] + season_aggregate_stats_df['Down by 2'] + season_aggregate_stats_df['Down by 3+']

# create column of all leads and all deficits
season_aggregate_stats_df['All_Lead'] = season_aggregate_stats_df['Lead by 1'] + season_aggregate_stats_df['Lead by 2'] + season_aggregate_stats_df['Lead by 3+']
season_aggregate_stats_df['All_Deficit'] = season_aggregate_stats_df['Down by 1'] + season_aggregate_stats_df['Down by 2'] + season_aggregate_stats_df['Down by 3+']

# Calculate the percentages for each game state
for state in ['Tied', 'Lead by 1', 'Lead by 2', 'Lead by 3+', 'Down by 1', 'Down by 2', 'Down by 3+', 'All_Lead', 'All_Deficit']:
    season_aggregate_stats_df[f'PCT_{state}'] = season_aggregate_stats_df[state] / season_aggregate_stats_df['Total_Time']

# Display the DataFrame with the calculated percentages
# season_aggregate_stats_df

In [None]:
# Convert all time columns from seconds to minutes
for state in ['Tied', 'Lead by 1', 'Lead by 2', 'Lead by 3+', 'Down by 1', 'Down by 2', 'Down by 3+', 'All_Lead', 'All_Deficit', 'Total_Time']:
    season_aggregate_stats_df[state] = season_aggregate_stats_df[state] / 60

# Display the DataFrame with the times converted to minutes
season_aggregate_stats_df


In [None]:
## Output the results to a CSV file
# output_file_path = '../TEMP/lead_Trail_season_aggregate_states.csv' # Original Path - power BI report is tied to tis file location
output_file_path = '../data/2022_Lead_Trail_season_aggregate_stats.csv'
season_aggregate_stats_df.to_csv(output_file_path, index=False)