## Cacluate The amount of times each team has spent tied, leading, trailing, ect 

In [1]:
import pandas as pd

# path to single game file
# path = '../TEMP/single_game_scoring.csv'
# path = '../TEMP/ss_example_3.csv'

def calculate_game_states_dataframe_ot_tie_corrected_v2(file_path):
    data = pd.read_csv(file_path)

    def period_elapsed_to_seconds(period, time_str):
        minutes, seconds = map(int, time_str.split(':'))
        elapsed_seconds = minutes * 60 + seconds
        
        if '1st' in period:
            return elapsed_seconds
        elif '2nd' in period:
            return 20 * 60 + elapsed_seconds
        elif '3rd' in period:
            return 40 * 60 + elapsed_seconds
        else:  # Overtime
            return 60 * 60 + elapsed_seconds

    data['Total_Seconds'] = data.apply(lambda row: period_elapsed_to_seconds(row['Period'], row['Time']), axis=1)

    teams = list(set(data['Home_Team'].unique().tolist() + data['Away_Team'].unique().tolist()))
    team_states = {team: {'Tied': 0, 'Lead by 1': 0, 'Lead by 2': 0, 'Lead by 3+': 0,
                          'Down by 1': 0, 'Down by 2': 0, 'Down by 3+': 0} for team in teams}
    
    current_score = {teams[0]: 0, teams[1]: 0}
    last_timestamp = 0
    game_end = 3 * 20 * 60  # Default game end, will adjust if OT is detected
    overtime_played = False

    first_goal_time = data.sort_values(by='Total_Seconds').iloc[0]['Total_Seconds'] if not data.empty else 0
    team_states[teams[0]]['Tied'] = first_goal_time
    team_states[teams[1]]['Tied'] = first_goal_time
    last_timestamp = first_goal_time

    for index, row in data.sort_values(by='Total_Seconds').iterrows():
        current_score[row['Team']] += 1
        lead_diff = current_score[teams[0]] - current_score[teams[1]]
        time_elapsed = row['Total_Seconds'] - last_timestamp

        if 'Overtime' in row['Period']:
            overtime_played = True
            game_end = row['Total_Seconds']  # Adjust game end to when OT goal is scored

        if lead_diff == 0:
            for team in teams:
                team_states[team]['Tied'] += time_elapsed
        else:
            leading_team, trailing_team = (teams[0], teams[1]) if lead_diff > 0 else (teams[1], teams[0])
            lead_abs = abs(lead_diff)
            update_states = lambda lead: (f'Lead by {min(lead, 3)}+' if lead >= 3 else f'Lead by {lead}',
                                          f'Down by {min(lead, 3)}+' if lead >= 3 else f'Down by {lead}')
            state_lead, state_trail = update_states(lead_abs)
            team_states[leading_team][state_lead] += time_elapsed
            team_states[trailing_team][state_trail] += time_elapsed
        
        last_timestamp = row['Total_Seconds']

    # Determine if game ended in a tie without explicit OT goal and adjust game_end correctly
    lead_diff_final = current_score[teams[0]] - current_score[teams[1]]
    if lead_diff_final == 0 and not overtime_played:
        # Adjust for games that end in a tie without 'OT' period goals
        overtime_played = True
        game_end = 65 * 60  # Adjust game end to include 5 minutes of OT

    final_time_elapsed = game_end - last_timestamp
    if lead_diff_final == 0 and not data.empty:
        for team in teams:
            team_states[team]['Tied'] += final_time_elapsed

    team_states_df = pd.DataFrame.from_dict(team_states, orient='index').reset_index().rename(columns={'index': 'Team'})
    
    return team_states_df

# Re-generate the DataFrame with the corrected function for the new file
# game_states_df_ot_tie_corrected_v2 = calculate_game_states_dataframe_ot_tie_corrected_v2(path)
# game_states_df_ot_tie_corrected_v2


# Start Here

In [2]:
### Path to Target Database file
import os
import sqlite3
import pandas as pd

# path to database file using os module
# db_path = os.path.join('..', 'data', 'db', 'CHN_YTD_Stats.db') # Path to most recent cleaned DB File
db_path = os.path.join('..', 'data', 'db', '2022_Full_Stats.db') # Path to most recent cleaned DB File
conn = sqlite3.connect(db_path)

# verify the connection
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")

<sqlite3.Cursor at 0x1597f84d840>

In [3]:
## HOTFIX FOR OLD DB FILE - Add Home_Team and Away_Team to  scoring summary table in the db by spliting the Game_ID
## Home split '-' -1
## Away split '-' -2

# # example # Step 2: Create new columns for Home and Away Teams by parsing Game_ID
# df_game_details['Away_Team'] = df_game_details['Game_ID'].apply(lambda x: x.split('-')[3])
# df_game_details['Home_Team'] = df_game_details['Game_ID'].apply(lambda x: x.split('-')[4])



In [4]:
# season_file_path = '../TEMP/scoring_summary_old.csv'
# season_file_path = '../TEMP/Season_scoring.csv'

# extract scoring table from database
scoring_query = "SELECT * FROM scoring_summary"
scoring_df = pd.read_sql_query(scoring_query, conn)

## HOTFIX FOR OLD DB FILEs - Add Home_Team and Away_Team to  scoring summary table in the db by spliting the Game_ID
scoring_df['Away_Team'] = scoring_df['Game_ID'].apply(lambda x: x.split('-')[3])
scoring_df['Home_Team'] = scoring_df['Game_ID'].apply(lambda x: x.split('-')[4])


from tqdm import tqdm  # For progress tracking




# Function to process each game in the dataset and aggregate game states
def process_season_game_states(season_data):
    # season_data = pd.read_csv(file_path)
    unique_games = season_data['Game_ID'].unique()
    
    # Initialize an empty DataFrame for aggregating results
    season_aggregate_states = pd.DataFrame()
    
    for game_id in tqdm(unique_games, desc="Processing Games"):
        game_data = season_data[season_data['Game_ID'] == game_id]
        
        # Save individual game data to a temporary CSV to use with the existing function
        temp_game_path = '../TEMP/temp_game.csv'
        game_data.to_csv(temp_game_path, index=False)
        
        # Calculate game states for the current game
        game_states_df = calculate_game_states_dataframe_ot_tie_corrected_v2(temp_game_path)
        
        # Aggregate the results
        if season_aggregate_states.empty:
            season_aggregate_states = game_states_df
        else:
            season_aggregate_states = season_aggregate_states.merge(game_states_df, on='Team', how='outer')
            for state in ['Tied', 'Lead by 1', 'Lead by 2', 'Lead by 3+', 'Down by 1', 'Down by 2', 'Down by 3+']:
                season_aggregate_states[state] = season_aggregate_states[f'{state}_x'].fillna(0) + season_aggregate_states[f'{state}_y'].fillna(0)
                season_aggregate_states.drop([f'{state}_x', f'{state}_y'], axis=1, inplace=True)
    
    return season_aggregate_states

# Process the season dataset
season_aggregate_states_df = process_season_game_states(scoring_df)

season_aggregate_states_df

Processing Games: 100%|██████████| 1109/1109 [00:18<00:00, 58.90it/s]


Unnamed: 0,Team,Tied,Lead by 1,Lead by 2,Lead by 3+,Down by 1,Down by 2,Down by 3+
0,Miami,46221.0,5418.0,9005.0,5833.0,11391.0,14242.0,26863.0
1,Ferris State,43207.0,19623.0,9623.0,7418.0,13571.0,11359.0,18590.0
2,Mass. Lowell,57348.0,14120.0,9974.0,6864.0,13476.0,10315.0,3233.0
3,St. Lawrence,44405.0,12473.0,13751.0,7488.0,12365.0,13490.0,12365.0
4,St. Cloud State,60131.0,11371.0,19018.0,14098.0,13242.0,10590.0,8002.0
...,...,...,...,...,...,...,...,...
56,Dartmouth,36771.0,5557.0,3025.0,4134.0,21299.0,12885.0,13668.0
57,Cornell,38792.0,9441.0,16219.0,20851.0,6745.0,9790.0,4586.0
58,Yale,33640.0,2483.0,7250.0,8920.0,11415.0,12680.0,23456.0
59,Brown,34555.0,13601.0,6436.0,8758.0,9528.0,12394.0,15076.0


In [5]:
# Sort by Down by 3+ time to find the teams that were trailing by 3+ goals the most
# season_aggregate_stats_df.sort_values(by='Down by 3+', ascending=True).head(10)
season_aggregate_stats_df = season_aggregate_states_df

In [6]:
### Calculate the percentages for the different game states
# Calculate the total game time for each team
season_aggregate_stats_df['Total_Time'] = season_aggregate_stats_df['Tied'] + season_aggregate_stats_df['Lead by 1'] + season_aggregate_stats_df['Lead by 2'] + season_aggregate_stats_df['Lead by 3+'] + season_aggregate_stats_df['Down by 1'] + season_aggregate_stats_df['Down by 2'] + season_aggregate_stats_df['Down by 3+']

# create column of all leads and all deficits
season_aggregate_stats_df['All_Lead'] = season_aggregate_stats_df['Lead by 1'] + season_aggregate_stats_df['Lead by 2'] + season_aggregate_stats_df['Lead by 3+']
season_aggregate_stats_df['All_Deficit'] = season_aggregate_stats_df['Down by 1'] + season_aggregate_stats_df['Down by 2'] + season_aggregate_stats_df['Down by 3+']

# Calculate the percentages for each game state
for state in ['Tied', 'Lead by 1', 'Lead by 2', 'Lead by 3+', 'Down by 1', 'Down by 2', 'Down by 3+', 'All_Lead', 'All_Deficit']:
    season_aggregate_stats_df[f'PCT_{state}'] = season_aggregate_stats_df[state] / season_aggregate_stats_df['Total_Time']

# Display the DataFrame with the calculated percentages
# season_aggregate_stats_df

In [7]:
# Convert all time columns from seconds to minutes
for state in ['Tied', 'Lead by 1', 'Lead by 2', 'Lead by 3+', 'Down by 1', 'Down by 2', 'Down by 3+', 'All_Lead', 'All_Deficit', 'Total_Time']:
    season_aggregate_stats_df[state] = season_aggregate_stats_df[state] / 60

# Display the DataFrame with the times converted to minutes
season_aggregate_stats_df


Unnamed: 0,Team,Tied,Lead by 1,Lead by 2,Lead by 3+,Down by 1,Down by 2,Down by 3+,Total_Time,All_Lead,All_Deficit,PCT_Tied,PCT_Lead by 1,PCT_Lead by 2,PCT_Lead by 3+,PCT_Down by 1,PCT_Down by 2,PCT_Down by 3+,PCT_All_Lead,PCT_All_Deficit
0,Miami,770.350000,90.300000,150.083333,97.216667,189.850000,237.366667,447.716667,1982.883333,337.600000,874.933333,0.388500,0.045540,0.075689,0.049028,0.095744,0.119708,0.225791,0.170257,0.441243
1,Ferris State,720.116667,327.050000,160.383333,123.633333,226.183333,189.316667,309.833333,2056.516667,611.066667,725.333333,0.350163,0.159031,0.077988,0.060118,0.109984,0.092057,0.150659,0.297137,0.352700
2,Mass. Lowell,955.800000,235.333333,166.233333,114.400000,224.600000,171.916667,53.883333,1922.166667,515.966667,450.400000,0.497251,0.122431,0.086482,0.059516,0.116847,0.089439,0.028033,0.268430,0.234319
3,St. Lawrence,740.083333,207.883333,229.183333,124.800000,206.083333,224.833333,206.083333,1938.950000,561.866667,637.000000,0.381693,0.107214,0.118200,0.064365,0.106286,0.115956,0.106286,0.289779,0.328528
4,St. Cloud State,1002.183333,189.516667,316.966667,234.966667,220.700000,176.500000,133.366667,2274.200000,741.450000,530.566667,0.440675,0.083333,0.139375,0.103318,0.097045,0.077610,0.058643,0.326027,0.233298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,Dartmouth,612.850000,92.616667,50.416667,68.900000,354.983333,214.750000,227.800000,1622.316667,211.933333,797.533333,0.377762,0.057089,0.031077,0.042470,0.218813,0.132372,0.140416,0.130636,0.491602
57,Cornell,646.533333,157.350000,270.316667,347.516667,112.416667,163.166667,76.433333,1773.733333,775.183333,352.016667,0.364504,0.088711,0.152400,0.195924,0.063379,0.091991,0.043092,0.437035,0.198461
58,Yale,560.666667,41.383333,120.833333,148.666667,190.250000,211.333333,390.933333,1664.066667,310.883333,792.516667,0.336926,0.024869,0.072613,0.089339,0.114328,0.126998,0.234926,0.186821,0.476253
59,Brown,575.916667,226.683333,107.266667,145.966667,158.800000,206.566667,251.266667,1672.466667,479.916667,616.633333,0.344352,0.135538,0.064137,0.087276,0.094950,0.123510,0.150237,0.286951,0.368697


In [8]:
## Output the results to a CSV file
# output_file_path = '../TEMP/lead_Trail_season_aggregate_states.csv' # Original Path - power BI report is tied to tis file location
output_file_path = '../data/2022_Lead_Trail_season_aggregate_stats.csv'
season_aggregate_stats_df.to_csv(output_file_path, index=False)