## Cacluate The amount of times each team has spent tied, leading, trailing, ect 

In [51]:
import pandas as pd

# path to single game file
path = '../TEMP/single_game_scoring.csv'
# path = '../TEMP/ss_example_3.csv'

def calculate_game_states_dataframe_ot_tie_corrected_v2(file_path):
    data = pd.read_csv(file_path)

    def period_elapsed_to_seconds(period, time_str):
        minutes, seconds = map(int, time_str.split(':'))
        elapsed_seconds = minutes * 60 + seconds
        
        if '1st' in period:
            return elapsed_seconds
        elif '2nd' in period:
            return 20 * 60 + elapsed_seconds
        elif '3rd' in period:
            return 40 * 60 + elapsed_seconds
        else:  # Overtime
            return 60 * 60 + elapsed_seconds

    data['Total_Seconds'] = data.apply(lambda row: period_elapsed_to_seconds(row['Period'], row['Time']), axis=1)

    teams = list(set(data['Home_Team'].unique().tolist() + data['Away_Team'].unique().tolist()))
    team_states = {team: {'Tied': 0, 'Lead by 1': 0, 'Lead by 2': 0, 'Lead by 3+': 0,
                          'Down by 1': 0, 'Down by 2': 0, 'Down by 3+': 0} for team in teams}
    
    current_score = {teams[0]: 0, teams[1]: 0}
    last_timestamp = 0
    game_end = 3 * 20 * 60  # Default game end, will adjust if OT is detected
    overtime_played = False

    first_goal_time = data.sort_values(by='Total_Seconds').iloc[0]['Total_Seconds'] if not data.empty else 0
    team_states[teams[0]]['Tied'] = first_goal_time
    team_states[teams[1]]['Tied'] = first_goal_time
    last_timestamp = first_goal_time

    for index, row in data.sort_values(by='Total_Seconds').iterrows():
        current_score[row['Team']] += 1
        lead_diff = current_score[teams[0]] - current_score[teams[1]]
        time_elapsed = row['Total_Seconds'] - last_timestamp

        if 'Overtime' in row['Period']:
            overtime_played = True
            game_end = row['Total_Seconds']  # Adjust game end to when OT goal is scored

        if lead_diff == 0:
            for team in teams:
                team_states[team]['Tied'] += time_elapsed
        else:
            leading_team, trailing_team = (teams[0], teams[1]) if lead_diff > 0 else (teams[1], teams[0])
            lead_abs = abs(lead_diff)
            update_states = lambda lead: (f'Lead by {min(lead, 3)}+' if lead >= 3 else f'Lead by {lead}',
                                          f'Down by {min(lead, 3)}+' if lead >= 3 else f'Down by {lead}')
            state_lead, state_trail = update_states(lead_abs)
            team_states[leading_team][state_lead] += time_elapsed
            team_states[trailing_team][state_trail] += time_elapsed
        
        last_timestamp = row['Total_Seconds']

    # Determine if game ended in a tie without explicit OT goal and adjust game_end correctly
    lead_diff_final = current_score[teams[0]] - current_score[teams[1]]
    if lead_diff_final == 0 and not overtime_played:
        # Adjust for games that end in a tie without 'OT' period goals
        overtime_played = True
        game_end = 65 * 60  # Adjust game end to include 5 minutes of OT

    final_time_elapsed = game_end - last_timestamp
    if lead_diff_final == 0 and not data.empty:
        for team in teams:
            team_states[team]['Tied'] += final_time_elapsed

    team_states_df = pd.DataFrame.from_dict(team_states, orient='index').reset_index().rename(columns={'index': 'Team'})
    
    return team_states_df

# Re-generate the DataFrame with the corrected function for the new file
game_states_df_ot_tie_corrected_v2 = calculate_game_states_dataframe_ot_tie_corrected_v2(path)
game_states_df_ot_tie_corrected_v2


Unnamed: 0,Team,Tied,Lead by 1,Lead by 2,Lead by 3+,Down by 1,Down by 2,Down by 3+
0,Michigan State,1105,327,1294,620,0,0,0
1,Lake Superior,1105,0,0,0,327,1294,620


In [52]:
# season_file_path = '../TEMP/scoring_summary_old.csv'
season_file_path = '../TEMP/Season_scoring.csv'

from tqdm import tqdm  # For progress tracking




# Function to process each game in the dataset and aggregate game states
def process_season_game_states(file_path):
    season_data = pd.read_csv(file_path)
    unique_games = season_data['Game_ID'].unique()
    
    # Initialize an empty DataFrame for aggregating results
    season_aggregate_states = pd.DataFrame()
    
    for game_id in tqdm(unique_games, desc="Processing Games"):
        game_data = season_data[season_data['Game_ID'] == game_id]
        
        # Save individual game data to a temporary CSV to use with the existing function
        temp_game_path = '../TEMP/temp_game.csv'
        game_data.to_csv(temp_game_path, index=False)
        
        # Calculate game states for the current game
        game_states_df = calculate_game_states_dataframe_ot_tie_corrected_v2(temp_game_path)
        
        # Aggregate the results
        if season_aggregate_states.empty:
            season_aggregate_states = game_states_df
        else:
            season_aggregate_states = season_aggregate_states.merge(game_states_df, on='Team', how='outer')
            for state in ['Tied', 'Lead by 1', 'Lead by 2', 'Lead by 3+', 'Down by 1', 'Down by 2', 'Down by 3+']:
                season_aggregate_states[state] = season_aggregate_states[f'{state}_x'].fillna(0) + season_aggregate_states[f'{state}_y'].fillna(0)
                season_aggregate_states.drop([f'{state}_x', f'{state}_y'], axis=1, inplace=True)
    
    return season_aggregate_states

# Process the season dataset
season_aggregate_states_df = process_season_game_states(season_file_path)

season_aggregate_states_df

Processing Games: 100%|██████████| 1006/1006 [00:14<00:00, 71.65it/s]


Unnamed: 0,Team,Tied,Lead by 1,Lead by 2,Lead by 3+,Down by 1,Down by 2,Down by 3+
0,Michigan State,38831.0,12273.0,17799.0,16049.0,7901.0,7809.0,9874.0
1,Lake Superior,44618.0,7158.0,12142.0,10163.0,13089.0,12215.0,8420.0
2,Notre Dame,39611.0,12298.0,9300.0,14070.0,13758.0,9925.0,13440.0
3,Clarkson,48088.0,14573.0,8768.0,7265.0,12851.0,9360.0,9947.0
4,RIT,44697.0,15777.0,15547.0,17196.0,6285.0,7925.0,3181.0
...,...,...,...,...,...,...,...,...
59,Dartmouth,46201.0,11842.0,6409.0,8093.0,7196.0,5014.0,9440.0
60,Yale,42071.0,4130.0,6330.0,5472.0,9553.0,10702.0,8206.0
61,Brown,43522.0,10080.0,6267.0,1444.0,5974.0,10274.0,16258.0
62,Cornell,43933.0,8889.0,13281.0,13734.0,8171.0,2629.0,1287.0


In [53]:
# Sort by Down by 3+ time to find the teams that were trailing by 3+ goals the most
# season_aggregate_stats_df.sort_values(by='Down by 3+', ascending=True).head(10)
season_aggregate_stats_df = season_aggregate_states_df

In [54]:
### Calculate the percentages for the different game states
# Calculate the total game time for each team
season_aggregate_stats_df['Total_Time'] = season_aggregate_stats_df['Tied'] + season_aggregate_stats_df['Lead by 1'] + season_aggregate_stats_df['Lead by 2'] + season_aggregate_stats_df['Lead by 3+'] + season_aggregate_stats_df['Down by 1'] + season_aggregate_stats_df['Down by 2'] + season_aggregate_stats_df['Down by 3+']

# create column of all leads and all deficits
season_aggregate_stats_df['All_Lead'] = season_aggregate_stats_df['Lead by 1'] + season_aggregate_stats_df['Lead by 2'] + season_aggregate_stats_df['Lead by 3+']
season_aggregate_stats_df['All_Deficit'] = season_aggregate_stats_df['Down by 1'] + season_aggregate_stats_df['Down by 2'] + season_aggregate_stats_df['Down by 3+']

# Calculate the percentages for each game state
for state in ['Tied', 'Lead by 1', 'Lead by 2', 'Lead by 3+', 'Down by 1', 'Down by 2', 'Down by 3+', 'All_Lead', 'All_Deficit']:
    season_aggregate_stats_df[f'PCT_{state}'] = season_aggregate_stats_df[state] / season_aggregate_stats_df['Total_Time']

# Display the DataFrame with the calculated percentages
# season_aggregate_stats_df

In [55]:
# Convert all time columns from seconds to minutes
for state in ['Tied', 'Lead by 1', 'Lead by 2', 'Lead by 3+', 'Down by 1', 'Down by 2', 'Down by 3+', 'All_Lead', 'All_Deficit', 'Total_Time']:
    season_aggregate_stats_df[state] = season_aggregate_stats_df[state] / 60

# Display the DataFrame with the times converted to minutes
season_aggregate_stats_df


Unnamed: 0,Team,Tied,Lead by 1,Lead by 2,Lead by 3+,Down by 1,Down by 2,Down by 3+,Total_Time,All_Lead,All_Deficit,PCT_Tied,PCT_Lead by 1,PCT_Lead by 2,PCT_Lead by 3+,PCT_Down by 1,PCT_Down by 2,PCT_Down by 3+,PCT_All_Lead,PCT_All_Deficit
0,Michigan State,647.183333,204.550000,296.650000,267.483333,131.683333,130.150000,164.566667,1842.266667,768.683333,426.400000,0.351297,0.111032,0.161024,0.145193,0.071479,0.070647,0.089328,0.417249,0.231454
1,Lake Superior,743.633333,119.300000,202.366667,169.383333,218.150000,203.583333,140.333333,1796.750000,491.050000,562.066667,0.413877,0.066398,0.112629,0.094272,0.121414,0.113306,0.078104,0.273299,0.312824
2,Notre Dame,660.183333,204.966667,155.000000,234.500000,229.300000,165.416667,224.000000,1873.366667,594.466667,618.716667,0.352405,0.109411,0.082739,0.125176,0.122400,0.088299,0.119571,0.317325,0.330270
3,Clarkson,801.466667,242.883333,146.133333,121.083333,214.183333,156.000000,165.783333,1847.533333,510.100000,535.966667,0.433804,0.131464,0.079096,0.065538,0.115929,0.084437,0.089732,0.276098,0.290099
4,RIT,744.950000,262.950000,259.116667,286.600000,104.750000,132.083333,53.016667,1843.466667,808.666667,289.850000,0.404103,0.142639,0.140559,0.155468,0.056822,0.071649,0.028759,0.438666,0.157231
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,Dartmouth,770.016667,197.366667,106.816667,134.883333,119.933333,83.566667,157.333333,1569.916667,439.066667,360.833333,0.490483,0.125718,0.068040,0.085918,0.076395,0.053230,0.100218,0.279675,0.229842
60,Yale,701.183333,68.833333,105.500000,91.200000,159.216667,178.366667,136.766667,1441.066667,265.533333,474.350000,0.486572,0.047766,0.073210,0.063286,0.110485,0.123774,0.094907,0.184262,0.329166
61,Brown,725.366667,168.000000,104.450000,24.066667,99.566667,171.233333,270.966667,1563.650000,296.516667,541.766667,0.463893,0.107441,0.066799,0.015391,0.063676,0.109509,0.173291,0.189631,0.346476
62,Cornell,732.216667,148.150000,221.350000,228.900000,136.183333,43.816667,21.450000,1532.066667,598.400000,201.450000,0.477927,0.096699,0.144478,0.149406,0.088889,0.028600,0.014001,0.390584,0.131489


In [56]:
## Output the results to a CSV file
output_file_path = '../TEMP/lead_Trail_season_aggregate_states.csv'
season_aggregate_stats_df.to_csv(output_file_path, index=False)