## Cacluate The amount of times each team has spent tied, leading, trailing, ect 

In [32]:
import pandas as pd

# path to single game file
path = '../TEMP/single_game_scoring.csv'
path = '../TEMP/ss_example_3.csv'

def calculate_game_states_dataframe_ot_tie_corrected_v2(file_path):
    data = pd.read_csv(file_path)

    def period_elapsed_to_seconds(period, time_str):
        minutes, seconds = map(int, time_str.split(':'))
        elapsed_seconds = minutes * 60 + seconds
        
        if '1st' in period:
            return elapsed_seconds
        elif '2nd' in period:
            return 20 * 60 + elapsed_seconds
        elif '3rd' in period:
            return 40 * 60 + elapsed_seconds
        else:  # Overtime
            return 60 * 60 + elapsed_seconds

    data['Total_Seconds'] = data.apply(lambda row: period_elapsed_to_seconds(row['Period'], row['Time']), axis=1)

    teams = list(set(data['Home_Team'].unique().tolist() + data['Home_Team'].unique().tolist() + data['Away_Team'].unique().tolist()))
    team_states = {team: {'Tied': 0, 'Lead by 1': 0, 'Lead by 2': 0, 'Lead by 3+': 0,
                          'Down by 1': 0, 'Down by 2': 0, 'Down by 3+': 0} for team in teams}
    
    current_score = {teams[0]: 0, teams[1]: 0}
    last_timestamp = 0
    game_end = 3 * 20 * 60  # Default game end, will adjust if OT is detected
    overtime_played = False

    first_goal_time = data.sort_values(by='Total_Seconds').iloc[0]['Total_Seconds'] if not data.empty else 0
    team_states[teams[0]]['Tied'] = first_goal_time
    team_states[teams[1]]['Tied'] = first_goal_time
    last_timestamp = first_goal_time

    for index, row in data.sort_values(by='Total_Seconds').iterrows():
        current_score[row['Team']] += 1
        lead_diff = current_score[teams[0]] - current_score[teams[1]]
        time_elapsed = row['Total_Seconds'] - last_timestamp

        if 'OT' in row['Period']:
            overtime_played = True
            game_end = row['Total_Seconds']  # Adjust game end to when OT goal is scored

        if lead_diff == 0:
            for team in teams:
                team_states[team]['Tied'] += time_elapsed
        else:
            leading_team, trailing_team = (teams[0], teams[1]) if lead_diff > 0 else (teams[1], teams[0])
            lead_abs = abs(lead_diff)
            update_states = lambda lead: (f'Lead by {min(lead, 3)}+' if lead >= 3 else f'Lead by {lead}',
                                          f'Down by {min(lead, 3)}+' if lead >= 3 else f'Down by {lead}')
            state_lead, state_trail = update_states(lead_abs)
            team_states[leading_team][state_lead] += time_elapsed
            team_states[trailing_team][state_trail] += time_elapsed
        
        last_timestamp = row['Total_Seconds']

    # Determine if game ended in a tie without explicit OT goal and adjust game_end correctly
    lead_diff_final = current_score[teams[0]] - current_score[teams[1]]
    if lead_diff_final == 0 and not overtime_played:
        # Adjust for games that end in a tie without 'OT' period goals
        overtime_played = True
        game_end = 65 * 60  # Adjust game end to include 5 minutes of OT

    final_time_elapsed = game_end - last_timestamp
    if lead_diff_final == 0 and not data.empty:
        for team in teams:
            team_states[team]['Tied'] += final_time_elapsed

    team_states_df = pd.DataFrame.from_dict(team_states, orient='index').reset_index().rename(columns={'index': 'Team'})
    
    return team_states_df

# Re-generate the DataFrame with the corrected function for the new file
game_states_df_ot_tie_corrected_v2 = calculate_game_states_dataframe_ot_tie_corrected_v2(path)
game_states_df_ot_tie_corrected_v2


Unnamed: 0,Team,Tied,Lead by 1,Lead by 2,Lead by 3+,Down by 1,Down by 2,Down by 3+
0,St. Cloud State,2040,718,0,0,913,0,0
1,St. Thomas,2040,913,0,0,718,0,0


## Hotfix a problem from Cleaning script
Adding home/awy team name coulmns

In [33]:
file_path = '../TEMP/scoring_summary_current.csv'

season_data_t = pd.read_csv(file_path)

# Parse the Game_Id for home and away teams for new columns
# Away team is between 3rd and 4th dash, home team is after 4th dash
season_data_t['Away_Team'] = season_data_t['Game_ID'].apply(lambda x: x.split('-')[3])
season_data_t['Home_Team'] = season_data_t['Game_ID'].apply(lambda x: x.split('-')[4])

# Strip Home_Team and Away_Team of any leading or trailing whitespace
season_data_t['Home_Team'] = season_data_t['Home_Team'].str.strip()
season_data_t['Away_Team'] = season_data_t['Away_Team'].str.strip()

df = season_data_t

df.head(10)

Unnamed: 0,Period,Team,PP,Player,Player_Goals,Assist1,Assist2,Time,Game_ID,Away_Team,Home_Team
0,1st Period,Michigan State,,Nicolas Muller,1,Jeremy Davidson,David Gucciardi,18:25,2023-10-07-Lake Superior-Michigan State,Lake Superior,Michigan State
1,2nd Period,Michigan State,5x3,Isaac Howard,1,Nicolas Muller,,1:44,2023-10-07-Lake Superior-Michigan State,Lake Superior,Michigan State
2,2nd Period,Michigan State,,Nash Nienhuis,1,Tommi Männistö,Tanner Kelly,7:06,2023-10-07-Lake Superior-Michigan State,Lake Superior,Michigan State
3,2nd Period,Lake Superior,,Tyler Williams,1,Timo Bakos,Carter Batchelder,10:23,2023-10-07-Lake Superior-Michigan State,Lake Superior,Michigan State
4,2nd Period,Lake Superior,PS,Harrison Roy,1,,,15:50,2023-10-07-Lake Superior-Michigan State,Lake Superior,Michigan State
5,3rd Period,Michigan State,,Red Savage,1,Nash Nienhuis,Joey Larson,10:48,2023-10-07-Lake Superior-Michigan State,Lake Superior,Michigan State
6,3rd Period,Michigan State,,Viktor Hurtig,1,Red Savage,Gavin O'Connell,15:46,2023-10-07-Lake Superior-Michigan State,Lake Superior,Michigan State
7,1st Period,Clarkson,,Cody Monds,1,Ryan Taylor,Anthony Romano,15:47,2023-10-07-Clarkson-Notre Dame,Clarkson,Notre Dame
8,2nd Period,Notre Dame,,Landon Slaggert,1,Ryan Siedem,,4:54,2023-10-07-Clarkson-Notre Dame,Clarkson,Notre Dame
9,3rd Period,Clarkson,,Cody Monds,2,,,4:45,2023-10-07-Clarkson-Notre Dame,Clarkson,Notre Dame


In [34]:
# season_file_path = '../TEMP/scoring_summary_current.csv'

from tqdm import tqdm  # For progress tracking

# Function to process each game in the dataset and aggregate game states
def process_season_game_states(df):
    season_data = df
    unique_games = season_data['Game_ID'].unique()
    
    # Initialize an empty DataFrame for aggregating results
    season_aggregate_states = pd.DataFrame()
    
    for game_id in tqdm(unique_games, desc="Processing Games"):
        game_data = season_data[season_data['Game_ID'] == game_id]
        
        # Save individual game data to a temporary CSV to use with the existing function
        temp_game_path = '../TEMP/temp_game.csv'
        game_data.to_csv(temp_game_path, index=False)
        
        # Calculate game states for the current game
        game_states_df = calculate_game_states_dataframe_ot_tie_corrected_v2(temp_game_path)
        
        # Aggregate the results
        if season_aggregate_states.empty:
            season_aggregate_states = game_states_df
        else:
            season_aggregate_states = season_aggregate_states.merge(game_states_df, on='Team', how='outer')
            for state in ['Tied', 'Lead by 1', 'Lead by 2', 'Lead by 3+', 'Down by 1', 'Down by 2', 'Down by 3+']:
                season_aggregate_states[state] = season_aggregate_states[f'{state}_x'].fillna(0) + season_aggregate_states[f'{state}_y'].fillna(0)
                season_aggregate_states.drop([f'{state}_x', f'{state}_y'], axis=1, inplace=True)
    
    return season_aggregate_states

# Process the season dataset
season_aggregate_stats_df = process_season_game_states(df)

season_aggregate_stats_df


Processing Games: 100%|██████████| 1006/1006 [00:16<00:00, 60.62it/s]


Unnamed: 0,Team,Tied,Lead by 1,Lead by 2,Lead by 3+,Down by 1,Down by 2,Down by 3+
0,Lake Superior,26070.0,10711.0,0.0,23592.0,22871.0,0.0,24561.0
1,Michigan State,21816.0,14590.0,0.0,39388.0,15695.0,0.0,19047.0
2,Notre Dame,24942.0,15813.0,0.0,26485.0,17405.0,0.0,27757.0
3,Clarkson,25652.0,23159.0,0.0,18770.0,19421.0,0.0,23850.0
4,RIT,24600.0,22384.0,0.0,37331.0,13214.0,0.0,13079.0
...,...,...,...,...,...,...,...,...
59,Harvard,24768.0,14989.0,0.0,6518.0,13764.0,0.0,27598.0
60,Brown,28589.0,10978.0,0.0,12423.0,13604.0,0.0,28225.0
61,Yale,29455.0,9660.0,0.0,13918.0,10078.0,0.0,23353.0
62,Cornell,30676.0,14964.0,0.0,29218.0,10236.0,0.0,6830.0


In [35]:
# Sort by Down by 3+ time to find the teams that were trailing by 3+ goals the most
# season_aggregate_stats_df.sort_values(by='Down by 3+', ascending=True).head(10)


In [36]:
### Calculate the percentages for the different game states
# Calculate the total game time for each team
season_aggregate_stats_df['Total_Time'] = season_aggregate_stats_df['Tied'] + season_aggregate_stats_df['Lead by 1'] + season_aggregate_stats_df['Lead by 2'] + season_aggregate_stats_df['Lead by 3+'] + season_aggregate_stats_df['Down by 1'] + season_aggregate_stats_df['Down by 2'] + season_aggregate_stats_df['Down by 3+']

# create column of all leads and all deficits
season_aggregate_stats_df['All_Lead'] = season_aggregate_stats_df['Lead by 1'] + season_aggregate_stats_df['Lead by 2'] + season_aggregate_stats_df['Lead by 3+']
season_aggregate_stats_df['All_Deficit'] = season_aggregate_stats_df['Down by 1'] + season_aggregate_stats_df['Down by 2'] + season_aggregate_stats_df['Down by 3+']

# Calculate the percentages for each game state
for state in ['Tied', 'Lead by 1', 'Lead by 2', 'Lead by 3+', 'Down by 1', 'Down by 2', 'Down by 3+', 'All_Lead', 'All_Deficit']:
    season_aggregate_stats_df[f'PCT_{state}'] = season_aggregate_stats_df[state] / season_aggregate_stats_df['Total_Time']

# Display the DataFrame with the calculated percentages
# season_aggregate_stats_df

In [37]:
# Convert all time columns from seconds to minutes
for state in ['Tied', 'Lead by 1', 'Lead by 2', 'Lead by 3+', 'Down by 1', 'Down by 2', 'Down by 3+', 'All_Lead', 'All_Deficit', 'Total_Time']:
    season_aggregate_stats_df[state] = season_aggregate_stats_df[state] / 60

# Display the DataFrame with the times converted to minutes
season_aggregate_stats_df


Unnamed: 0,Team,Tied,Lead by 1,Lead by 2,Lead by 3+,Down by 1,Down by 2,Down by 3+,Total_Time,All_Lead,All_Deficit,PCT_Tied,PCT_Lead by 1,PCT_Lead by 2,PCT_Lead by 3+,PCT_Down by 1,PCT_Down by 2,PCT_Down by 3+,PCT_All_Lead,PCT_All_Deficit
0,Lake Superior,434.500000,178.516667,0.0,393.200000,381.183333,0.0,409.350000,1796.750000,571.716667,790.533333,0.241826,0.099355,0.0,0.218840,0.212152,0.0,0.227828,0.318195,0.439980
1,Michigan State,363.600000,243.166667,0.0,656.466667,261.583333,0.0,317.450000,1842.266667,899.633333,579.033333,0.197366,0.131993,0.0,0.356336,0.141990,0.0,0.172315,0.488330,0.314305
2,Notre Dame,415.700000,263.550000,0.0,441.416667,290.083333,0.0,462.616667,1873.366667,704.966667,752.700000,0.221900,0.140683,0.0,0.235627,0.154846,0.0,0.246944,0.376310,0.401790
3,Clarkson,427.533333,385.983333,0.0,312.833333,323.683333,0.0,397.500000,1847.533333,698.816667,721.183333,0.231408,0.208918,0.0,0.169325,0.175198,0.0,0.215152,0.378243,0.390349
4,RIT,410.000000,373.066667,0.0,622.183333,220.233333,0.0,217.983333,1843.466667,995.250000,438.216667,0.222407,0.202372,0.0,0.337507,0.119467,0.0,0.118246,0.539880,0.237713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,Harvard,412.800000,249.816667,0.0,108.633333,229.400000,0.0,459.966667,1460.616667,358.450000,689.366667,0.282620,0.171035,0.0,0.074375,0.157057,0.0,0.314913,0.245410,0.471970
60,Brown,476.483333,182.966667,0.0,207.050000,226.733333,0.0,470.416667,1563.650000,390.016667,697.150000,0.304725,0.117013,0.0,0.132415,0.145003,0.0,0.300845,0.249427,0.445848
61,Yale,490.916667,161.000000,0.0,231.966667,167.966667,0.0,389.216667,1441.066667,392.966667,557.183333,0.340662,0.111723,0.0,0.160969,0.116557,0.0,0.270089,0.272692,0.386646
62,Cornell,511.266667,249.400000,0.0,486.966667,170.600000,0.0,113.833333,1532.066667,736.366667,284.433333,0.333710,0.162787,0.0,0.317850,0.111353,0.0,0.074301,0.480636,0.185653


In [38]:
## Output the results to a CSV file
output_file_path = '../TEMP/lead_Trail_season_aggregate_stats.csv'
season_aggregate_stats_df.to_csv(output_file_path, index=False)