# SCRATCH BOOK

### Load scoring table and make table of goals scored first and last minute of periods as well as total goals scored

In [123]:
# Dependencies

# Basics
import os
import sys
import time
import sqlite3
import pandas as pd
from collections import defaultdict


from matplotlib import font_manager


## Load The Exo 2 font in case of problems with the font
font_path = 'C:/Windows/Fonts/Exo 2.tff'
locations =['../data/Exo_2'] # Font Location

font_files = font_manager.findSystemFonts(fontpaths=locations)

for file in font_files:
    font_manager.fontManager.addfont(file)

    ## Set the date to appear on the source area of plots (the last date of the data)
last_game_date = '2025-01-05'

## File Paths
folder_prefix = ''
# folder_prefix = '..'
data_folder = os.path.join(folder_prefix, '..', 'data/') # Data Folder Path
temp_folder = os.path.join(folder_prefix,'..', 'TEMP/',) # Temp Folder Path
TEMP_FOLDER = temp_folder # Temp Folder Path as used in legacy code
output_folder = os.path.join(temp_folder, 'team_comp_output/') # Output Folder Path
# data\db\2024_Dec_10_CLEANED_OLD_METHOD.db
db_path = os.path.join(data_folder, 'db', '2025_Jan_07_test2_ROUGH.db') # Database Path
# db_path = os.path.join(data_folder, 'db', '2024_Dec_03_v4_ROUGH.db') # Database Path
# db_path = os.path.join(temp_folder, '2024_Dec_03_v3_ROUGH.db') # Database Path
image_folder = os.path.join(folder_prefix, '..', 'images/') # Image Folder Path
logo_folder = os.path.join(folder_prefix, image_folder, 'logos/') # Logo Folder Path
conference_logo_folder = os.path.join(folder_prefix, logo_folder, 'conference') # Conference Logo Folder Path
export_folder = os.path.join(folder_prefix, image_folder, 'export/') # Export Folder Path
background_folder = os.path.join(folder_prefix, image_folder, 'background/') # Background Folder Path

# Other paths
school_info_path = os.path.join(data_folder, 'arena_school_info.csv') # School Info Path



In [124]:
## Load the database
conn = sqlite3.connect(db_path, isolation_level=None)

# convert string time to continuous time
## SQL query to fetch
def extract_goal_summary(conn):
    """
    Extracts and preprocesses the goal summary data from the database.
    """
    query = """
        WITH UniqueGoals AS (
        SELECT DISTINCT Game_ID, Team, Period, Time, PP
        FROM scoring_summary
    )
    SELECT * FROM UniqueGoals;
    """
    goal_df = pd.read_sql(query, conn)
    return goal_df

# Convert string time to continuous time
def convert_to_continuous_time(row):
    """
    Converts period-based time to a continuous format (0-65 minutes).
    """
    period_offsets = {'1st Period': 0, '2nd Period': 20, '3rd Period': 40, 'Overtime': 60}
    minutes, seconds = map(int, row['Time'].split(':'))
    offset = period_offsets.get(row['Period'], 0)
    return offset + minutes + seconds / 60.0

## Load the data
goal_data = extract_goal_summary(conn)
# Create a continuous time column
goal_data['Cont_Time'] = goal_data.apply(convert_to_continuous_time, axis=1)

goal_data.head()


Unnamed: 0,Game_ID,Team,Period,Time,PP,Cont_Time
0,2024-10-04-Michigan State-Lake Superior,Michigan State,1st Period,13:57,,13.95
1,2024-10-04-Michigan State-Lake Superior,Lake Superior,1st Period,16:00,,16.0
2,2024-10-04-Michigan State-Lake Superior,Michigan State,Overtime,0:39,3x3,60.65
3,2024-10-04-Minnesota State-Michigan,Michigan,2nd Period,0:33,PP,20.55
4,2024-10-04-Minnesota State-Michigan,Minnesota State,2nd Period,4:25,,24.416667


In [125]:
## Identify Empty Net Goals (EN in the PP column) and flag in a new column
goal_data['EN'] = goal_data['PP'].apply(lambda x: 'EN' in x)


In [126]:
## First minute of frame time ranges
# 0-1, 20-21, 40-41, 60-61

# Last minute of frame time ranges
# 19-20, 39-40, 59-60, 64-65

# Define the function for categorizing goal types
def goal_type_first_last(row):
    """
    Categorizes goals as first minute, last minute, or other.
    """
    # Check if the goal is in the first minute
    if (row['Cont_Time'] < 1 or  # Before the end of the first minute of the game
        (row['Cont_Time'] > 20 and row['Cont_Time'] < 21) or  # Between 20:00 and 21:00 (1st period)
        (row['Cont_Time'] > 40 and row['Cont_Time'] < 41) or  # Between 40:00 and 41:00 (2nd period)
        (row['Cont_Time'] > 60 and row['Cont_Time'] < 61)):  # Between 60:00 and 61:00 (3rd period)
        return 'First Minute'

    # Check if the goal is in the last minute of a period
    elif (row['Cont_Time'] > 19 and row['Cont_Time'] < 20 or  # Between 19:00 and 20:00 (1st period)
          (row['Cont_Time'] > 39 and row['Cont_Time'] < 40) or  # Between 39:00 and 40:00 (2nd period)
          (row['Cont_Time'] > 59 and row['Cont_Time'] < 60) or  # Between 59:00 and 60:00 (3rd period)
          (row['Cont_Time'] > 64 and row['Cont_Time'] < 65)):  # Between 64:00 and 65:00 (overtime)
        return 'Last Minute'

    # Otherwise, categorize as 'Other'
    else:
        return 'Other'


# Create a new column for the goal type
goal_data['Goal_Type'] = goal_data.apply(goal_type_first_last, axis=1)


# Check distribution of goal types
# goal_data['Goal_Type'].value_counts()

goal_data.head()

Unnamed: 0,Game_ID,Team,Period,Time,PP,Cont_Time,EN,Goal_Type
0,2024-10-04-Michigan State-Lake Superior,Michigan State,1st Period,13:57,,13.95,False,Other
1,2024-10-04-Michigan State-Lake Superior,Lake Superior,1st Period,16:00,,16.0,False,Other
2,2024-10-04-Michigan State-Lake Superior,Michigan State,Overtime,0:39,3x3,60.65,False,First Minute
3,2024-10-04-Minnesota State-Michigan,Michigan,2nd Period,0:33,PP,20.55,False,First Minute
4,2024-10-04-Minnesota State-Michigan,Minnesota State,2nd Period,4:25,,24.416667,False,Other


In [127]:
## Do the Same thing to flag the goals that happened in the first and last 2 minutes of the period

def goal_type_first2_last_2(row):
    """
    Flag the goals that happened in the first and last 2 minutes of the period
    """

    if (row['Cont_Time'] < 2 or  # Before the end of the first minute of the game
        (row['Cont_Time'] > 20 and row['Cont_Time'] < 22) or  # Between 20:00 and 21:00 (1st period)
        (row['Cont_Time'] > 40 and row['Cont_Time'] < 42) or  # Between 40:00 and 41:00 (2nd period)
        (row['Cont_Time'] > 60 and row['Cont_Time'] < 62)):  # Between 60:00 and 61:00 (3rd period)
        return 'First 2 Minutes'

    # Check if the goal is in the last 2 minutes of a period
    elif (row['Cont_Time'] > 18 and row['Cont_Time'] < 20 or  # Between 18:00 and 20:00 (1st period)
          (row['Cont_Time'] > 38 and row['Cont_Time'] < 40) or  # Between 38:00 and 40:00 (2nd period)
          (row['Cont_Time'] > 58 and row['Cont_Time'] < 60) or  # Between 58:00 and 60:00 (3rd period)
          (row['Cont_Time'] > 63 and row['Cont_Time'] < 65)):  # Between 63:00 and 65:00 (overtime)
        return 'Last 2 Minutes'
    else:
        return 'Other'

# Create a new column to flag the goals that happened in the first and last 2 minutes of the period
goal_data['Goal_Type_2'] = goal_data.apply(goal_type_first2_last_2, axis=1)

goal_data.head()
# Check distribution of goal types
goal_data['Goal_Type_2'].value_counts()


Goal_Type_2
Other              2486
Last 2 Minutes      517
First 2 Minutes     319
Name: count, dtype: int64

In [128]:
# Check if The EN goals are in the last minute and flag them as such in new column
goal_data['EN_Last_Minute'] = (goal_data['EN'] & (goal_data['Goal_Type'] == 'Last Minute'))
# DO the same for the last 2 minutes
goal_data['EN_Last_2_Minutes'] = (goal_data['EN'] & (goal_data['Goal_Type_2'] == 'Last 2 Minutes'))

goal_data.head()

Unnamed: 0,Game_ID,Team,Period,Time,PP,Cont_Time,EN,Goal_Type,Goal_Type_2,EN_Last_Minute,EN_Last_2_Minutes
0,2024-10-04-Michigan State-Lake Superior,Michigan State,1st Period,13:57,,13.95,False,Other,Other,False,False
1,2024-10-04-Michigan State-Lake Superior,Lake Superior,1st Period,16:00,,16.0,False,Other,Other,False,False
2,2024-10-04-Michigan State-Lake Superior,Michigan State,Overtime,0:39,3x3,60.65,False,First Minute,First 2 Minutes,False,False
3,2024-10-04-Minnesota State-Michigan,Michigan,2nd Period,0:33,PP,20.55,False,First Minute,First 2 Minutes,False,False
4,2024-10-04-Minnesota State-Michigan,Minnesota State,2nd Period,4:25,,24.416667,False,Other,Other,False,False


### Count High Impact Goals (Goals scored within a minute or 2 of another goal being scored)

In [129]:
# Sort the data by Game_ID and Cont_Time for chronological processing
goal_data_sorted = goal_data.sort_values(by=['Game_ID', 'Cont_Time']).reset_index(drop=True)

# Display the sorted data to verify
goal_data_sorted.head()

# Group data by Game_ID to analyze each game separately
grouped = goal_data_sorted.groupby('Game_ID')


# Debugging the logic and fixing the issue
# Reinitialize dictionaries to track counts
team_quick_responses = defaultdict(int)
opponent_quick_responses = defaultdict(int)

# Process each game individually again
for game_id, game_data in grouped:
    # Reset index for easier row iteration
    game_data = game_data.reset_index(drop=True)
    
    # Iterate through goals in this game
    for i in range(len(game_data)):
        current_team = game_data.loc[i, 'Team']
        current_time = game_data.loc[i, 'Cont_Time']
        
        # Compare with subsequent goals in the same game
        for j in range(i + 1, len(game_data)):
            next_team = game_data.loc[j, 'Team']
            next_time = game_data.loc[j, 'Cont_Time']
            
            # If the time difference is more than 1 minute, stop checking
            if next_time - current_time > 1:
                break
            
            # If the same team scores again within 1 minute
            if next_team == current_team:
                team_quick_responses[current_team] += 1
            
            # If the opposing team scores within 1 minute
            elif next_team != current_team:
                opponent_quick_responses[current_team] += 1

# Create the results DataFrame with updated column labels
quick_responses_df = pd.DataFrame({
    'Team': list(set(goal_data_sorted['Team'])),
    'HI_Back_to_Back': [team_quick_responses[team] for team in set(goal_data_sorted['Team'])],
    'HI_Quick_Response': [opponent_quick_responses[team] for team in set(goal_data_sorted['Team'])]
})

quick_responses_df.head()

Unnamed: 0,Team,HI_Back_to_Back,HI_Quick_Response
0,Ferris State,2,1
1,Stonehill,1,1
2,Northern Michigan,0,0
3,Canisius,2,2
4,Long Island,0,2


In [130]:
# Reinitialize dictionaries to track the new counts
team_quick_responses_2 = defaultdict(int)
opponent_quick_responses_2 = defaultdict(int)

# Process each game again for the new 2-minute interval logic
for game_id, game_data in grouped:
    # Reset index for easier row iteration
    game_data = game_data.reset_index(drop=True)
    
    # Iterate through goals in this game
    for i in range(len(game_data)):
        current_team = game_data.loc[i, 'Team']
        current_time = game_data.loc[i, 'Cont_Time']
        
        # Compare with subsequent goals in the same game
        for j in range(i + 1, len(game_data)):
            next_team = game_data.loc[j, 'Team']
            next_time = game_data.loc[j, 'Cont_Time']
            
            # If the time difference is more than 2 minutes, stop checking for 2-minute responses
            if next_time - current_time > 2:
                break
            
            # If the same team scores again within 2 minutes
            if next_team == current_team:
                team_quick_responses_2[current_team] += 1
            
            # If the opposing team scores within 2 minutes
            elif next_team != current_team:
                opponent_quick_responses_2[current_team] += 1

# Add the new columns to the results DataFrame
quick_responses_df['HI_Back_to_Back_2'] = [
    team_quick_responses_2[team] for team in set(goal_data_sorted['Team'])
]
quick_responses_df['HI_Quick_Response_2'] = [
    opponent_quick_responses_2[team] for team in set(goal_data_sorted['Team'])
]

# Display the updated results
quick_responses_df.head()


Unnamed: 0,Team,HI_Back_to_Back,HI_Quick_Response,HI_Back_to_Back_2,HI_Quick_Response_2
0,Ferris State,2,1,2,4
1,Stonehill,1,1,3,3
2,Northern Michigan,0,0,0,1
3,Canisius,2,2,3,7
4,Long Island,0,2,2,5


In [131]:
def enforce_column_presence(goal_tally, columns):
    """
    Ensure that all required columns are present in the goal tally DataFrame.
    If a column is missing, add it and fill with zeros.
    """
    for col in columns:
        if col not in goal_tally.columns:
            goal_tally[col] = 0
    return goal_tally

def robust_final_tally(goal_data):
    """
    Tally the number of goals by team, ensuring all expected columns are enforced explicitly.
    """
    # Define expected columns for each group type
    expected_columns_type = ['First Minute', 'Last Minute', 'Other']
    expected_columns_type2 = ['First 2 Minutes', 'Last 2 Minutes']

    # Group and tally goals by Goal_Type
    goal_tally_type = goal_data.groupby(['Team', 'Goal_Type']).size().unstack(fill_value=0).reset_index()

    # Ensure all expected columns are present for Goal_Type
    goal_tally_type = enforce_column_presence(goal_tally_type, expected_columns_type)

    # Group and tally goals by Goal_Type_2
    goal_tally_type2 = goal_data.groupby(['Team', 'Goal_Type_2']).size().unstack(fill_value=0).reset_index()

    # Ensure all expected columns are present for Goal_Type_2
    goal_tally_type2 = enforce_column_presence(goal_tally_type2, expected_columns_type2)

    # Merge both grouped DataFrames
    goal_tally = pd.merge(goal_tally_type, goal_tally_type2, on='Team', how='outer').fillna(0)

    # Ensure all columns in the final DataFrame
    all_expected_columns = ['Team'] + expected_columns_type + expected_columns_type2
    goal_tally = enforce_column_presence(goal_tally, all_expected_columns)

    # Calculate total goals
    goal_tally['Total Goals'] = (goal_tally['First Minute'] +
                                 
                                 goal_tally['Last Minute'] +
                                 
                                 goal_tally['Other_x'])

    # Calculate percentages
    goal_tally['Pct First Minute'] = goal_tally['First Minute'] / goal_tally['Total Goals'].replace(0, 1)
    goal_tally['Pct First 2 Minutes'] = goal_tally['First 2 Minutes'] / goal_tally['Total Goals'].replace(0, 1)
    goal_tally['Pct Last Minute'] = goal_tally['Last Minute'] / goal_tally['Total Goals'].replace(0, 1)
    goal_tally['Pct Last 2 Minutes'] = goal_tally['Last 2 Minutes'] / goal_tally['Total Goals'].replace(0, 1)

    # Tally EN Last Minute and EN Last 2 Minutes
    en_last_minute_tally = goal_data[goal_data['EN_Last_Minute']].groupby('Team').size()
    en_last_2_minute_tally = goal_data[goal_data['EN_Last_2_Minutes']].groupby('Team').size()

    # Add EN tallies to the final DataFrame
    goal_tally['EN Last Minute'] = goal_tally['Team'].map(en_last_minute_tally).fillna(0).astype(int)
    goal_tally['EN Last 2 Minutes'] = goal_tally['Team'].map(en_last_2_minute_tally).fillna(0).astype(int)

    return goal_tally

# Apply the function to the data
team_goal_tally = robust_final_tally(goal_data)

#### Merge the High Impact goal data with the Team Goal Tally

In [132]:
# Merge the High Impact Goal data with the goal tally data
team_goal_tally = pd.merge(team_goal_tally, quick_responses_df, on='Team', how='outer')

# Reorganize the table into the following order
# Team, Total Goals, First minute, Pct First Minute, First 2 Minutes, Pct First 2 Minutes, 
# Last Minute, EN Last Minute, Pct Last Minute, Last 2 Minutes, EN Last 2 Minutes, Pct Last 2 Minutes, HI Back-to-Back, 
# HI Quick Response, HI Back-to-Back 2, HI Quick Response 2

# Define the column order
column_order = ['Team', 'Total Goals', 'First Minute', 'Pct First Minute', 'First 2 Minutes', 'Pct First 2 Minutes',
                'Last Minute', 'EN Last Minute', 'Pct Last Minute', 'Last 2 Minutes', 'EN Last 2 Minutes', 'Pct Last 2 Minutes',
                'HI_Back_to_Back', 'HI_Quick_Response', 'HI_Back_to_Back_2', 'HI_Quick_Response_2']

# Reorder the columns
team_goal_tally = team_goal_tally[column_order]


# Display the final DataFrame
team_goal_tally.head()

Unnamed: 0,Team,Total Goals,First Minute,Pct First Minute,First 2 Minutes,Pct First 2 Minutes,Last Minute,EN Last Minute,Pct Last Minute,Last 2 Minutes,EN Last 2 Minutes,Pct Last 2 Minutes,HI_Back_to_Back,HI_Quick_Response,HI_Back_to_Back_2,HI_Quick_Response_2
0,Air Force,40,1,0.025,2,0.05,5,0,0.125,9,0,0.225,1,1,3,4
1,Alaska,41,0,0.0,2,0.04878,1,0,0.02439,3,0,0.073171,1,0,3,4
2,Alaska Anchorage,52,3,0.057692,4,0.076923,4,1,0.076923,10,3,0.192308,1,2,1,7
3,American Intl,51,0,0.0,4,0.078431,5,1,0.098039,5,1,0.098039,2,1,4,1
4,Arizona State,58,5,0.086207,6,0.103448,13,3,0.224138,16,4,0.275862,3,2,5,3


In [133]:
######### ORIGINAL CODE #########
## Tally the number of goals by team and goal type
# Output a Table with the following columns:
# Team, First Minute Goals, Last Minute Goals, Other Goals, Total Goals, Pct First Minute, Pct Last Minute
# def tally_goals_by_team(goal_data):
#     """
#     Tally the number of goals by team and goal type.
#     """
#     # Group by team and goal type
#     goal_tally = goal_data.groupby(['Team', 'Goal_Type']).size().unstack().reset_index()

#     # Fill in missing columns
#     goal_tally = goal_tally.fillna(0)

#     # Calculate total goals
#     goal_tally['Total Goals'] = goal_tally['First Minute'] + goal_tally['Last Minute'] + goal_tally['Other']

#     # Calculate percentages
#     goal_tally['Pct First Minute'] = goal_tally['First Minute'] / goal_tally['Total Goals']
#     goal_tally['Pct Last Minute'] = goal_tally['Last Minute'] / goal_tally['Total Goals']
#     goal_tally['Pct Other'] = goal_tally['Other'] / goal_tally['Total Goals']

#     # Tally EN Last Minute Goals
#     goal_tally['EN Last Minute'] = goal_data[goal_data['EN_Last_Minute']].groupby('Team').size()

#     return goal_tally

# # Tally the goals by team
# team_goal_tally = tally_goals_by_team(goal_data)

# team_goal_tally.head()

# Value Counts for EN Last Minute Goals
# goal_data['EN_Last_Minute'].value_counts()

# team_goal_tally['EN Last Minute'].value_counts()


In [134]:
# Sort by First Minute Goals
team_goal_tally = team_goal_tally.sort_values('First Minute', ascending=False)

# Sort by Last Minute Goals
team_goal_tally = team_goal_tally.sort_values('Last Minute', ascending=False)

# sort by percentage of other goals (decending)


# sort by total goals
team_goal_tally = team_goal_tally.sort_values('Total Goals', ascending=False)

team_goal_tally.head(20)

Unnamed: 0,Team,Total Goals,First Minute,Pct First Minute,First 2 Minutes,Pct First 2 Minutes,Last Minute,EN Last Minute,Pct Last Minute,Last 2 Minutes,EN Last 2 Minutes,Pct Last 2 Minutes,HI_Back_to_Back,HI_Quick_Response,HI_Back_to_Back_2,HI_Quick_Response_2
36,Minnesota,91,5,0.054945,12,0.131868,7,4,0.076923,16,7,0.175824,3,0,12,2
20,Denver,75,1,0.013333,3,0.04,6,1,0.08,12,3,0.16,0,0,1,0
27,Maine,69,5,0.072464,9,0.130435,10,2,0.144928,13,4,0.188406,3,1,7,2
34,Michigan State,68,6,0.088235,8,0.117647,5,2,0.073529,7,3,0.102941,3,0,8,2
62,Wisconsin,67,3,0.044776,5,0.074627,8,0,0.119403,14,1,0.208955,1,2,4,5
54,Sacred Heart,66,5,0.075758,8,0.121212,4,0,0.060606,9,3,0.136364,3,2,9,5
29,Massachusetts,65,1,0.015385,5,0.076923,6,0,0.092308,13,1,0.2,4,0,7,0
40,Niagara,65,5,0.076923,9,0.138462,5,1,0.076923,11,2,0.169231,2,1,3,3
49,Providence,65,4,0.061538,7,0.107692,4,1,0.061538,8,3,0.123077,0,2,2,4
45,Ohio State,64,4,0.0625,7,0.109375,7,1,0.109375,11,3,0.171875,2,1,8,1


In [135]:
#### LONG ISLAND UNIVERSITY #### 1-8-24 IS MISSING
### PROVIDENCE HAS ONE MORE GOAL THAN CHN TABLES #### 1-8-24
######## SOMETHING MIGHT BE UP WITH OHIO STATE ########
#### DOES NOT MATCH CHN TABLES #### 1-8-24

#### 1-8-24 - DISCOVERED THAT CHNS TEAM STATS TABLE IS NOT ACCURATE - OHIO STATE LISTED AS ONLY HAVING 58 GOALS, WHEN YOU CLICK THROUGH TO PLAYER BREAKDOWN THEY TALLY TO 64, JUST LIKE MY DATA HAS

In [136]:
breakpoint()

In [137]:
### Scorigami - Annimated Gif Code
import pandas as pd
import numpy as np
import os
import sys


image_folder = os.path.join('..', 'TEMP', 'IMAGES', 'stich_folder')


from PIL import Image, ImageSequence

import os
import numpy as np

import re  # For extracting numbers from filenames

# def create_animated_gif(image_folder, output_gif, total_duration=5, transition_frames=10):
#     """
#     Create an animated GIF from a sequence of images with fade transitions.
    
#     Parameters:
#     - image_folder: Path to the folder containing images (named with leading numbers, e.g., 1_*.png, 2_*.png).
#     - output_gif: Path for the output GIF.
#     - total_duration: Total duration of the animation in seconds.
#     - transition_frames: Number of intermediate frames for transitions between images.
#     """
#     # Load images sorted by the leading number in filenames
#     images = sorted(
#         [Image.open(os.path.join(image_folder, img)) for img in os.listdir(image_folder) if img.endswith(".png")],
#         key=lambda x: int(re.match(r"(\d+)", os.path.basename(x.filename)).group(1))  # Extract leading numbers
#     )

#     # Resize images to a suitable size while maintaining the aspect ratio
#     # Set max width and height
#     max_width = 800
#     max_height = 1080

#     for i, img in enumerate(images):
#         width, height = img.size
#         if width > max_width or height > max_height:
#             # Resize the image
#             ratio = min(max_width / width, max_height / height)
#             new_size = (int(width * ratio), int(height * ratio))
#             images[i] = img.resize(new_size, Image.Resampling.LANCZOS)

    



#     # Calculate total frames and duration per frame
#     num_images = len(images)
#     frames_per_image = total_duration * 1000 // (num_images + (num_images - 1) * transition_frames)
#     frame_duration = int(frames_per_image)  # Duration of each frame in milliseconds

#     all_frames = []
#     for i in range(num_images - 1):
#         # Add the current image
#         all_frames.append(images[i])
        
#         # Create transition frames (fade to next image)
#         for t in range(1, transition_frames + 1):
#             alpha = t / (transition_frames + 1)
#             blend_frame = Image.blend(images[i], images[i + 1], alpha)
#             all_frames.append(blend_frame)

#     # Add the final image
#     all_frames.append(images[-1])

#     # Save all frames as a GIF
#     all_frames[0].save(
#         output_gif,
#         save_all=True,
#         append_images=all_frames[1:],
#         duration=frame_duration,
#         loop=1
#     )

# def create_animated_gif(image_folder, output_gif, total_duration=5, transition_frames=10, reverse_order=False):
#     """
#     Create an animated GIF from a sequence of images with fade transitions.

#     Parameters:
#     - image_folder: Path to the folder containing images (named with leading numbers, e.g., 1_*.png, 2_*.png).
#     - output_gif: Path for the output GIF.
#     - total_duration: Total duration of the animation in seconds.
#     - transition_frames: Number of intermediate frames for transitions between images.
#     - reverse_order: If True, creates the GIF in reverse order.
#     """
#     # Load images sorted by the leading number in filenames
#     images = sorted(
#         [Image.open(os.path.join(image_folder, img)) for img in os.listdir(image_folder) if img.endswith(".png")],
#         key=lambda x: int(re.match(r"(\d+)", os.path.basename(x.filename)).group(1))  # Extract leading numbers
#     )

#     # Reverse the order if requested
#     if reverse_order:
#         images.reverse()

#     # Resize images to a suitable size while maintaining the aspect ratio
#     max_width = 800
#     max_height = 1080
#     for i, img in enumerate(images):
#         width, height = img.size
#         if width > max_width or height > max_height:
#             ratio = min(max_width / width, max_height / height)
#             new_size = (int(width * ratio), int(height * ratio))
#             images[i] = img.resize(new_size, Image.Resampling.LANCZOS)

#     # Calculate total frames and duration per frame
#     num_images = len(images)
#     frames_per_image = total_duration * 1000 // (num_images + (num_images - 1) * transition_frames)
#     frame_duration = int(frames_per_image)

#     all_frames = []
#     for i in range(num_images - 1):
#         all_frames.append(images[i])
#         for t in range(1, transition_frames + 1):
#             alpha = t / (transition_frames + 1)
#             blend_frame = Image.blend(images[i], images[i + 1], alpha)
#             all_frames.append(blend_frame)

#     all_frames.append(images[-1])

#     # Save all frames as a GIF
#     all_frames[0].save(
#         output_gif,
#         save_all=True,
#         append_images=all_frames[1:],
#         duration=frame_duration,
#         loop=0
#     )

def create_animated_gif(image_folder, output_gif, total_duration=5, transition_frames=10, reverse_order=False):
    """
    Create an animated GIF with smooth transitions and precise duration control.

    Parameters:
    - image_folder: Path to the folder containing images (named with leading numbers, e.g., 1_*.png, 2_*.png).
    - output_gif: Path for the output GIF.
    - total_duration: Total duration of the animation in seconds.
    - transition_frames: Number of intermediate frames for transitions between images.
    - reverse_order: If True, creates the GIF in reverse order.
    """
    import os
    from PIL import Image
    import re

    # Load images sorted by the leading number in filenames
    images = sorted(
        [Image.open(os.path.join(image_folder, img)) for img in os.listdir(image_folder) if img.endswith(".png")],
        key=lambda x: int(re.match(r"(\d+)", os.path.basename(x.filename)).group(1))
    )

    # Reverse the order if requested
    if reverse_order:
        images.reverse()

    # Resize images to maintain aspect ratio
    max_width, max_height = 800, 1080
    for i, img in enumerate(images):
        width, height = img.size
        if width > max_width or height > max_height:
            ratio = min(max_width / width, max_height / height)
            new_size = (int(width * ratio), int(height * ratio))
            images[i] = img.resize(new_size, Image.Resampling.LANCZOS)

    # Total frames calculation
    num_images = len(images)
    total_frames = num_images + (num_images - 1) * transition_frames

    # Frame duration in milliseconds
    frame_duration = (total_duration * 1000) // total_frames

    all_frames = []
    durations = []

    for i in range(num_images - 1):
        # Add the current image as a static frame
        all_frames.append(images[i])
        durations.append(frame_duration)

        # Create transition frames
        for t in range(1, transition_frames + 1):
            alpha = t / (transition_frames + 1)
            blend_frame = Image.blend(images[i], images[i + 1], alpha)
            all_frames.append(blend_frame)
            durations.append(frame_duration)

    # Add the final image
    all_frames.append(images[-1])
    durations.append(frame_duration)

    # Save all frames as a GIF
    all_frames[0].save(
        output_gif,
        save_all=True,
        append_images=all_frames[1:],
        duration=durations,
        # loop=0  # Infinite loop
        loop=1  # Loop once
    )


# # image_folder = "/path/to/your/image/folder"
output_gif_reverse = os.path.join('..', 'TEMP', 'IMAGES', 'stich_folder', 'scorigami_all_time_reverse_animated.gif')
output_gif = os.path.join('..', 'TEMP', 'IMAGES', 'stich_folder', 'scorigami_all_time_animated.gif')
# total_duration = 20  # Total duration of the animation in seconds
# transition_frames = 15  # Number of fade frames per transition



## Reverse Order
create_animated_gif(image_folder, output_gif_reverse, total_duration=10, transition_frames=0, reverse_order=False)

## Reverse Order
create_animated_gif(image_folder, output_gif, total_duration=10, transition_frames=0, reverse_order=True)


# # Example Usage
# # Set the folder containing images and output GIF path


# create_animated_gif(image_folder, output_gif, total_duration, transition_frames)




In [138]:
import pandas as pd
import numpy as np
import os
import sys

# Load Schedult/results data and compare conferences
path = os.path.join('..', 'data', 'schedule', 'Week 1 Scores.csv')

# Load the data
schedule_df = pd.read_csv(path)

# filter out exhibition games
schedule_df = schedule_df[schedule_df['Conference'] != 'Exhibition']
# Clean up Team names (remove ' and periods)
schedule_df['Away_Team'] = schedule_df['Away_Team'].str.replace("'", "").str.replace(".", "")
schedule_df['Home_Team'] = schedule_df['Home_Team'].str.replace("'", "").str.replace(".", "")
# strip leading and trailing spaces
schedule_df['Away_Team'] = schedule_df['Away_Team'].str.strip()
schedule_df['Home_Team'] = schedule_df['Home_Team'].str.strip()
# Drop any rows containing a / or TBD
schedule_df = schedule_df[~schedule_df['Away_Team'].str.contains('/')]
schedule_df = schedule_df[~schedule_df['Home_Team'].str.contains('/')]
schedule_df = schedule_df[~schedule_df['Away_Team'].str.contains('TBD')]
schedule_df = schedule_df[~schedule_df['Home_Team'].str.contains('TBD')]




# Define the conferences
conference_teams = {
    'Atlantic': ['Air Force', "American Intl", 'Army', 'Bentley', 'Canisius', 'Holy Cross', 'Mercyhurst', 
                 'Niagara', 'RIT', 'Robert Morris', 'Sacred Heart'],
    'Big Ten': ['Michigan', 'Michigan State', 'Minnesota', 'Notre Dame', 'Ohio State', 'Penn State', 'Wisconsin'],
    'CCHA': ['Augustana', 'Bemidji State', 'Bowling Green', 'Ferris State', 'Lake Superior', 'Michigan Tech', 
             'Minnesota State', 'Northern Michigan', 'St Thomas'],
    'ECAC': ['Brown', 'Clarkson', 'Colgate', 'Cornell', 'Dartmouth', 'Harvard', 'Princeton', 'Quinnipiac',
             'Rensselaer', 'St Lawrence', 'Union', 'Yale'],
    'Hockey East': ['Boston College', 'Boston University', 'Connecticut', 'Maine', 'Massachusetts', 'Mass Lowell',
                    'Merrimack', 'New Hampshire', 'Northeastern', 'Providence', 'Vermont'],
    'NCHC': ['Arizona State', 'Colorado College', 'Denver', 'Miami', 'Minnesota Duluth', 'North Dakota', 'Omaha', 
             'St Cloud State', 'Western Michigan'],
    'Independents': ['Alaska Anchorage', 'Alaska', 'Lindenwood', 'Long Island', 'Stonehill']
}

# Function to get the conference of a team
def get_conference(team):
    for conference, teams in conference_teams.items():
        if team in teams:
            return conference
    return 'Unknown'  # For teams not in the provided lists

# Add columns for conference of both the away and home teams
schedule_df['Away_Conference'] = schedule_df['Away_Team'].apply(get_conference)
schedule_df['Home_Conference'] = schedule_df['Home_Team'].apply(get_conference)

# Drop rows with Unknown conferences - Stonehill and Long Island annonmaly
schedule_df = schedule_df[schedule_df['Away_Conference'] != 'Unknown']
schedule_df = schedule_df[schedule_df['Home_Conference'] != 'Unknown']

# Rename to completed_games_df
completed_games_df = schedule_df

# Matrix for away team wins
away_wins_matrix = pd.crosstab(index=completed_games_df['Away_Conference'],
                               columns=completed_games_df['Home_Conference'],
                               values=(completed_games_df['Away_Score'] > completed_games_df['Home_Score']).astype(int),
                               aggfunc='sum', dropna=False)

# Matrix for home team wins
home_wins_matrix = pd.crosstab(index=completed_games_df['Home_Conference'],
                               columns=completed_games_df['Away_Conference'],
                               values=(completed_games_df['Home_Score'] > completed_games_df['Away_Score']).astype(int),
                               aggfunc='sum', dropna=False)

# Transpose the home wins matrix so that it aligns with the away wins matrix for summation
home_wins_matrix = home_wins_matrix.T

# Sum both matrices to get the total wins
total_wins_matrix = away_wins_matrix.add(home_wins_matrix, fill_value=0)
# total_wins_matrix = total_wins_matrix.astype(int) # Convert to integers


# Display the results matrix
print(total_wins_matrix)
# calculate and print the total number of games played
total_games = total_wins_matrix.sum().sum()
print(f'Total games played: {total_games}')



Home_Conference  Atlantic  Big Ten  CCHA  ECAC  Hockey East  Independents  \
Away_Conference                                                             
Atlantic                0        0     0     5            2             0   
Big Ten                 0        0     2     0            0             2   
CCHA                    2        2     1     0            0             0   
ECAC                    0        0     0     0            2             0   
Hockey East             1        0     0     0            0             0   
Independents            0        0     0     0            1             0   
NCHC                    2        0     1     0            0             2   

Home_Conference  NCHC  
Away_Conference        
Atlantic            0  
Big Ten             0  
CCHA                1  
ECAC                0  
Hockey East         0  
Independents        0  
NCHC                0  
Total games played: 26


##### Data Check

In [139]:
# # Find the team without a conference
# # Display rows with 'Unknown' in either column
# unknown_teams = schedule_df[(schedule_df['Away_Conference'] == 'Unknown') | (schedule_df['Home_Conference'] == 'Unknown')]
# print(len(unknown_teams))  # Number of rows with unknown teams

# # value count of unknown teams
# print(unknown_teams['Away_Team'].value_counts())
# print(unknown_teams['Home_Team'].value_counts())


### Join 2023 Player Stats to 2024 Rosters

In [140]:
import pandas as pd
import numpy as np
import os
import sys

## Path to the data
roster_path = os.path.join("..", "data", "roster_2024_current_v3.csv")
stat_path = os.path.join("..", "data", "player_stats_2023_v1.csv")

# Load the data
roster_df = pd.read_csv(roster_path)
stat_df = pd.read_csv(stat_path)

# Check the data
roster_df.head()
# stat_df.head()

Unnamed: 0,Current Team,Last_Name,First_Name,No,Position,Yr,Ht,Wt,DOB,Hometown,Height_Inches,Draft_Year,NHL_Team,D_Round,Last Team,League,City,State_Province,Country
0,Lake Superior,Barone,Adam,6,Defensemen,Fr,1-Jun,174,5/6/2004,"Sault Ste. Marie, Ont.",73,,,,Trail,BCHL,Sault Ste. Marie,Ontario,Canada
1,Lake Superior,Blanchett,Jack,16,Defensemen,So,11-May,185,5/12/2003,"Monroe, Mich.",71,,,,Powell,BCHL,Monroe,Michigan,USA
2,Lake Superior,Brown,Mike,3,Defensemen,Jr,2-Jun,209,4/3/2001,"Belmont, Mass.",74,,,,Merrimack,,Belmont,Massachusetts,USA
3,Lake Superior,Bushy,Evan,5,Defensemen,So,1-Jun,195,3/26/2002,"Mankato, Minn.",73,,,,Trail,BCHL,Mankato,Minnesota,USA
4,Lake Superior,Conrad,Jacob,4,Defensemen,Fr,11-May,180,5/18/2002,"Green Bay, Wis.",71,,,,Fairbanks,NAHL,Green Bay,Wisconsin,USA


#### Data Transform

In [141]:
# Split stats Clean_Player into first and last name
stat_df['First_Name'] = stat_df['Clean_Player'].str.split(" ").str[0]
stat_df['Last_Name'] = stat_df['Clean_Player'].str.split(" ").str[1:]


stat_df['Last_Name'] = stat_df['Last_Name'].str[0].str.replace('[','').str.replace(']','') # Remove the brackets from the last name
# Remove periods dashes ect from both names
stat_df['First_Name'] = stat_df['First_Name'].str.replace('.','').str.replace('-',' ')
stat_df['Last_Name'] = stat_df['Last_Name'].str.replace('.','').str.replace('-',' ')
roster_df['First_Name'] = roster_df['First_Name'].str.replace('.','').str.replace('-',' ')
roster_df['Last_Name'] = roster_df['Last_Name'].str.replace('.','').str.replace('-',' ')
# strip white space
stat_df['First_Name'] = stat_df['First_Name'].str.strip()
stat_df['Last_Name'] = stat_df['Last_Name'].str.strip()
roster_df['First_Name'] = roster_df['First_Name'].str.strip()
roster_df['Last_Name'] = roster_df['Last_Name'].str.strip()

# Rename Team to Team_2023 for clarity
stat_df.rename(columns={'Team':'Team_2023'}, inplace=True)
# Rename Current_Team to Team_2024 for clarity
roster_df.rename(columns={'Current Team':'Team_2024'}, inplace=True)

stat_df.head()
# OUTPUT THE DATA TO TEMP CSVs
roster_df.to_csv(os.path.join("..", "TEMP", "TEST_roster_2024_current_v4.csv"), index=False)
stat_df.to_csv(os.path.join("..", "TEMP", "TEST_player_stats_2023_v2.csv"), index=False)

In [142]:
## Try a quick merge
merged_df = pd.merge(roster_df, stat_df, left_on=['First_Name', 'Last_Name'], right_on=['First_Name', 'Last_Name'], how='outer', suffixes=('_2024', '_2023'))
merged_df.head()

# Print report of the merge
print(f"Number of players in the roster: {len(roster_df)}")
print(f"Number of players in the stats: {len(stat_df)}")
print(f"Number of players in the merged data: {len(merged_df)}")


# Find Number Number of players whos Team_2023 does not match Team_2024
mismatched_teams = merged_df[merged_df['Team_2023'] != merged_df['Team_2024']]
print(f"Number of players with mismatched teams: {len(mismatched_teams)}")




print(merged_df.info())
merged_df.head()

Number of players in the roster: 1820
Number of players in the stats: 1729
Number of players in the merged data: 2309
Number of players with mismatched teams: 1374
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2309 entries, 0 to 2308
Data columns (total 28 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Team_2024       1825 non-null   object 
 1   Last_Name       2309 non-null   object 
 2   First_Name      2309 non-null   object 
 3   No              1825 non-null   float64
 4   Position        1825 non-null   object 
 5   Yr              1825 non-null   object 
 6   Ht              1822 non-null   object 
 7   Wt              1825 non-null   float64
 8   DOB             1735 non-null   object 
 9   Hometown        1825 non-null   object 
 10  Height_Inches   1825 non-null   float64
 11  Draft_Year      225 non-null    float64
 12  NHL_Team        225 non-null    object 
 13  D_Round         225 non-null    float64
 14  Last

Unnamed: 0,Team_2024,Last_Name,First_Name,No,Position,Yr,Ht,Wt,DOB,Hometown,...,Country,Clean_Player,Team_2023,G,A,Pts,plus_minus,Sh,PIM,Games_Played
0,Long Island,Casperson,AJ,3.0,Defensemen,Jr,2-Jun,190.0,7/19/2001,"Flower Mound, Texas",...,USA,AJ Casperson,Long Island,0.0,1.0,1.0,1.0,7.0,2.0,12.0
1,Bentley,Hodges,AJ,20.0,Forwards,Gr,Jun-00,175.0,8/24/2001,"Littleton, Colo.",...,USA,A.J. Hodges,Bentley,6.0,9.0,15.0,-1.0,57.0,2.0,29.0
2,Bemidji State,Macaulay,AJ,12.0,Defensemen,Sr,9-May,185.0,4/12/2002,"Bonnyville, Alb.",...,Canada,A.J. Macaulay,Alaska,5.0,10.0,15.0,9.0,44.0,14.0,34.0
3,Quinnipiac,Bohlinger,Aaron,5.0,Defensemen,Gr,9-May,165.0,8/25/2000,"Walden, N.Y.",...,USA,Aaron Bohlinger,Massachusetts,3.0,5.0,8.0,1.0,22.0,4.0,34.0
4,Long Island,Grounds,Aaron,23.0,Forwards,Sr,2-Jun,190.0,12/24/1999,"Jamestown, N.D.",...,USA,Aaron Grounds,Long Island,1.0,2.0,3.0,-5.0,14.0,16.0,11.0


In [143]:
## Drop players who aren't playing this year (No Team_2024)
merged_df = merged_df.dropna(subset=['Team_2024'])
print(merged_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 1825 entries, 0 to 2308
Data columns (total 28 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Team_2024       1825 non-null   object 
 1   Last_Name       1825 non-null   object 
 2   First_Name      1825 non-null   object 
 3   No              1825 non-null   float64
 4   Position        1825 non-null   object 
 5   Yr              1825 non-null   object 
 6   Ht              1822 non-null   object 
 7   Wt              1825 non-null   float64
 8   DOB             1735 non-null   object 
 9   Hometown        1825 non-null   object 
 10  Height_Inches   1825 non-null   float64
 11  Draft_Year      225 non-null    float64
 12  NHL_Team        225 non-null    object 
 13  D_Round         225 non-null    float64
 14  Last Team       1815 non-null   object 
 15  League          1767 non-null   object 
 16  City            1825 non-null   object 
 17  State_Province  1825 non-null   object

In [144]:


# Convert all number columns to int
int_columns = ['No', 'Height_Inches', 'Wt', 'Draft_Year', 'D_Round', 
               'G', 'A', 'Pts', 'plus_minus', 'Sh', 'PIM', 'Games_Played']

for col in int_columns:
    merged_df[col] = merged_df[col].astype('Int64')

merged_df.head()

Unnamed: 0,Team_2024,Last_Name,First_Name,No,Position,Yr,Ht,Wt,DOB,Hometown,...,Country,Clean_Player,Team_2023,G,A,Pts,plus_minus,Sh,PIM,Games_Played
0,Long Island,Casperson,AJ,3,Defensemen,Jr,2-Jun,190,7/19/2001,"Flower Mound, Texas",...,USA,AJ Casperson,Long Island,0,1,1,1,7,2,12
1,Bentley,Hodges,AJ,20,Forwards,Gr,Jun-00,175,8/24/2001,"Littleton, Colo.",...,USA,A.J. Hodges,Bentley,6,9,15,-1,57,2,29
2,Bemidji State,Macaulay,AJ,12,Defensemen,Sr,9-May,185,4/12/2002,"Bonnyville, Alb.",...,Canada,A.J. Macaulay,Alaska,5,10,15,9,44,14,34
3,Quinnipiac,Bohlinger,Aaron,5,Defensemen,Gr,9-May,165,8/25/2000,"Walden, N.Y.",...,USA,Aaron Bohlinger,Massachusetts,3,5,8,1,22,4,34
4,Long Island,Grounds,Aaron,23,Forwards,Sr,2-Jun,190,12/24/1999,"Jamestown, N.D.",...,USA,Aaron Grounds,Long Island,1,2,3,-5,14,16,11


In [145]:
## OUTPUT CSV TO TEMP FOR INSPECTION
merged_df.to_csv(os.path.join("..", "data", "roster_2024_with_2023_stats.csv"), index=False)

### Transform congressional demographic data

In [146]:
# import pandas as pd
# import numpy as np
# import os
# import geopandas as gpd


# ## PATHS ##
# # ## 118 Congress Shapefile
# # shape_path = os.path.join('..', 'data', 'vault', '118th_congress', 'USA_118th_Congressional_Districts.shp')
# # ## Load Shapefile
# # gdf = gpd.read_file(shape_path)

# # Income data table - 5 Year Average 2022
# income_path = os.path.join('..', 'data', 'vault', '118th_congress', 'income_data', 'ACSST5Y2022.S1903-Data.csv')
# income_df = pd.read_csv(income_path, skiprows=1) # Load Income Data

# # Summary table with Populations and Representative Names
# summary_path = os.path.join('..', 'data', 'vault', 'USA_118th_Congressional_Districts_info_table.csv')
# summary_df = pd.read_csv(summary_path)

# # Check 
# # gdf.head()
# # income_df.head()
# # summary_df.head()


In [147]:
# # Check 
# gdf.head()
# income_df.head()
# # summary_df.head()

### Manipulate Image icons so they are all 300 x 300 px squares
- making sure they are all squares will make resizing issues easier later on
    - The aspect ratio is getting screwed up during resizing for icons that are not square

In [148]:
from PIL import Image, ImageOps
import os

# Directory where the logos are stored
logo_dir = os.path.join('..', 'images', 'logos')

# Make logos square by adding transparent space equally on both sides
for logo_file in os.listdir(logo_dir):
    logo_path = os.path.join(logo_dir, logo_file)
    
    # Check if the path is a file and not a directory
    if os.path.isfile(logo_path):
        with Image.open(logo_path) as img:
            # Ensure the image has an alpha channel (for transparency)
            img = img.convert("RGBA")
            
            width, height = img.size
            
            # If the image is already square, no changes are needed
            if width == height:
                continue
            
            # Calculate padding to add on the shorter side to make the image square
            if width > height:
                padding = (width - height) // 2
                new_img = ImageOps.expand(img, border=(0, padding, 0, padding), fill=(0, 0, 0, 0))
            else:
                padding = (height - width) // 2
                new_img = ImageOps.expand(img, border=(padding, 0, padding, 0), fill=(0, 0, 0, 0))
            
            # Save the padded square image, overwriting the original
            new_img.save(logo_path)

print("All logos made square by adding transparent space equally to each side.")


All logos made square by adding transparent space equally to each side.


## Verify the coordinates for the rinks in arena_info

In [149]:
# # Dependencies
# import os
# import requests
# import pandas as pd

# # Path to arena file
# arena_file = os.path.join('..','data', 'arena_school_info.csv')
# arena_df = pd.read_csv(arena_file)

# # Open Roster File To Clean State/Provences Names
# roster_file = os.path.join('..','data', 'roster_2024_current_v2.csv')
# roster_df = pd.read_csv(roster_file)

# roster_df.head()


In [150]:
# ## Get list of Unique State/Province Names
# unique_states = roster_df['State_Province'].unique()
# unique_states

# ## Dictionary to standardize state/province names

# standardized_locations = {
#     'Ont.': 'Ontario', 'Mich.': 'Michigan', 'Mass.': 'Massachusetts', 'Minn.': 'Minnesota', 
#     'Wis.': 'Wisconsin', 'Sweden': 'Sweden', 'Germany': 'Germany', 'B.C.': 'British Columbia',
#     'N.Y.': 'New York', 'Wash.': 'Washington', 'Que.': 'Quebec', 'Alb.': 'Alberta', 
#     'N.J.': 'New Jersey', 'Sask.': 'Saskatchewan', 'Conn.': 'Connecticut', 'Mo.': 'Missouri',
#     'Texas': 'Texas', 'Calif.': 'California', 'DC': 'District of Columbia', 'Fla.': 'Florida',
#     'Ohio': 'Ohio', 'Ill.': 'Illinois', 'Pa.': 'Pennsylvania', 'Ga.': 'Georgia',
#     'Mont.': 'Montana', 'Tenn.': 'Tennessee', 'Colo.': 'Colorado', 'Va.': 'Virginia', 
#     'Vt.': 'Vermont', 'R.I.': 'Rhode Island', 'Md.': 'Maryland', 'Ariz.': 'Arizona', 
#     'Wisc.': 'Wisconsin', 'Iowa': 'Iowa', 'Man.': 'Manitoba', 'Slovakia': 'Slovakia', 
#     'N.D.': 'North Dakota', 'N.C.': 'North Carolina', 'P.E.I.': 'Prince Edward Island',
#     'N.H.': 'New Hampshire', 'Alaska': 'Alaska', 'Belarus': 'Belarus', 'MB': 'Manitoba',
#     'Russia': 'Russia', 'Finland': 'Finland', 'Newf.': 'Newfoundland and Labrador', 
#     'Hungary': 'Hungary', 'SUI': 'Switzerland', 'S.C.': 'South Carolina', 'Latvia': 'Latvia',
#     'Czech Republic': 'Czech Republic', 'N.B.': 'New Brunswick', 'Great Britain': 'United Kingdom', 
#     'NB': 'New Brunswick', 'Norway': 'Norway', 'N.S.': 'Nova Scotia', 'Ind.': 'Indiana', 
#     'NWT': 'Northwest Territories', 'AUT': 'Austria', 'Idaho': 'Idaho', 'S.D.': 'South Dakota', 
#     'Switzerland': 'Switzerland', 'Ore.': 'Oregon', 'Wyo.': 'Wyoming', 'Utah': 'Utah', 
#     'ITA': 'Italy', 'Slovenia': 'Slovenia', 'YT': 'Yukon', 'Del.': 'Delaware', 'Maine': 'Maine',
#     'Poland': 'Poland', 'Yukon': 'Yukon', 'Ukraine': 'Ukraine', 'Japan': 'Japan', 'Neb.': 'Nebraska'
# }

# ## Apply the standardization to the State/Province column
# roster_df['State_Province'] = roster_df['State_Province'].replace(standardized_locations)

# # Check the unique values after standardization
# roster_df['State_Province'].unique()
# print(roster_df['State_Province'].unique())

In [151]:
## Output the cleaned roster to a new CSV file
cleaned_roster_file = os.path.join('..','data', 'roster_cleaned_state_prov_2024.csv')
roster_df.to_csv(cleaned_roster_file, index=False)

In [152]:
# import requests
# import pandas as pd

# import requests
# import pandas as pd

# # Define the function to check the location using Google Places API
# def check_location(lat, lng, api_key):
#     url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
#     params = {
#         'location': f'{lat},{lng}',
#         'radius': 500,  # Distance in meters from the provided coordinates
#         'type': 'stadium',  # Filter search to stadiums/arenas
#         'key': api_key
#     }
    
#     # Debugging: Print the URL and parameters
#     print(f"Requesting URL: {url}")
#     print(f"Parameters: {params}")
    
#     response = requests.get(url, params=params)
    
#     # Debugging: Print the response code and content
#     print(f"Response status code: {response.status_code}")
#     print(f"Response content: {response.text}")
    
#     if response.status_code == 200:
#         results = response.json().get('results')
#         if results:
#             return results[0].get('name'), results[0].get('vicinity')
#         else:
#             return None, "No results found"
#     else:
#         return None, f"API request failed with status {response.status_code}"

# # Define the function to verify coordinates in the DataFrame
# def verify_coordinates(df, api_key):
#     results = []
#     for index, row in df.iterrows():
#         lat = row['Latitude']
#         lng = row['Longitude']
#         arena_name = row['Arena']
        
#         # Debugging: Print the current coordinates and arena being checked
#         print(f"Checking coordinates for arena: {arena_name}")
#         print(f"Latitude: {lat}, Longitude: {lng}")
        
#         # Get the name and vicinity of the nearest stadium/arena
#         name, vicinity = check_location(lat, lng, api_key)
        
#         # Append the original data and verification results
#         results.append({
#             'Arena': arena_name,
#             'Latitude': lat,
#             'Longitude': lng,
#             'Google Places Name': name,
#             'Vicinity': vicinity
#         })
#     return pd.DataFrame(results)

# # Load your API key


# # Assuming arena_df is your DataFrame
# verified_df = verify_coordinates(arena_df, api_key)

# # Output the results
# print(verified_df)


In [153]:
# verified_df.head(10)

In [154]:
# ## Version 2 of Arena Location verifications
# ## Returns 5 closest Google Places to coordinates given

# import requests
# import pandas as pd

# # Define the function to check the 5 closest places using Google Places API
# def check_nearby_places(lat, lng, api_key):
#     url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
#     params = {
#         'location': f'{lat},{lng}',
#         'radius': 500,  # Distance in meters from the provided coordinates
#         'key': api_key
#     }
    
#     # Debugging: Print the URL and parameters being sent to the API
#     print(f"Requesting places near lat: {lat}, lng: {lng}")
#     print(f"Request URL: {url}")
#     print(f"Parameters: {params}")
    
#     response = requests.get(url, params=params)
    
#     # Debugging: Print the response status and content
#     print(f"Response status code: {response.status_code}")
#     print(f"Response content: {response.text}\n")  # This shows the full response from the API
    
#     if response.status_code == 200:
#         results = response.json().get('results')
#         if results:
#             # Return the top 5 closest places
#             return [(result.get('name'), result.get('vicinity')) for result in results[:5]]
#         else:
#             return [("None", "No results found")]
#     else:
#         return [("None", f"API request failed with status {response.status_code}")]

# # Define the function to verify coordinates and return the 5 closest places
# def verify_coordinates(df, api_key):
#     results = []
#     for index, row in df.iterrows():
#         lat = row['Latitude']
#         lng = row['Longitude']
#         arena_name = row['Arena']
#         school_name = row['School']
        
#         # Debugging: Print the current arena and coordinates being checked
#         print(f"\nChecking nearby places for arena: {arena_name} (School: {school_name})")
#         print(f"Latitude: {lat}, Longitude: {lng}")
        
#         # Get the 5 closest places
#         nearby_places = check_nearby_places(lat, lng, api_key)
        
#         # Add each place to the results, along with the original data
#         for place in nearby_places:
#             results.append({
#                 'Arena': arena_name,
#                 'School': school_name,
#                 'Latitude': lat,
#                 'Longitude': lng,
#                 'Google Places Name': place[0],
#                 'Vicinity': place[1]
#             })
            
#     return pd.DataFrame(results)

# # Load your API key
# api_key = ''

# # Assuming arena_df is your DataFrame
# verified_df = verify_coordinates(arena_df, api_key)

# # Output the results
# print(verified_df)


In [155]:
verified_df.head(10)

## OUTPUT TO TEMP FOLDER FOR MANUAL REVIEW
# output_file = os.path.join('..','TEMP', 'arena_school_info_place_checkV3.csv')
# verified_df.to_csv(output_file, index=False)



NameError: name 'verified_df' is not defined