# Players by Location
- create a map using folium that plots the hometown location of every US (and maybe Canadian) player in D1 (2024-25 season)

In [6]:
## Dependencies

## System Libraries
import sys
import os
# Data handling
import pandas as pd
import geopandas as gpd
# Plotting and visualization
import matplotlib.pyplot as plt
from PIL import Image
## Map visualization
import folium
from folium.plugins import MarkerCluster
from folium.plugins import HeatMap
from folium.features import CustomIcon

### ROSTERFILE
# Path to the roster file
roster_path = os.path.join('..', 'data', 'roster_2024_current_v3.csv')
roster_df = pd.read_csv(roster_path) # Load the roster file


############### NOT USED IN THIS SCRIPT ################
#### 2023 STATS FILE
# stats_path = os.path.join('..', 'data', 'player_stats_2023_v1.csv')
# stats_df = pd.read_csv(stats_path) # Load the stats file

########## ROSTER SET WITH 2023 STATS FILE
roster_stats_path = os.path.join('..', 'data', 'roster_2024_with_2023_stats.csv')
roster_stats_df = pd.read_csv(roster_stats_path) # Load the roster file with stats


### SCHOOL INFO TABLE FOR LOGO PATHS
school_info_path = os.path.join('..', 'data', 'arena_school_info.csv')
school_info_df = pd.read_csv(school_info_path) # Load school info

# Path to logo folder
logo_folder = os.path.join('..', 'images', 'logos')

### SHAPEFILES
# Path to .geojson file with State Boundries
geojson_path = os.path.join('..', 'data', 'vault', 'combined-us-canada.geojson')
# Load the states shapefile
gdf_states = gpd.read_file(geojson_path)

# Path to shapefile with all US counties
shapefile_path = os.path.join('..', 'data', 'vault', 'cb_2018_us_county_500k.shp')
gdf = gpd.read_file(shapefile_path)
# Set the initial CRS (assuming it's in EPSG:4326, but you may need to verify the original CRS)
gdf = gdf.set_crs(epsg=4326)

## CHECK SHAPEFILES FOR COMPATIBILITY
# Set the CRS for both dataframes if it's missing
if gdf.crs is None:
    gdf.set_crs(epsg=4326, inplace=True)  # Assuming coordinates are in WGS 84 (lat/lon)

if gdf_states.crs is None:
    gdf_states.set_crs(epsg=4326, inplace=True)  # Assuming coordinates are in WGS 84 (lat/lon)



# Check the first few rows of the DataFrames
# roster_df.head()
# gdf_states.head()
# gdf.head()
school_info_df.head()
roster_stats_df.head()



Unnamed: 0,Team_2024,Last_Name,First_Name,No,Position,Yr,Ht,Wt,DOB,Hometown,...,Country,Clean_Player,Team_2023,G,A,Pts,plus_minus,Sh,PIM,Games_Played
0,Long Island,Casperson,AJ,3,Defensemen,Jr,2-Jun,190,7/19/2001,"Flower Mound, Texas",...,USA,AJ Casperson,Long Island,0.0,1.0,1.0,1.0,7.0,2.0,12.0
1,Bentley,Hodges,AJ,20,Forwards,Gr,Jun-00,175,8/24/2001,"Littleton, Colo.",...,USA,A.J. Hodges,Bentley,6.0,9.0,15.0,-1.0,57.0,2.0,29.0
2,Bemidji State,Macaulay,AJ,12,Defensemen,Sr,9-May,185,4/12/2002,"Bonnyville, Alb.",...,Canada,A.J. Macaulay,Alaska,5.0,10.0,15.0,9.0,44.0,14.0,34.0
3,Quinnipiac,Bohlinger,Aaron,5,Defensemen,Gr,9-May,165,8/25/2000,"Walden, N.Y.",...,USA,Aaron Bohlinger,Massachusetts,3.0,5.0,8.0,1.0,22.0,4.0,34.0
4,Long Island,Grounds,Aaron,23,Forwards,Sr,2-Jun,190,12/24/1999,"Jamestown, N.D.",...,USA,Aaron Grounds,Long Island,1.0,2.0,3.0,-5.0,14.0,16.0,11.0


## Prep The Data

In [7]:
# Clean up and prepare the data for mapping

# Drop rows with missing geographic data
geo_columns = ['City', 'State_Province', 'Country']
roster_cleaned_df = roster_df.dropna(subset=geo_columns)

#########################
###### TEST #####
## Sub the roster with stats file for the cleaned roster
roster_cleaned_df = roster_stats_df.copy()

# Create a simplified dataframe with relevant location columns for easier mapping
# roster_cleaned_df = roster_cleaned_df[['Current Team', 'First_Name', 'Last_Name', 'City', 'State_Province', 'Country']]

# Group by City, State_Province, and Country to count the number of players from each location
location_counts = roster_cleaned_df.groupby(['City', 'State_Province', 'Country']).size().reset_index(name='Player_Count')


## CHECK DATA TRANSFORMATION
# # Display the cleaned and grouped dataframe
# location_counts.head()
# # Sort the location counts in descending order
location_counts_sorted = location_counts.sort_values(by='Player_Count', ascending=False)
location_counts_sorted.head(10) # Display


Unnamed: 0,City,State_Province,Country,Player_Count
116,Calgary,Alberta,Canada,49
850,Toronto,Ontario,Canada,27
943,Winnipeg,Manitoba,Canada,16
587,North Vancouver,British Columbia,Canada,14
811,Stockholm,Sweden,Sweden,13
233,Edmonton,Alberta,Canada,13
231,Edina,Minnesota,USA,12
524,Mississauga,Ontario,Canada,12
796,St. Louis,Missouri,USA,12
246,Espoo,Finland,Finland,12


### Geocode Conversion
- 9-26-24 NOTE - Some issuse spoted with Geocoder. Canton, MI is being assigned a lat and long somewhere in Lansing.
    - MSU's Russian player comes from far Eastern Russia and does not appear on map at all
- takes the names of places and converts to lat long coordinates
- uses a rate limiter to avoid overloading service
- takes about 15 min to run - output is saved in the data folder - load from there 

In [8]:
# ### GEOCODING USING GOOGLE MAPS API 
# # LCHECK FOR AND LOAD GEOCODED DATA BEFORE RUNNING - THIS COSTS MONEY

# import googlemaps
# import pandas as pd
# import config




# # Initialize the Google Places API client
# gmaps = googlemaps.Client(key=config.g_key)

# def geocode_google_places(row):
#     try:
#         location_str = f"{row['City']}, {row['State_Province']}, {row['Country']}"
#         print(f"Querying location: {location_str}")  # Debugging output
#         geocode_result = gmaps.geocode(location_str)
        
#         # Check the API response
#         print(f"Geocode result: {geocode_result}")  # Debugging output
        
#         # Check if we got a valid result
#         if geocode_result and 'geometry' in geocode_result[0]:
#             location = geocode_result[0]['geometry']['location']
#             return pd.Series([location['lat'], location['lng']])
#         else:
#             return pd.Series([None, None])  # Return None if no valid result
#     except Exception as e:
#         print(f"Error encountered: {e}")  # Debugging output
#         return pd.Series([None, None])  # Handle errors gracefully

# ## ORIG CODE
# # # Function to geocode a city and state combination using Google Places API
# # def geocode_google_places(row):
# #     try:
# #         location_str = f"{row['City']}, {row['State_Province']}, {row['Country']}"
# #         geocode_result = gmaps.geocode(location_str)
        
# #         # Check if we got a valid result
# #         if geocode_result and 'geometry' in geocode_result[0]:
# #             location = geocode_result[0]['geometry']['location']
# #             return pd.Series([location['lat'], location['lng']])
# #         else:
# #             return pd.Series([None, None])  # Return None if no valid result
# #     except Exception as e:
# #         return pd.Series([None, None])  # Handle errors gracefully

# # Apply the geocode function to the data using Google Places API
# location_counts[['Latitude', 'Longitude']] = location_counts.apply(geocode_google_places, axis=1)

# # Filter out rows with missing coordinates if needed
# location_counts_cleaned = location_counts.dropna(subset=['Latitude', 'Longitude'])

# # Display the cleaned data with coordinates
# location_counts_cleaned.head()

In [9]:
#### ORIGIINAL GEOCODING STRATEGY - RETURNED SOME ERRORS - SWITCHED TO GOOGLE PLACES API
# # LCHECK FOR AND LOAD GEOCODED DATA BEFORE RUNNING - THIS TAKES 15+ MINUTES

# from geopy.geocoders import Nominatim
# from geopy.extra.rate_limiter import RateLimiter

# # Initialize geocoder
# geolocator = Nominatim(user_agent="college_hockey_map")

# # Create a rate-limited geocode function to avoid overloading the service
# geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# # Function to geocode a city and state combination
# def geocode_location(row):
#     try:
#         location_str = f"{row['City']}, {row['State_Province']}, {row['Country']}"
#         location = geocode(location_str)
#         if location:
#             return pd.Series([location.latitude, location.longitude])
#         else:
#             return pd.Series([None, None])
#     except Exception as e:
#         return pd.Series([None, None])

# # Apply the geocode function to the data
# location_counts[['Latitude', 'Longitude']] = location_counts.apply(geocode_location, axis=1)

# # Filter out rows with missing coordinates
# location_counts_cleaned = location_counts.dropna(subset=['Latitude', 'Longitude'])

# # Display cleaned data with coordinates
# location_counts_cleaned.head()


In [10]:
# Save Geocoded Data to CSV to avoid having to run geocoding repeatedly

# # Save the cleaned and geocoded data to a CSV file
output_path = os.path.join('..', 'data', 'player_geocoded_location_counts_google_api_v1.csv')
# output_path = os.path.join('..', 'data', 'player_geocoded_location_counts_v3.0.csv')
location_counts_cleaned.to_csv(output_path, index=False)



#### Load the previously geocoded table from csv

In [11]:
## Path to the geocoded data
geocoded_data_path = os.path.join('..', 'data', 'player_geocoded_location_counts_google_api_v1.csv')
# geocoded_data_path = os.path.join('..', 'data', 'player_geocoded_location_counts_v3.0.csv')
location_counts_cleaned = pd.read_csv(geocoded_data_path)

##### HOTFIX 9-30-24
### Canton, MI is not being geocoded correctly.  I will manually update the coordinates for this location
# 42.309147747338855, -83.47945385169615
location_counts_cleaned.loc[location_counts_cleaned['City'] == 'Canton', 'Latitude'] = 42.309147747338855
location_counts_cleaned.loc[location_counts_cleaned['City'] == 'Canton', 'Longitude'] = -83.47945385169615
##### STOCKHOLM, SWEDEN IS GEOCODED TO STOCKHOLM MAINE 
# 59.32736015579712, 18.058473904470663
location_counts_cleaned.loc[location_counts_cleaned['City'] == 'Stockholm', 'Latitude'] = 59.32736015579712
location_counts_cleaned.loc[location_counts_cleaned['City'] == 'Stockholm', 'Longitude'] = 18.058473904470663
########## FAIRBUILT, MN
# 44.29627256152846, -93.27016563553472
location_counts_cleaned.loc[location_counts_cleaned['City'] == 'Faribault', 'Latitude'] = 44.29627256152846
location_counts_cleaned.loc[location_counts_cleaned['City'] == 'Faribault', 'Longitude'] = -93.27016563553472


# Check the first few rows of the geocoded data
# location_counts_cleaned.head()

### Add Player name, team, ect data into the location counts

#### Add 2023 stats to current roster
- Going to do this in a seperate notebook for ease

In [12]:
# roster_cleaned_df.head()

In [13]:
# Merge roster_df with location_counts_cleaned
merged_df = pd.merge(roster_cleaned_df, location_counts_cleaned, 
                     on=['City', 'State_Province', 'Country'], how='inner')

### NOT USING THIS TOOLTIP FORMAT - CREATING TOOLTIP IN THE MAP FUNCTION
# # Prepare the tooltip text with a header row for each location
# merged_df['Tooltip'] = merged_df.apply(
#     lambda row: f"Name - Position - Year - Team<br>{row['First_Name']} {row['Last_Name']} - {row['Position']} - {row['Yr']} - {row['Current Team']}", axis=1
# )

# # Display the merged dataframe to check if the tooltips are correctly generated
# merged_df[['First_Name', 'Last_Name', 'Tooltip', 'Latitude', 'Longitude']].head()
#
### Change the Position name and Yr to be more readable in the tooltip
merged_df['Position'] = merged_df['Position'].replace({'Forwards': 'Forward', 'Defensemen':'Defense', 'Goaltenders': 'Goalie'})
merged_df['Yr'] = merged_df['Yr'].replace({'Fr': 'Freshman', 'So': 'Sophomore', 'Jr': 'Junior', 'Sr': 'Senior', 'Gr': 'Graduate'})
# merged_df.head(35)
########################################

## Start Mapping

#### Map with team logo as markers
- thurs 9-26-24 start

In [14]:
# roster_df.head()
# merged_df.head(25)


In [15]:
import math

# Function to apply a circular offset to markers with the same location
def add_circular_offset(lat, lon, count, index, radius=0.007):
    """
    Distributes markers in a circular pattern around a central point.
    The radius increases slightly with the number of markers to prevent overlap.
    """
    # Calculate angle in radians (360 degrees divided by number of markers)
    angle = (360 / count) * index
    radians = math.radians(angle)

    # Dynamic adjustment of the radius: the more markers, the larger the radius
    dynamic_radius = radius * (1 + (count / 5))  # Scale the radius based on the number of markers

    # Offset latitude and longitude using circular placement
    lat_offset = lat + (dynamic_radius * math.cos(radians))  # Offset based on cosine
    lon_offset = lon + (dynamic_radius * math.sin(radians))  # Offset based on sine

    return lat_offset, lon_offset


###### REFACTOR W 01 PREVIEW

In [16]:
# Assign unique index per player in each city group
merged_df['city_group_index'] = merged_df.groupby(['City', 'State_Province', 'Country']).cumcount()

# Assign 'Player_Count' per city directly to 'merged_df' using 'transform'
merged_df['Player_Count'] = merged_df.groupby(['City', 'State_Province', 'Country'])['First_Name'].transform('count')

# Set Logo Size (tuple of width and height in pixels)
logo_size = (55, 55)  # Adjust as needed

# Convert all number columns to int
int_columns = ['No', 'Height_Inches', 'Wt', 'Draft_Year', 'D_Round', 
               'G', 'A', 'Pts', 'plus_minus', 'Sh', 'PIM', 'Games_Played']

for col in int_columns:
    merged_df[col] = merged_df[col].astype('Int64')

import math

def create_map_with_team_logos(merged_df, school_info_df, logo_folder, gdf_states, map_center=[45.0, -93.0], zoom_start=4):
    # Initialize the map
    folium_map = folium.Map(location=map_center, zoom_start=zoom_start, tiles='OpenStreetMap', name='Default Map')

    # ---- ADD BASE LAYERS ----
    # Add additional base layers (you can add more as needed)
    # folium.TileLayer('OpenStreetMap', name='Default Map').add_to(folium_map)
    # folium.TileLayer('Stamen Terrain', name='Terrain', attr=".").add_to(folium_map)
    # folium.TileLayer('Stamen Toner', name='Toner', attr=".").add_to(folium_map)
    folium.TileLayer('CartoDB dark_matter', name='Dark Theme', attr=".").add_to(folium_map)
    folium.TileLayer('CartoDB positron', name='Light Theme', attr=".").add_to(folium_map)
    

    # ---- ADD CHOROPLETH LAYER ----
    # Create 'state_counts_df' from 'merged_df'
    state_counts = merged_df['State_Province'].value_counts()
    state_counts_df = pd.DataFrame(state_counts).reset_index()
    state_counts_df.columns = ['State_Province', 'Player_Count']

    # Create Custom Bins for Choropleth to better control look
    # Define custom bins to handle the wide distribution
    custom_bins = [0, 1, 5, 10, 20, 50, 100, 200, 250] # Adjust as needed

    # Convert the GeoDataFrame to GeoJSON using __geo_interface__
    geojson_data = gdf_states.__geo_interface__

    ### ORIGINAL CODE ###
    # Add the Choropleth directly to the map with a name for LayerControl
    folium.Choropleth(
        geo_data=geojson_data,
        data=state_counts_df,
        columns=['State_Province', 'Player_Count'],
        key_on='feature.properties.name',  # Adjust this if necessary
        fill_color='YlGn',
        fill_opacity=0.5,
        line_opacity=0.2,
        legend_name='Number of Players by State/Province',
        bins=custom_bins,  # Apply custom bins
        reset=True,  # Ensure the choropleth is reset based on new bins
        name='Shade by Player Count'
    ).add_to(folium_map)

    ############## NEW CODE ##############
    # ---- ADD STATE LABELS LAYER ----
    # Merge 'state_counts_df' with 'gdf_states' to get centroids
    gdf_states_subset = gdf_states[['name', 'geometry']]  # Adjust 'name' if your GeoDataFrame has a different column name
    state_counts_gdf = gdf_states_subset.merge(state_counts_df, left_on='name', right_on='State_Province')

    # Calculate centroids
    state_counts_gdf['centroid'] = state_counts_gdf.geometry.centroid

    # Create a FeatureGroup for the labels
    labels_layer = folium.FeatureGroup(name='Players Count by State')

    # Add labels to the labels_layer
    for idx, row in state_counts_gdf.iterrows():
        # Get centroid coordinates
        lat = row['centroid'].y
        lon = row['centroid'].x
        # Get player count
        player_count = row['Player_Count']
        # Create a text label
        label = folium.Marker(
            location=[lat, lon],
            icon=folium.DivIcon(
                html=f'''
                    <div style="
                        font-family: Optima, sans-serif;
                        font-weight: bold;
                        font-size: 16px;
                        color: black;
                        text-align: center;
                        
                        padding: 2px;
                        
                    ">
                        {player_count}
                    </div>
                '''
            )
        )
        labels_layer.add_child(label)

    # Add the labels_layer to the map
    labels_layer.add_to(folium_map)


########### NOT BIG FAN OF COLOR SCHEME
    # # ---- ADD HEATMAP LAYER ----
    # # Create heat_data from merged_df
    # heat_data = [[row['Latitude'], row['Longitude']] for idx, row in merged_df.iterrows()]

    # # Create a FeatureGroup for the heatmap layer
    # heatmap_layer = folium.FeatureGroup(name='Heatmap')

    # # Define a custom gradient for better color transitions
    # custom_gradient = {
    #     0.2: '#ADD8E6',  # Light Blue for low intensity
    #     0.4: '#00FF00',  # Green for mid-low intensity
    #     0.6: '#FFFF00',  # Yellow for mid-high intensity
    #     0.8: '#FFA500',  # Orange for high intensity
    #     1.0: '#FF0000'   # Red for maximum intensity
    # }

    # # Add the HeatMap to the FeatureGroup with adjusted parameters
    # HeatMap(
    #     heat_data, 
    #     radius=15,                # Increase radius for smoother heat blobs
    #     blur=15,                  # Slightly increase blur to smooth transitions
    #     max_intensity=100,         # Adjust max intensity for better scaling
    #     gradient=custom_gradient, # Use the custom gradient
    #     min_opacity=0.4           # Slight opacity for low intensity
    # ).add_to(heatmap_layer)

    # # Add the heatmap layer to the map
    # heatmap_layer.add_to(folium_map)
    
    
    # ---- ADD HEATMAP LAYER ----
    # Create heat_data from merged_df
    heat_data = [[row['Latitude'], row['Longitude']] for idx, row in merged_df.iterrows()]

    # Create a FeatureGroup for the heatmap layer
    heatmap_layer = folium.FeatureGroup(name='Heatmap')

    # Add the HeatMap to the FeatureGroup
    HeatMap(heat_data, radius=25, blur=15, max_intensity=20).add_to(heatmap_layer)

    # Add the heatmap layer to the map
    heatmap_layer.add_to(folium_map)

    # ---- MARKER CLUSTER LAYER ----
    cluster_group = folium.FeatureGroup(name='Individual Players', control=True, show=False)
    marker_cluster = MarkerCluster(
        spiderfy_on_max_zoom=True,
        show_coverage_on_hover=False,
        max_cluster_radius=20,
        disableClusteringAtZoom=14,
        animateAddingMarkers=True,
        zoomToBoundsOnClick=True
    ).add_to(cluster_group)

    # Compute the mean latitude and longitude for centering the map
    Latitude = merged_df['Latitude'].mean()
    Longitude = merged_df['Longitude'].mean()

    # Create the map centered on the computed mean Latitude and Longitude
    map_instance = folium.Map(location=[Latitude, Longitude], zoom_start=12)

    # Add the cluster group to the map but initially hidden
    map_instance.add_child(cluster_group)

    # Define a custom script to toggle the visibility of the cluster group on zoom
    map_instance.get_root().html.add_child(folium.Element(f'''
        <script>
            var clusterLayer = {cluster_group.get_name()};
            var map = {map_instance.get_name()};
            map.on('zoomend', function() {{
                if (map.getZoom() >= 14) {{
                    if (!map.hasLayer(clusterLayer)) {{
                        map.addLayer(clusterLayer);
                    }}
                }} else {{
                    if (map.hasLayer(clusterLayer)) {{
                        map.removeLayer(clusterLayer);
                    }}
                }}
            }});
        </script>
    '''))


    # Loop through the merged_df to place markers
    for idx, row in merged_df.iterrows():
        # Retrieve team and logo information
        team_name = row['Team_2024']
        logo_info = school_info_df[school_info_df['Team'] == team_name]['logo_abv'].values

        if len(logo_info) > 0:
            logo_abv = logo_info[0]
            logo_path = os.path.join(logo_folder, f"{logo_abv}.png")

            if os.path.exists(logo_path):
                logo_icon = CustomIcon(logo_path, icon_size=logo_size)

                player_count = row['Player_Count']
                current_index = row['city_group_index']

                # Apply circular offset for overlapping markers
                if player_count > 1:
                    lat_offset, lon_offset = add_circular_offset(
                        row['Latitude'], row['Longitude'], player_count, current_index
                    )
                else:
                    lat_offset, lon_offset = row['Latitude'], row['Longitude']  # No offset if only one player

                # Enhance the tooltip with player information, including hometown
                tooltip_html = f"""
                <div style="font-size: 14px; font-family: Arial;">
                    <strong>{row['First_Name']} {row['Last_Name']} - {row['Team_2024']}</strong><br>
                    {row['Hometown']}<br>
                    {row['Yr']} {row['Position']}<br>
                    {f"<div style='font-size: 12px; color: gray; margin-top: 5px;'>2023 SEASON:<br> {row['Games_Played']} GP, {row['G']} G, {row['A']} A, {row['Pts']} PTS, {row['PIM']} PIM</div>" if pd.notna(row['Games_Played']) else ""}
                </div>
                """

###################### OLD / ORIG CODE ###########
                # <div style="font-size: 14px; font-family: Arial;">
                #     <strong>{row['First_Name']} {row['Last_Name']} - {row['Team_2024']}</strong><br>
                #     {row['Hometown']}<br>
                #     {row['Yr']} {row['Position']}<br>
                #     2023 STATS: {row['Games_Played']} GP, {row['G']} G, {row['A']} A, {row['Pts']} PTS
                    
                # </div>
                # """

                # Add player marker with the custom logo icon and enhanced tooltip
                folium.Marker(
                    location=[lat_offset, lon_offset],
                    tooltip=folium.Tooltip(tooltip_html),
                    icon=logo_icon
                ).add_to(marker_cluster)

    # Add the marker cluster layer to the map
    cluster_group.add_to(folium_map)

    # ---- ADD LAYER CONTROL ----
            
    folium.LayerControl().add_to(folium_map)

    # Inject custom CSS for styling the LayerControl
    custom_css = """
    <style>
    /* Style for the Layer Control List */
    .leaflet-control-layers-list {
        font-size: 18px;  /* Increase font size */
        line-height: 1.5; /* Ensure adequate spacing between lines */
    }

    /* Style for the checkboxes and radio buttons */
    .leaflet-control-layers input[type="radio"], 
    .leaflet-control-layers input[type="checkbox"] {
        transform: scale(1.5);  /* Scale the size of the checkbox/radio button */
        margin-right: 8px;      /* Add space between the button and label */
    }

    /* Optional: Style the background of the layer control to make it stand out */
    .leaflet-control-layers {
        background-color: white;  /* Ensure the control has a visible background */
        border-radius: 5px;       /* Slight rounding of the control edges */
        padding: 10x;
        box-shadow: 0px 0px 5px rgba(0,0,0,0.3);  /* Add a shadow for better visibility */
    }
    </style>
    """

    # Add the custom CSS to the map's HTML
    folium_map.get_root().html.add_child(folium.Element(custom_css))

    # Return the map after processing all markers
    return folium_map

# Assuming 'gdf_states' is already defined in your code
enhanced_player_map = create_map_with_team_logos(merged_df, school_info_df, logo_folder, gdf_states)

# Save the map to an HTML file for visualization
enhanced_map_file_path = os.path.join('..', 'TEMP', 'player_origin_map_with_stats_v3.html')
enhanced_player_map.save(enhanced_map_file_path)



  state_counts_gdf['centroid'] = state_counts_gdf.geometry.centroid


#### Very Simple V0.1 Map

In [17]:

# # Function to create the player origin map with tooltips
# def create_player_origin_map_with_tooltip(data, map_center=[45.0, -93.0], zoom_start=4):
#     # Map settings block for customization
#     folium_map = folium.Map(location=map_center, zoom_start=zoom_start, tiles='cartodb positron')
    
#     # Create MarkerCluster
#     marker_cluster = MarkerCluster(disableClusteringAtZoom=10).add_to(folium_map)

#     # Add player markers with tooltips to the MarkerCluster
#     for _, row in data.iterrows():
#         folium.Marker(
#             location=[row['Latitude'], row['Longitude']],
#             tooltip=row['Tooltip']
#         ).add_to(marker_cluster)
    
#     return folium_map

# # Create the player origins map with tooltips
# player_map = create_player_origin_map_with_tooltip(merged_df)

# # # Save the map to an HTML file for visualization
# # map_file_path = os.path.join('..', 'TEMP', 'player_origin_map_v1.html')
# # player_map.save(map_file_path)

# # map_file_path


### Map with Custom Java for Cluster behavior
- not behaving well - probably not worth the time to get smooth

In [18]:
# # Function to create the player origin map with custom cluster zoom levels
# def create_player_origin_map_with_tooltip(data, map_center=[45.0, -93.0], zoom_start=4):
#     # Map settings block for customization
#     folium_map = folium.Map(location=map_center, zoom_start=zoom_start, tiles='cartodb positron')
    
#     # Create MarkerCluster without adding it to the map right away
#     marker_cluster = MarkerCluster().add_to(folium.FeatureGroup(name="Clusters"))

#     # Add player markers with tooltips to the MarkerCluster
#     for _, row in data.iterrows():
#         folium.Marker(
#             location=[row['Latitude'], row['Longitude']],
#             tooltip=row['Tooltip']
#         ).add_to(marker_cluster)

#     # Custom JavaScript to toggle marker clusters based on zoom level
#     toggle_cluster_js = """
#     function toggleClusters(map) {
#         var clusterGroup = map._layers[Object.keys(map._layers).find(key => map._layers[key].options && map._layers[key].options.spiderfyOnMaxZoom !== undefined)];
        
#         map.on('zoomend', function () {
#             var currentZoom = map.getZoom();
            
#             // Define the zoom range where clusters should be shown
#             var minZoom = 5;  // Set the zoom level when clusters appear
#             var maxZoom = 7;  // Set the zoom level when clusters disappear again
            
#             if (currentZoom < minZoom || currentZoom > maxZoom) {
#                 map.removeLayer(clusterGroup);
#             } else {
#                 map.addLayer(clusterGroup);
#             }
#         });

#         // Hide the clusters initially
#         map.removeLayer(clusterGroup);
#     }
#     """

#     # Add the JavaScript to the map
#     folium_map.get_root().html.add_child(folium.Element(f'<script>{toggle_cluster_js}</script>'))

#     # Call the function that runs the zoom toggle functionality
#     folium_map.add_child(folium.Element(f'<script>toggleClusters({{map_name}});</script>'.format(map_name=folium_map.get_name())))

#     # Add the marker cluster group to the map
#     marker_cluster.add_to(folium_map)

#     return folium_map


### Aggrigate data on state by state basis 
- will use to color specific states - regions

In [19]:

state_counts = location_counts_cleaned['State_Province'].value_counts()
# Create df of the state counts
state_counts_df = pd.DataFrame(state_counts).reset_index()
state_counts_df.columns = ['State_Province', 'Player_Count']
state_counts_df.head()

# location_counts_cleaned.head()



Unnamed: 0,State_Province,Player_Count
0,Minnesota,92
1,Ontario,85
2,Massachusetts,75
3,New York,69
4,Michigan,66


In [20]:
## Check player dataframe and school info dataframe
merged_df.head()
# school_info_df.head()

# ## Save both to Temp folder for checking
# merged_df.to_csv('../TEMP/merged_df.csv', index=False)
# school_info_df.to_csv('../TEMP/school_info_df.csv', index=False)

Unnamed: 0,Team_2024,Last_Name,First_Name,No,Position,Yr,Ht,Wt,DOB,Hometown,...,A,Pts,plus_minus,Sh,PIM,Games_Played,Player_Count,Latitude,Longitude,city_group_index
0,Long Island,Casperson,AJ,3,Defense,Junior,2-Jun,190,7/19/2001,"Flower Mound, Texas",...,1,1,1,7,2,12,2,33.014567,-97.096955,0
1,Bentley,Hodges,AJ,20,Forward,Graduate,Jun-00,175,8/24/2001,"Littleton, Colo.",...,9,15,-1,57,2,29,2,39.613321,-105.01665,0
2,Bemidji State,Macaulay,AJ,12,Defense,Senior,9-May,185,4/12/2002,"Bonnyville, Alb.",...,10,15,9,44,14,34,2,54.267966,-110.739783,0
3,Quinnipiac,Bohlinger,Aaron,5,Defense,Graduate,9-May,165,8/25/2000,"Walden, N.Y.",...,5,8,1,22,4,34,1,41.561205,-74.188481,0
4,Long Island,Grounds,Aaron,23,Forward,Senior,2-Jun,190,12/24/1999,"Jamestown, N.D.",...,2,3,-5,14,16,11,1,46.910544,-98.708436,0


In [22]:
merged_df.head()

Unnamed: 0,Team_2024,Last_Name,First_Name,No,Position,Yr,Ht,Wt,DOB,Hometown,...,A,Pts,plus_minus,Sh,PIM,Games_Played,Player_Count,Latitude,Longitude,city_group_index
0,Long Island,Casperson,AJ,3,Defense,Junior,2-Jun,190,7/19/2001,"Flower Mound, Texas",...,1,1,1,7,2,12,2,33.014567,-97.096955,0
1,Bentley,Hodges,AJ,20,Forward,Graduate,Jun-00,175,8/24/2001,"Littleton, Colo.",...,9,15,-1,57,2,29,2,39.613321,-105.01665,0
2,Bemidji State,Macaulay,AJ,12,Defense,Senior,9-May,185,4/12/2002,"Bonnyville, Alb.",...,10,15,9,44,14,34,2,54.267966,-110.739783,0
3,Quinnipiac,Bohlinger,Aaron,5,Defense,Graduate,9-May,165,8/25/2000,"Walden, N.Y.",...,5,8,1,22,4,34,1,41.561205,-74.188481,0
4,Long Island,Grounds,Aaron,23,Forward,Senior,2-Jun,190,12/24/1999,"Jamestown, N.D.",...,2,3,-5,14,16,11,1,46.910544,-98.708436,0


In [25]:
### Calculate the average distance a player is from their home town for each team

# Find the distance from Hometown to home arena for each player
# Create a new column 'Distance' in merged_df

from geopy.distance import geodesic

# add home rink coordinates to merged_df
# reaname columns to match - Current Team to Team
merged_df = merged_df.rename(columns={'Team_2024': 'Team'})
## rename coords in school_info_df to Rink_Lat and Rink_Long
school_info_df = school_info_df.rename(columns={'Latitude': 'Rink_Lat', 'Longitude': 'Rink_Long'})

# Merge Rink coordinates into merged_df
merged_df = pd.merge(merged_df, school_info_df[['Team', 'Rink_Lat', 'Rink_Long']], on='Team', how='left')

merged_df.head()
# merged_df.info()




Unnamed: 0,Team,Last_Name,First_Name,No,Position,Yr,Ht,Wt,DOB,Hometown,...,plus_minus,Sh,PIM,Games_Played,Player_Count,Latitude,Longitude,city_group_index,Rink_Lat,Rink_Long
0,Long Island,Casperson,AJ,3,Defense,Junior,2-Jun,190,7/19/2001,"Flower Mound, Texas",...,1,7,2,12,2,33.014567,-97.096955,0,40.726747,-73.576235
1,Bentley,Hodges,AJ,20,Forward,Graduate,Jun-00,175,8/24/2001,"Littleton, Colo.",...,-1,57,2,29,2,39.613321,-105.01665,0,42.384852,-71.220488
2,Bemidji State,Macaulay,AJ,12,Defense,Senior,9-May,185,4/12/2002,"Bonnyville, Alb.",...,9,44,14,34,2,54.267966,-110.739783,0,47.463603,-94.853858
3,Quinnipiac,Bohlinger,Aaron,5,Defense,Graduate,9-May,165,8/25/2000,"Walden, N.Y.",...,1,22,4,34,1,41.561205,-74.188481,0,41.413864,-72.911176
4,Long Island,Grounds,Aaron,23,Forward,Senior,2-Jun,190,12/24/1999,"Jamestown, N.D.",...,-5,14,16,11,1,46.910544,-98.708436,0,40.726747,-73.576235


In [28]:
merged_df.head()
# merged_df.info()

# Calculate the distance between the player's hometown and their home rink
merged_df['Distance'] = merged_df.apply(
    lambda row: geodesic((row['Latitude'], row['Longitude']), (row['Rink_Lat'], row['Rink_Long'])).miles, axis=1
)

# Calculate the average distance for each team
team_avg_distance = merged_df.groupby('Team')['Distance'].mean().reset_index()

# Calculate the median distance for each team
team_median_distance = merged_df.groupby('Team')['Distance'].median().reset_index()

# Combine the average and median distances into a single DataFrame
team_avg_distance = team_avg_distance.merge(team_median_distance, on='Team', suffixes=('_Avg', '_Median'))

# Sort by average distance in ascending order
team_avg_distance = team_avg_distance.sort_values(by='Distance_Avg')

# Round to two decimal places for better readability
team_avg_distance = team_avg_distance.round(2)
# Reindex the DataFrame for better display
team_avg_distance = team_avg_distance.reset_index(drop=True)

# Display the average and median distances for each team
team_avg_distance.head(20)
# team_avg_distance.tail(20)

# Save the average and median distances to a CSV file
output_path = os.path.join('..', 'TEMP', 'team_avg_distance_v1.csv')