# Players by Location
- create a map using folium that plots the hometown location of every US (and maybe Canadian) player in D1 (2024-25 season)

In [19]:
## Dependencies

## System Libraries
import sys
import os
# Data handling
import pandas as pd
import geopandas as gpd
# Plotting and visualization
import matplotlib.pyplot as plt
from PIL import Image
## Map visualization
import folium
from folium.plugins import MarkerCluster
from folium.plugins import HeatMap

### ROSTERFILE
# Path to the roster file
roster_path = os.path.join('..', 'data', 'roster_2024_current_v3.csv')
roster_df = pd.read_csv(roster_path) # Load the roster file

### SHAPEFILES
# Path to the shapefile with state boundaries
# (the variable keeps its "geojson" name, but the file is an ESRI .shp)
geojson_path = os.path.join('..', 'data', 'vault', 'States_shapefile.shp')
# Load the states shapefile
gdf_states = gpd.read_file(geojson_path)

# Path to shapefile with all US counties
shapefile_path = os.path.join('..', 'data', 'vault', 'cb_2018_us_county_500k.shp')
gdf = gpd.read_file(shapefile_path)

## CHECK SHAPEFILES FOR COMPATIBILITY
# Only label a frame with EPSG:4326 when the file carried no CRS at all.
# set_crs() does not reproject coordinates; it just declares what they are,
# and it raises if the frame already has a different CRS. Calling it
# unconditionally would therefore either be a no-op or abort the cell.
# NOTE(review): EPSG:4326 (WGS 84 lat/lon) is an assumption about the source
# files - verify against the shapefile's .prj if one exists.
if gdf.crs is None:
    gdf.set_crs(epsg=4326, inplace=True)

if gdf_states.crs is None:
    gdf_states.set_crs(epsg=4326, inplace=True)


# Check the first few rows of the DataFrames
# roster_df.head()
# gdf_states.head()
# gdf.head()



## Prep The Data

In [20]:
# Build the per-hometown player tally used by the maps below

# The three columns that together identify a hometown
geo_columns = ['City', 'State_Province', 'Country']

# Keep only players whose hometown is fully specified, and only the
# identity + location columns needed for mapping
kept_columns = ['Current Team', 'First_Name', 'Last_Name', 'City', 'State_Province', 'Country']
roster_cleaned_df = roster_df.dropna(subset=geo_columns)[kept_columns]

# One row per (City, State_Province, Country) with the number of roster rows there
players_per_location = roster_cleaned_df.groupby(['City', 'State_Province', 'Country']).size()
location_counts = players_per_location.reset_index(name='Player_Count')


## CHECK DATA TRANSFORMATION
# Largest hometowns first; the top ten are shown as the cell output
location_counts_sorted = location_counts.sort_values(by='Player_Count', ascending=False)
location_counts_sorted.head(10)


Unnamed: 0,City,State_Province,Country,Player_Count
116,Calgary,Alberta,Canada,48
850,Toronto,Ontario,Canada,27
943,Winnipeg,Manitoba,Canada,16
587,North Vancouver,British Columbia,Canada,14
233,Edmonton,Alberta,Canada,13
811,Stockholm,Sweden,Sweden,13
246,Espoo,Finland,Finland,12
524,Mississauga,Ontario,Canada,12
231,Edina,Minnesota,USA,12
602,Oakville,Ontario,Canada,11


### Geocode Conversion
- takes the names of places and converts to lat long coordinates
- uses a rate limiter to avoid overloading service
- takes about 15 min to run - output is saved in the data folder - load from there 

In [12]:
## CHECK FOR AND LOAD GEOCODED DATA BEFORE RUNNING - THIS TAKES 15+ MINUTES

# from geopy.geocoders import Nominatim
# from geopy.extra.rate_limiter import RateLimiter

# # Initialize geocoder
# geolocator = Nominatim(user_agent="college_hockey_map")

# # Create a rate-limited geocode function to avoid overloading the service
# geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# # Function to geocode a city and state combination
# def geocode_location(row):
#     try:
#         location_str = f"{row['City']}, {row['State_Province']}, {row['Country']}"
#         location = geocode(location_str)
#         if location:
#             return pd.Series([location.latitude, location.longitude])
#         else:
#             return pd.Series([None, None])
#     except Exception as e:
#         return pd.Series([None, None])

# # Apply the geocode function to the data
# location_counts[['Latitude', 'Longitude']] = location_counts.apply(geocode_location, axis=1)

# # Filter out rows with missing coordinates
# location_counts_cleaned = location_counts.dropna(subset=['Latitude', 'Longitude'])

# # Display cleaned data with coordinates
# location_counts_cleaned.head()


In [13]:
## Save Geocoded Data to CSV to avoid having to run geocoding repeatedly

# # Save the cleaned and geocoded data to a CSV file
# output_path = os.path.join('..', 'data', 'player_geocoded_location_counts.csv')
# location_counts_cleaned.to_csv(output_path, index=False)



#### Load the previously geocoded table from csv

In [14]:
## Read the cached geocoding results rather than re-running the slow geocoder
geocoded_data_path = os.path.join('..', 'data', 'player_geocoded_location_counts.csv')
location_counts_cleaned = pd.read_csv(filepath_or_buffer=geocoded_data_path)

# Preview the first five rows (shown as the cell output)
location_counts_cleaned.head(n=5)

Unnamed: 0,City,State_Province,Country,Player_Count,Latitude,Longitude
0,Abbotsford,British Columbia,Canada,4,49.052116,-122.329479
1,Abington,Massachusetts,USA,1,42.104823,-70.945322
2,Airdrie,Alberta,Canada,3,51.28597,-114.01062
3,Albert Lea,Minnesota,USA,1,43.648013,-93.368266
4,Alexandria,Minnesota,USA,1,45.88351,-95.374501


### Add player name, team, etc. data into the location counts

#### Add 2023 stats to current roster

In [22]:
# Load the 2023 player statistics table
path_to_stats = os.path.join('..', 'data', 'player_stats_2023_v1.csv')
stats_df = pd.read_csv(filepath_or_buffer=path_to_stats)

# Quick looks at both tables; in a notebook only the last expression is rendered
stats_df.head(n=5)
roster_cleaned_df.head(n=5)

Unnamed: 0,Current Team,First_Name,Last_Name,City,State_Province,Country
0,Lake Superior,Adam,Barone,Sault Ste. Marie,Ontario,Canada
1,Lake Superior,Jack,Blanchett,Monroe,Michigan,USA
2,Lake Superior,Mike,Brown,Belmont,Massachusetts,USA
3,Lake Superior,Evan,Bushy,Mankato,Minnesota,USA
4,Lake Superior,Jacob,Conrad,Green Bay,Wisconsin,USA


In [15]:
# Attach coordinates (and the per-location tally) to every roster row.
# The full roster_df is used because the tooltip needs 'Position' and 'Yr'.
location_keys = ['City', 'State_Province', 'Country']
merged_df = roster_df.merge(location_counts_cleaned, how='inner', on=location_keys)


def _player_tooltip(player):
    """Return the hover text for one merged roster row: name - position - year - team."""
    return f"{player['First_Name']} {player['Last_Name']} - {player['Position']} - {player['Yr']} - {player['Current Team']}"


# One tooltip string per player
merged_df['Tooltip'] = merged_df.apply(_player_tooltip, axis=1)

# Spot-check the generated tooltips next to their coordinates
merged_df[['First_Name', 'Last_Name', 'Tooltip', 'Latitude', 'Longitude']].head()

Unnamed: 0,First_Name,Last_Name,Tooltip,Latitude,Longitude
0,Adam,Barone,Adam Barone - Defensemen - Fr - Lake Superior,46.52391,-84.320068
1,Jack,Blanchett,Jack Blanchett - Defensemen - So - Lake Superior,41.915336,-83.513566
2,Mike,Brown,Mike Brown - Defensemen - Jr - Lake Superior,42.39593,-71.178665
3,Evan,Bushy,Evan Bushy - Defensemen - So - Lake Superior,44.163466,-93.999351
4,Jacob,Conrad,Jacob Conrad - Defensemen - Fr - Lake Superior,44.512638,-88.012579


## Start Mapping

#### Very Simple V0.1 Map

In [16]:

def create_player_origin_map_with_tooltip(data, map_center=None, zoom_start=4):
    """Build a folium map with one clustered marker per row of ``data``.

    Args:
        data: DataFrame with 'Latitude', 'Longitude' and 'Tooltip' columns.
            Every row becomes one marker whose hover text is the 'Tooltip'.
        map_center: [lat, lon] for the initial view. When omitted (or None)
            the map is centred on [45.0, -93.0].
        zoom_start: Initial zoom level handed to ``folium.Map``.

    Returns:
        The ``folium.Map`` carrying a single ``MarkerCluster`` layer.
    """
    # A None sentinel avoids sharing one mutable default list across calls
    if map_center is None:
        map_center = [45.0, -93.0]

    # Map settings block for customization
    folium_map = folium.Map(location=map_center, zoom_start=zoom_start, tiles='cartodb positron')

    # All markers go into one cluster layer
    marker_cluster = MarkerCluster().add_to(folium_map)

    # Walk only the three needed columns instead of materialising a full
    # Series per row with iterrows()
    for lat, lon, tip in zip(data['Latitude'], data['Longitude'], data['Tooltip']):
        folium.Marker(location=[lat, lon], tooltip=tip).add_to(marker_cluster)

    return folium_map

# Create the player origins map with tooltips
player_map = create_player_origin_map_with_tooltip(merged_df)

# Save the map to an HTML file for visualization.
# The output folder is created first so that a missing ../TEMP directory
# does not abort the save.
map_file_path = os.path.join('..', 'TEMP', 'player_origin_map_v1.html')
os.makedirs(os.path.dirname(map_file_path), exist_ok=True)
player_map.save(map_file_path)

# Show where the file was written (cell output)
map_file_path


'..\\TEMP\\player_origin_map_v1.html'

#### Aggregate data on a state-by-state basis
- will use to color specific states - regions

In [17]:
# Aggregate the player counts by State_Province and Country for the choropleth layer.
# Each row of location_counts_cleaned is one city with its own Player_Count, so
# the per-state total is the SUM of Player_Count. Using .size() here would only
# count how many geocoded cities each state has, not how many players.
state_counts = (
    location_counts_cleaned
    .groupby(['State_Province', 'Country'], as_index=False)['Player_Count']
    .sum()
)

state_counts.head()

# States/provinces ranked by total players, largest first
state_counts_sorted = state_counts.sort_values(by='Player_Count', ascending=False)

# Sanity check (cell output): every value should be 1, i.e. no state/province
# name appears under more than one country
state_counts['State_Province'].value_counts()


State_Province
Alaska                       1
Russia                       1
North Carolina               1
North Dakota                 1
Northwest Territories        1
Nova Scotia                  1
Ohio                         1
Ontario                      1
Oregon                       1
Pennsylvania                 1
Poland                       1
Prince Edward Island         1
Quebec                       1
Rhode Island                 1
Saskatchewan                 1
New York                     1
Slovakia                     1
South Carolina               1
South Dakota                 1
Sweden                       1
Tennessee                    1
Texas                        1
Utah                         1
Vermont                      1
Virginia                     1
Washington                   1
Wisconsin                    1
Wyoming                      1
Newfoundland and Labrador    1
New Jersey                   1
Alberta                      1
Idaho                   

# Stopped Here Thursday Night
### Code below is the example of next steps from GPI
#### To-DO
- get geodata (state-province boundaries, possibly also another layer - county, etc.)
- GEOJSON format

In [18]:
############ EXAMPLE CODE

import json
import geopandas as gpd
from folium.plugins import HeatMap

# Load a world GeoJSON file (assuming we're focusing on Canada/USA for now) to apply fill colors to states/provinces
# Unfortunately, I cannot access a remote GeoJSON file, but you can use one locally, for example:
# geojson_path = 'path_to_geojson_file'

# For demonstration, we'll simulate this step

# Now, let's create a map with choropleth and heatmap

def create_choropleth_heatmap_map(data, map_center=None, zoom_start=4):
    """Build a folium map with a heatmap layer plus clustered tooltip markers.

    The choropleth layer named in the function title is not active yet: its
    code is kept below as a commented-out template until a state/province
    boundary file is wired in.

    Args:
        data: DataFrame with 'Latitude', 'Longitude' and 'Tooltip' columns.
            Every row contributes one heatmap point and one marker.
        map_center: [lat, lon] for the initial view. When omitted (or None)
            the map is centred on [45.0, -93.0].
        zoom_start: Initial zoom level handed to ``folium.Map``.

    Returns:
        The ``folium.Map`` with the heatmap and marker-cluster layers added.
    """
    # A None sentinel avoids sharing one mutable default list across calls
    if map_center is None:
        map_center = [45.0, -93.0]

    # Initialize the map
    folium_map = folium.Map(location=map_center, zoom_start=zoom_start, tiles='cartodb positron')

    # Heatmap layer: one [lat, lon] pair per row, taken straight from the
    # two coordinate columns rather than via a per-row iterrows() pass
    heat_data = data[['Latitude', 'Longitude']].values.tolist()
    # NOTE(review): `max_intensity` does not appear to be a documented HeatMap
    # parameter; it is presumably forwarded as an extra option and may have no
    # effect - confirm against the installed folium version.
    HeatMap(heat_data, radius=12, blur=15, max_intensity=10).add_to(folium_map)

    # Assuming we have a GeoJSON file with state/province boundaries, we would use it here
    # folium.Choropleth(
    #     geo_data=geojson_path,  # Provide the path to a geojson file with state/province boundaries
    #     data=player_counts_by_region,
    #     columns=['State_Province', 'Player_Count'],
    #     key_on='feature.properties.NAME',  # Adjust this to match the key in the GeoJSON file
    #     fill_color='YlGnBu',
    #     fill_opacity=0.7,
    #     line_opacity=0.2,
    #     legend_name='Number of Players by State/Province'
    # ).add_to(folium_map)

    # Add markers with tooltips as in the previous map
    marker_cluster = MarkerCluster().add_to(folium_map)
    for lat, lon, tip in zip(data['Latitude'], data['Longitude'], data['Tooltip']):
        folium.Marker(location=[lat, lon], tooltip=tip).add_to(marker_cluster)

    return folium_map

# Create the enhanced map (heatmap + clustered markers; choropleth still disabled)
enhanced_player_map = create_choropleth_heatmap_map(merged_df)

# Save the map to an HTML file for visualization
# enhanced_map_file_path = '/mnt/data/college_hockey_player_origin_map_enhanced.html'
enhanced_map_file_path = os.path.join('..', 'TEMP', 'player_origin_map_v2.html')

# Create the output folder first so that a missing ../TEMP directory
# does not abort the save
os.makedirs(os.path.dirname(enhanced_map_file_path), exist_ok=True)
enhanced_player_map.save(enhanced_map_file_path)


