# Divided States of College Hockey
- Workbook for college hockey maps centered on congressional districts

## Setup

In [49]:
# Dependencies

import folium
import random
from folium.plugins import MarkerCluster
from folium import LayerControl
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import pandas as pd
import json
import sys
import os
from PIL import Image
from geopy.distance import geodesic

# Output filename
output_filename = 'congress_map_v1'

# --- State boundaries ---
# NOTE: despite the variable name, this path points at a shapefile (.shp), not a .geojson file
geojson_path = os.path.join('..', 'data', 'vault', 'States_shapefile.shp')
gdf_states = gpd.read_file(geojson_path)

# --- 118th Congress district boundaries ---
# data\vault\118th_congress\USA_118th_Congressional_Districts.shp
congressional_districts_path = os.path.join(
    '..', 'data', 'vault', '118th_congress', 'USA_118th_Congressional_Districts.shp'
)
gdf_congress = gpd.read_file(congressional_districts_path)
# `gdf` is a second name for the SAME object (no copy is made), so the rest of the
# workbook can use the short default name
gdf = gdf_congress

# --- School / arena info table ---
school_info_path = os.path.join('..', 'data', 'arena_school_info.csv')
school_info = pd.read_csv(school_info_path)

# --- Congressional district info table (CSV) ---
# data\vault\USA_118th_Congressional_Districts_info_table.csv
congressional_district_info_path = os.path.join(
    '..', 'data', 'vault', 'USA_118th_Congressional_Districts_info_table.csv'
)
congressional_district_info = pd.read_csv(congressional_district_info_path)


## CHECK SHAPEFILES FOR COMPATIBILITY
# Only a MISSING CRS is filled in; it is assumed to be WGS 84 lat/lon (EPSG:4326).
# A CRS that is already set is left untouched.
for frame in (gdf, gdf_states):
    if frame.crs is None:
        frame.set_crs(epsg=4326, inplace=True)


In [50]:
# Check that the data is loaded correctly
# school_info.head()

# gdf_states.head()

# gdf_congress.head()

# congressional_district_info.head()

In [51]:


# Cast the three colour columns to str so string methods can be used on every value
for hex_column in ('hex1', 'hex2', 'hex3'):
    school_info[hex_column] = school_info[hex_column].astype(str)

# Transform hex color codes to ensure they are valid - add leading 0s if necessary (6 digits)
def fix_hex_color(hex_color):
    """Return ``hex_color`` as a '#'-prefixed string with six characters after the '#'.

    A single leading '#' is stripped if present, the remainder is left-padded
    with zeros to at least six characters, and only the last six characters
    are kept.

    Parameters
    ----------
    hex_color : str
        Colour code, with or without a leading '#'.

    Returns
    -------
    str
        '#' followed by six characters.
    """
    digits = hex_color[1:] if hex_color.startswith('#') else hex_color
    padded = digits.zfill(6)  # short codes gain leading zeros
    return '#' + padded[-6:]  # longer codes keep only their last six characters

# Run every colour column (hex1, hex2, hex3) through fix_hex_color
for hex_column in ('hex1', 'hex2', 'hex3'):
    school_info[hex_column] = school_info[hex_column].apply(fix_hex_color)

# Folder the logo paths are built from: <logo_dir>/<logo_abv>.png
logo_dir = os.path.join('..', 'images', 'logos')


# Lookup keyed by team name: coords as (longitude, latitude), primary colour, logo path
teams = {}
for index, row in school_info.iterrows():
    team_name = row['Team']
    teams[team_name] = dict(
        coords=(row['Longitude'], row['Latitude']),
        color=row['hex1'],
        logo=os.path.join(logo_dir, f"{row['logo_abv']}.png"),
    )

    # Print the entries whose index label is below 5 to verify the result
    if index < 5:
        print(f"{team_name}: {teams[team_name]}")

#### HOTFIX - Changing primary color to hex2 for teams that border other teams with same color
for team_name in ('Omaha', 'Arizona State'):
    is_team = school_info['Team'] == team_name
    # .values[0] takes the first matching row
    teams[team_name]['color'] = school_info.loc[is_team, 'hex2'].values[0]
# print(teams)

# school_info.head()

Air Force: {'coords': (-104.8837269, 39.0137391), 'color': '#003087', 'logo': '..\\images\\logos\\afa.png'}
Alaska: {'coords': (-147.7638406, 64.84212435), 'color': '#236192', 'logo': '..\\images\\logos\\akf.png'}
Alaska Anchorage: {'coords': (-149.8727373, 61.20553644), 'color': '#00583d', 'logo': '..\\images\\logos\\aka.png'}
American Intl: {'coords': (-72.5543263, 42.1180027), 'color': '#000000', 'logo': '..\\images\\logos\\aic.png'}
Arizona State: {'coords': (-111.9108672, 33.4471565), 'color': '#8c1d40', 'logo': '..\\images\\logos\\asu.png'}


In [52]:


#### V_2 #### TAKES ABOUT 4 TIMES LONGER THAN EUCLIDEAN DISTANCE
# Function to get the closest team using geopy's geodesic distance
def get_closest_team(lon, lat, teams):
    """Return the name of the team whose coordinates are nearest to (lon, lat).

    Distance is geopy's geodesic distance in kilometres.

    Parameters
    ----------
    lon, lat : float
        Longitude and latitude of the query point.
    teams : dict
        Maps team name -> dict with a 'coords' entry stored as
        (longitude, latitude).

    Returns
    -------
    The key of the nearest team, or None when ``teams`` is empty. When several
    teams are equally near, the first one encountered is returned.
    """
    origin = (lat, lon)  # geodesic() is called with (lat, lon) order
    best_name, best_km = None, float('inf')
    for name, details in teams.items():
        coords = details['coords']
        # Stored as (lon, lat); swap to (lat, lon) for geodesic()
        km = geodesic(origin, (coords[1], coords[0])).kilometers
        if km < best_km:
            best_name, best_km = name, km
    return best_name

def _nearest_team_for(shape):
    """Return the team closest to the centroid of one district geometry."""
    center = shape.centroid
    return get_closest_team(center.x, center.y, teams)


# Closest team per district, measured from each geometry's centroid
gdf['closest_team'] = gdf.geometry.apply(_nearest_team_for)
# Each district takes the colour stored for its closest team
gdf['color'] = gdf['closest_team'].apply(lambda name: teams[name]['color'])

In [53]:
# Print column names, non-null counts and dtypes
gdf.info()

# Drop the geometry column in place.
# errors='ignore' makes re-running this cell a no-op instead of a KeyError once
# the column is already gone.
# NOTE: `gdf` was assigned from `gdf_congress` without copying, so this also
# removes the geometry from `gdf_congress`.
gdf.drop(columns='geometry', inplace=True, errors='ignore')

# Output Temp File - create the target folder first so the write cannot fail
# just because the folder is missing
temp_csv_path = '../TEMP/temp.csv'
os.makedirs(os.path.dirname(temp_csv_path), exist_ok=True)
gdf.to_csv(temp_csv_path, index=False)

# Preview goes last: only a cell's final expression is rendered by the notebook
gdf.head()


<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 436 entries, 0 to 435
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   DISTRICTID    436 non-null    object  
 1   STFIPS        436 non-null    object  
 2   CDFIPS        436 non-null    object  
 3   STATE_ABBR    436 non-null    object  
 4   NAME          432 non-null    object  
 5   LAST_NAME     432 non-null    object  
 6   PARTY         436 non-null    object  
 7   SQMI          436 non-null    float64 
 8   STATE_NAME    436 non-null    object  
 9   geometry      436 non-null    geometry
 10  closest_team  436 non-null    object  
 11  color         436 non-null    object  
dtypes: float64(1), geometry(1), object(10)
memory usage: 41.0+ KB


### Create Initial Summary Table

In [54]:
# Aggregate from a copy of the district table
data = gdf.copy()

# One entry per output column, in display order: (source column, reducer).
# The two area reducers receive one team's SQMI values and filter them with a
# party mask built from the full `data` table.
summary_spec = {
    'districts_controlled': ('DISTRICTID', 'count'),
    'R_districts': ('PARTY', lambda party: (party == 'Republican').sum()),
    'D_districts': ('PARTY', lambda party: (party == 'Democrat').sum()),
    'total_area': ('SQMI', 'sum'),
    'R_area': ('SQMI', lambda sqmi: sqmi[data['PARTY'] == 'Republican'].sum()),
    'D_area': ('SQMI', lambda sqmi: sqmi[data['PARTY'] == 'Democrat'].sum()),
}

# One row per closest team; reset_index() turns the group key back into a column
summary_table = data.groupby('closest_team').agg(**summary_spec).reset_index()

summary_table.info()
summary_table.head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   closest_team          55 non-null     object 
 1   districts_controlled  55 non-null     int64  
 2   R_districts           55 non-null     int64  
 3   D_districts           55 non-null     int64  
 4   total_area            55 non-null     float64
 5   R_area                55 non-null     float64
 6   D_area                55 non-null     float64
dtypes: float64(3), int64(3), object(1)
memory usage: 3.1+ KB


Unnamed: 0,closest_team,districts_controlled,R_districts,D_districts,total_area,R_area,D_area
0,Air Force,3,2,1,89582.85,82382.73,7200.12
1,Alaska,1,0,1,587042.86,0.0,587042.86
2,Alaska Anchorage,2,0,2,6431.11,0.0,6431.11
3,American Intl,1,0,1,676.17,0.0,676.17
4,Arizona State,72,22,50,552720.72,372774.68,179946.04


In [56]:
# Most districts controlled by a team
most_districts = summary_table.loc[summary_table['districts_controlled'].idxmax()]
print(most_districts)

# Most area controlled by a team
most_area = summary_table.loc[summary_table['total_area'].idxmax()]
print(most_area)
print('<br>')


# Keep only teams controlling MORE THAN TWO districts.
# NOTE(review): this comment used to say "just one district", but the filter is
# `> 2` (teams with one or two districts are excluded) - confirm the threshold.
# .copy() detaches the filtered rows so the column assignments below write to
# their own frame and do not raise SettingWithCopyWarning.
summary_table = summary_table[summary_table['districts_controlled'] > 2].copy()

# Who has the biggest ratio of R districts as a percentage of total districts controlled
summary_table['R_ratio'] = summary_table['R_districts'] / summary_table['districts_controlled']
most_R_districts = summary_table.loc[summary_table['R_ratio'].idxmax()]
print(most_R_districts)

# Who has the biggest ratio of D districts as a percentage of total districts controlled
summary_table['D_ratio'] = summary_table['D_districts'] / summary_table['districts_controlled']
most_D_districts = summary_table.loc[summary_table['D_ratio'].idxmax()]
print(most_D_districts)



closest_team            Arizona State
districts_controlled               72
R_districts                        22
D_districts                        50
total_area                  552720.72
R_area                      372774.68
D_area                      179946.04
R_ratio                      0.305556
D_ratio                      0.694444
Name: 4, dtype: object
closest_team            Arizona State
districts_controlled               72
R_districts                        22
D_districts                        50
total_area                  552720.72
R_area                      372774.68
D_area                      179946.04
R_ratio                      0.305556
D_ratio                      0.694444
Name: 4, dtype: object
<br>
closest_team                Omaha
districts_controlled           12
R_districts                    10
D_districts                     2
total_area              205820.83
R_area                  203095.64
D_area                    2725.19
R_ratio                  0.

In [60]:
## Who controls the most D area in total
# Max D area among teams whose D area is larger than their R area

# Keep a row only when D_area is strictly greater than R_area (ties are dropped too)
d_area_dominant = summary_table['D_area'] > summary_table['R_area']
summary_table = summary_table.loc[d_area_dominant]

# Index label of the largest remaining D_area, then that team's full row
top_d_area_label = summary_table['D_area'].idxmax()
most_D_area = summary_table.loc[top_d_area_label]
print(most_D_area)



closest_team            Penn State
districts_controlled            17
R_districts                      3
D_districts                     14
total_area                43062.87
R_area                    20780.91
D_area                    22281.96
R_ratio                   0.176471
D_ratio                   0.823529
Name: 40, dtype: object
