In [1]:
import pandas as pd
import folium
from folium.plugins import MarkerCluster
import plotly.express as px
from IPython.display import display

In [23]:
# Load the per-city golf metrics and a user-provided city -> coordinates
# lookup table (e.g. data/uscities.csv), attach lat/lon to every city, and keep
# only the cities that could be geocoded.
city_coords = pd.read_csv('../data/uscities.csv')

# Normalise the lookup table to the column names the merge below expects.
# Each rename is guarded by the column it actually renames, and is skipped when
# the target name already exists so no duplicate column can be created.
coord_renames = {}
if 'lng' in city_coords.columns and 'lon' not in city_coords.columns:
    coord_renames['lng'] = 'lon'
# NOTE(review): earlier comments here spoke of `state_id` abbreviations, but the
# code has always mapped `state_name` onto `state`; that behaviour is kept.
# Confirm which form city_golf_metrics.parquet uses in its `state` column.
if 'state_name' in city_coords.columns and 'state' not in city_coords.columns:
    coord_renames['state_name'] = 'state'
city_coords = city_coords.rename(columns=coord_renames)

# Fail early, with a readable message, if the lookup table lacks a column the
# rest of the cell needs (otherwise the merge/dropna raise an opaque KeyError).
merge_cols = ['city', 'state']
coords_use = merge_cols + ['lat', 'lon']
missing_cols = [c for c in coords_use if c not in city_coords.columns]
if missing_cols:
    raise KeyError(
        f"'../data/uscities.csv' is missing required column(s): {missing_cols}"
    )

# Keep exactly one coordinate row per (city, state). De-duplicating on the key
# only (not on all four columns) stops same-named places with different
# coordinates from fanning out the left merge and duplicating metric rows.
coords_lookup = city_coords[coords_use].drop_duplicates(subset=merge_cols)

city_df = (
    pd.read_parquet('../data/processed/city_golf_metrics.parquet')
    # validate= asserts the right-hand side is unique on the merge key.
    .merge(coords_lookup, how='left', on=merge_cols, validate='many_to_one')
    # Cities with the most courses first.
    .sort_values('num_golf_courses', ascending=False)
)

# Cities without a coordinate match cannot be placed on a map; drop them.
city_df_cleaned = city_df.dropna(subset=["lat", "lon"]).reset_index(drop=True)
city_df_cleaned.describe()


Unnamed: 0,num_golf_courses,avg_rating,sum_ratings_count,avg_length_yards,state_golfable,score,rank,lat,lon
count,4571.0,4571.0,4571.0,4293.0,4571.0,4571.0,4571.0,4571.0,4571.0
mean,1.998687,4.150477,389.921461,5485.365362,0.0,0.259832,2809.098447,38.936494,-91.534165
std,2.531638,0.513101,1419.76696,1772.750268,0.0,0.114452,1604.167375,5.052663,14.755054
min,1.0,3.0,1.0,0.0,0.0,0.0,1.0,18.3331,-159.4801
25%,1.0,3.86,3.0,4640.7,0.0,0.197005,1431.5,35.44105,-96.91485
50%,1.0,4.13,33.0,6300.0,0.0,0.260909,2787.0,39.889,-87.7035
75%,2.0,4.5,275.0,6691.0,0.0,0.34184,4205.0,42.2459,-81.44045
max,49.0,5.0,29150.0,10077.0,0.0,0.718229,5585.0,64.8353,-65.6589


In [24]:
# Build an interactive map with one clustered circle marker per geocoded city.
# The map is centred on the mean coordinate of the plotted cities.
center_lat = city_df_cleaned['lat'].mean()
center_lon = city_df_cleaned['lon'].mean()
m = folium.Map(location=[center_lat, center_lon], zoom_start=5, tiles='CartoDB positron')
mc = MarkerCluster().add_to(m)

# Rating bounds used to normalise marker radii; None when the column is absent.
has_rating_col = 'avg_rating' in city_df_cleaned.columns
min_rating = city_df_cleaned['avg_rating'].min() if has_rating_col else None
max_rating = city_df_cleaned['avg_rating'].max() if has_rating_col else None


def _marker_style(row, lo, hi):
    """Return ``(radius, color)`` for one city row.

    When the row has a rating and ``lo``/``hi`` describe a non-degenerate
    range, the radius grows with the normalised rating (roughly 4-24) and the
    colour is banded: >= 4.0 dark green, >= 3.5 orange, otherwise red.
    Otherwise the radius falls back to the square root of the course count
    (plus 4) and the marker is blue.
    """
    rating = row.get('avg_rating')
    scale_usable = lo is not None and hi is not None and hi > lo
    if not (pd.notna(rating) and scale_usable):
        return 4 + (row['num_golf_courses'] ** 0.5), 'blue'

    # The small epsilon keeps the division safe; it is retained as-is.
    fraction = (rating - lo) / (hi - lo + 1e-9)
    if rating >= 4.0:
        tint = 'darkgreen'
    elif rating >= 3.5:
        tint = 'orange'
    else:
        tint = 'red'
    return 4 + fraction * 20, tint


for _, city_row in city_df_cleaned.iterrows():
    marker_radius, marker_color = _marker_style(city_row, min_rating, max_rating)
    popup_html = f"<b>{city_row['city']}, {city_row.get('state','')}</b><br/>Courses: {city_row['num_golf_courses']}<br/>Avg rating: {city_row.get('avg_rating', 'n/a')}"
    marker = folium.CircleMarker(
        location=[city_row['lat'], city_row['lon']],
        radius=marker_radius,
        color=marker_color,
        fill=True,
        fill_opacity=0.7,
        popup=folium.Popup(popup_html, max_width=300),
    )
    marker.add_to(mc)

display(m)