# US City Dot Map (Geocode + Plot)

This notebook:
1. Cleans your city strings
2. Geocodes to latitude/longitude using OpenStreetMap (Nominatim)
3. Plots a dot map over the United States

**Notes**
- Nominatim is rate-limited. This notebook uses a delay to be polite.
- If a metro-area string such as `A/B/C, ST` fails to geocode, the notebook retries with only the first city (`A, ST`).


In [None]:
import re
import time
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from preprocess.data_utils import get_unique_cities
import plotly.express as px


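## 1) Load your cities

The cell below loads the city strings with `get_unique_cities()` from `preprocess.data_utils`. To use your own data instead, replace that call with a list of city strings (e.g. your column values).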


In [None]:
# Load the city strings to geocode from the project helper.
# NOTE(review): presumably returns a sized collection of city strings -- confirm
# against preprocess.data_utils.
cities = get_unique_cities()
# Fail fast with a message that points at the real source of the data, so an
# empty result is not discovered only after the (slow) geocoding step.
assert len(cities) > 0, (
    "No cities to geocode: get_unique_cities() returned an empty result. "
    "Check the input data, or assign your own list of city strings to `cities`."
)

## 2) Cleaning helpers


In [None]:
def normalize_city(s: str) -> str:
    """Strip the '(Metropolitan Area)' tag and tidy whitespace before geocoding.

    Args:
        s: Raw city string, e.g. 'Dallas/Fort Worth (Metropolitan Area), TX'.

    Returns:
        The string with every '(Metropolitan Area)' tag (and the whitespace
        around it) removed, runs of whitespace collapsed to a single space,
        and leading/trailing whitespace trimmed.
    """
    # Drop the tag together with any whitespace hugging it on either side.
    without_tag = re.sub(r"\s*\(Metropolitan Area\)\s*", "", s)
    # Collapse any remaining whitespace runs, then trim the ends.
    collapsed = re.sub(r"\s+", " ", without_tag)
    return collapsed.strip()

def fallback_first_city(s: str) -> str:
    """Reduce a multi-city string 'A/B/C, ST' to its first city, 'A, ST'.

    Args:
        s: City string, typically already normalized.

    Returns:
        '<first city>, <remainder after the first comma>' when ``s`` contains
        both a '/' and a ','; otherwise ``s`` unchanged.
    """
    # Nothing to simplify unless the string has both separators.
    if "/" not in s or "," not in s:
        return s
    # Split once at the first comma: city list on the left, state (and anything
    # after it) on the right.
    city_part, _, state_part = s.partition(",")
    lead_city = city_part.split("/", 1)[0].strip()
    return f"{lead_city}, {state_part.strip()}"


## 3) Geocode to lat/lon

This uses OpenStreetMap's Nominatim. Please keep the delay (or increase it) to avoid being blocked.


In [None]:
geolocator = Nominatim(user_agent="city_dotmap_colab")
# min_delay_seconds ~1.1 keeps you under the typical 1 req/sec guideline
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1.1)

# One record per input city; built as a list of dicts and converted once at the end.
rows = []
for raw in cities:
    cleaned = normalize_city(raw)
    used_query = f"{cleaned}, USA"
    loc = geocode(used_query)

    if loc is None:
        # Retry with only the first city of an 'A/B/C, ST' string -- but only
        # when the fallback actually changes the query. Re-sending the identical
        # query would just spend another rate-limited request on the same miss.
        fallback = fallback_first_city(cleaned)
        if fallback != cleaned:
            used_query = f"{fallback}, USA"
            loc = geocode(used_query)

    rows.append({
        "city_raw": raw,
        "city_clean": cleaned,
        # Last query actually sent (the fallback one if a retry was made).
        "geocode_query": used_query,
        # lat/lon stay None when both attempts failed; later cells filter on this.
        "lat": None if loc is None else loc.latitude,
        "lon": None if loc is None else loc.longitude,
    })

# Explicit columns keep the schema stable even if `rows` is empty.
df_geo = pd.DataFrame(
    rows,
    columns=["city_raw", "city_clean", "geocode_query", "lat", "lon"],
)
df_geo.head()

In [None]:
# Show failures (if any)
# Cities that could not be geocoded (no latitude recorded), with the last query tried
fail = df_geo.loc[df_geo["lat"].isna(), ["city_raw", "geocode_query"]]
print(f"Failed geocodes: {len(fail)}")
fail.head(25)

## 4) Plot dot map over the USA


In [None]:
# Only successfully geocoded cities can be drawn; work on an independent copy.
df_plot = df_geo.dropna(subset=["lat", "lon"]).copy()

# One dot per city on a US-scoped map; hovering shows the original city string.
fig = px.scatter_geo(
    df_plot, lat="lat", lon="lon", hover_name="city_raw",
    scope="usa", projection="albers usa",
)
fig.update_traces(marker={"size": 6})
fig.update_layout(title="US City Dot Map")
fig.show()

## 5) Optional: characterize cities by size/color

If you have a metric per city (e.g., counts, score), merge it into `df_geo` on `city_raw` and pass the metric columns to `px.scatter_geo` via `size=` / `color=`.


In [None]:
# Example (uncomment and adapt):
# `city_metric` holds one row per city; its `city_raw` values must match the
# strings in df_geo exactly, because the merge below joins on that column.
# city_metric = pd.DataFrame({
#     "city_raw": ["Albany, NY", "Austin, TX"],
#     "count": [12, 30],
#     "category": ["A", "B"],
# })
# df2 = df_geo.merge(city_metric, on="city_raw", how="left")
# NOTE(review): with how="left", cities missing from city_metric get NaN in
# `count`/`category`. plotly may reject NaN marker sizes -- if so, add "count"
# to the dropna subset below (confirm).
# df2 = df2.dropna(subset=["lat", "lon"])
# fig = px.scatter_geo(
#     df2,
#     lat="lat", lon="lon",
#     hover_name="city_raw",
#     size="count",
#     color="category",
#     scope="usa",
#     projection="albers usa",
# )
# fig.show()

In [None]:
# Save geocoded results for reuse
# Persist the geocoding results (including failed rows) so they can be reused
# without querying the geocoder again.
out_path = "geocoded_cities.csv"
df_geo.to_csv(path_or_buf=out_path, index=False)
print("Saved:", out_path)