##### Ticketmaster

In [None]:
import requests
import pandas as pd
import time

In [None]:
API_KEY = "VDCKChATdZdyj9wpHuG5eHaeNNiGn49s"

In [None]:
city = "Chicago"
start_date = "2024-01-01T00:00:00Z"
end_date = "2025-01-01T00:00:00Z"

url = "https://app.ticketmaster.com/discovery/v2/events.json"

params = {
    "apikey": API_KEY,
    "city": city,
    "countryCode": "US",
    "startDateTime": start_date,
    "endDateTime": end_date,
    "sort": "date,asc",
    "size": 200,
    "page": 0
}

In [None]:
import time
import math
import requests
import pandas as pd


cities = [
    "New York", "Los Angeles", "Chicago", "Houston", "Phoenix",
    "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose",
    "Austin", "Jacksonville", "Fort Worth", "Columbus", "Charlotte",
    "San Francisco", "Indianapolis", "Seattle", "Denver", "Washington",
    "Boston", "El Paso", "Nashville", "Detroit", "Oklahoma City",
    "Portland", "Las Vegas", "Memphis", "Louisville", "Baltimore",
    "Milwaukee", "Albuquerque", "Tucson", "Fresno", "Mesa",
    "Sacramento", "Atlanta", "Kansas City", "Colorado Springs", "Miami",
    "Raleigh", "Omaha", "Long Beach", "Virginia Beach", "Oakland",
    "Minneapolis", "Tulsa", "Arlington", "New Orleans", "Wichita",
    "Cleveland", "Tampa", "Bakersfield", "Aurora", "Honolulu",
    "Anaheim", "Santa Ana", "Corpus Christi", "Riverside", "Lexington",
    "St. Louis", "Stockton", "Pittsburgh", "Anchorage", "Cincinnati",
    "Henderson", "Greensboro", "Plano", "Lincoln", "Buffalo"
]

start_date = "2024-01-01T00:00:00Z"
end_date   = "2025-01-01T00:00:00Z"

BASE = "https://app.ticketmaster.com/discovery/v2/events.json"

def fetch_city_events(city, max_pages=None, size=200, sleep_s=0.25):
    """Fetch all pages for a city; stop at totalPages or max_pages."""
    page = 0
    out = []
    while True:
        params = {
            "apikey": API_KEY,
            "city": city,
            "countryCode": "US",
            "startDateTime": start_date,
            "endDateTime": end_date,
            "sort": "date,asc",
            "size": size,
            "page": page,
        }
        r = requests.get(BASE, params=params)
        if r.status_code == 429:  # rate-limited, backoff and retry
            time.sleep(1.5)
            continue
        if r.status_code != 200:
            # Fail soft for this city
            print(f"⚠️ {city}: HTTP {r.status_code}, stopping.")
            break

        data = r.json()
        events = data.get("_embedded", {}).get("events", [])
        if not events:
            break

        out.extend(events)

        # Pagination info
        page_info = data.get("page", {})  # has number, size, totalElements, totalPages
        total_pages = page_info.get("totalPages")
        current = page_info.get("number")

        # Stop rules
        if max_pages is not None and page + 1 >= max_pages:
            break
        if total_pages is not None and current is not None and current >= total_pages - 1:
            break

        page += 1
        time.sleep(sleep_s)  # keep under ~5 req/s
    return out

# --- Fetch all cities ---
raw = []
for c in cities:
    print(f"Fetching: {c}")
    raw += fetch_city_events(c)

# --- Deduplicate by Ticketmaster event ID ---
# Same event could appear across cities lists in edge cases; use 'id' as stable key
by_id = {}
for e in raw:
    eid = e.get("id")
    if eid and eid not in by_id:
        by_id[eid] = e

events_unique = list(by_id.values())

# --- Normalize into a DataFrame safely ---
rows = []
for e in events_unique:
    venues = e.get("_embedded", {}).get("venues", [{}])
    v0 = venues[0] if venues else {}
    rows.append({
        "EventID": e.get("id"),
        "Name": e.get("name"),
        "DateLocal": e.get("dates", {}).get("start", {}).get("localDate"),
        "DateTimeLocal": e.get("dates", {}).get("start", {}).get("localDateTime"),
        "Venue": v0.get("name"),
        "City": (v0.get("city", {}) or {}).get("name"),
        "State": (v0.get("state", {}) or {}).get("stateCode"),
        "Country": (v0.get("country", {}) or {}).get("countryCode"),
        "URL": e.get("url", "N/A"),
        "Source": e.get("source")
    })

df = pd.DataFrame(rows)

print(f"✅ Unique events retrieved: {len(df)}")
display(df.head())

# --- Quick diagnostics ---
print("\nTop 10 cities by count:")
display(df.groupby("City").size().sort_values(ascending=False).head(10).rename("Count").to_frame())

# Optional: save
# df.to_csv("ticketmaster_events_2024_multicity.csv", index=False)


#### Import packages

In [1]:
import os
import time
import pandas as pd
import requests

#### Get OSM restaurant and venue features

In [4]:
cities = pd.read_excel('cities.xlsx',header=None,names=['city'])

In [26]:
cities["city_name"] = cities["city"].str.split(",").str[0].str.strip()
cities["state_code"] = cities["city"].str.split(",").str[1].str.strip()

cities.head()

Unnamed: 0,city,city_name,state_code
1,"New York, NY",New York,NY
2,"Los Angeles, CA",Los Angeles,CA
3,"Chicago, IL",Chicago,IL
4,"Houston, TX",Houston,TX
5,"Phoenix, AZ",Phoenix,AZ


In [6]:
OVERPASS_URL = "https://overpass-api.de/api/interpreter"

def run_overpass_query(query, timeout=180, max_retries=5):
    for attempt in range(max_retries):
        try:
            r = requests.post(OVERPASS_URL, data={"data": query}, timeout=timeout+30)
            r.raise_for_status()
            return r.json()
        except requests.exceptions.HTTPError as e:
            status = e.response.status_code
            # transient / rate-limit / overload errors
            if status in (429, 502, 503, 504):
                wait = 3 * (attempt + 1)
                print(f"    Overpass HTTP {status} on attempt {attempt+1}, sleeping {wait}s then retrying...")
                time.sleep(wait)
                continue
            else:
                print(f"    Fatal HTTP error {status}: {e}")
                break
        except requests.exceptions.RequestException as e:
            # network glitches, timeouts, etc
            wait = 3 * (attempt + 1)
            print(f"    Request error on attempt {attempt+1}: {e}. Sleeping {wait}s then retrying...")
            time.sleep(wait)
            continue

    print("    Giving up on this query, returning empty result.")
    return {"elements": []}

#### Function to get restaurants data

In [11]:
def get_osm_restaurants(city_name, state_code, timeout=180):
    """
    city_name: cleaned city string, e.g. 'Columbus'
    state_code: two-letter state code, e.g. 'OH'
    """
    query = f"""
    [out:json][timeout:{timeout}];

    // State area (admin_level=4) via ISO3166-2 code, e.g. US-OH
    area["ISO3166-2"="US-{state_code}"]["boundary"="administrative"]["admin_level"=4]->.state;

    // City area inside that state
    area["name"="{city_name}"]["boundary"="administrative"]["admin_level"~"5|6|7|8"](area.state)->.a;

    (
      node["amenity"="restaurant"](area.a);
      way["amenity"="restaurant"](area.a);
      relation["amenity"="restaurant"](area.a);
    );
    out center tags;
    """

    data = run_overpass_query(query, timeout=timeout)
    els = data.get("elements", [])
    rows = []
    for el in els:
        t = el.get("tags", {}) or {}

        if el["type"] == "node":
            lat, lon = el.get("lat"), el.get("lon")
        else:
            c = el.get("center") or {}
            lat, lon = c.get("lat"), c.get("lon")

        rows.append({
            "osm_id": el.get("id"),
            "osm_type": el.get("type"),
            "name": t.get("name"),
            "cuisine": t.get("cuisine"),   # may be "thai;chinese"
            "addr": ", ".join(
                [t.get(k, "") for k in ("addr:housenumber", "addr:street", "addr:city") if t.get(k)]
            ),
            "phone": t.get("contact:phone") or t.get("phone"),
            "website": t.get("contact:website") or t.get("website"),
            "opening_hours": t.get("opening_hours"),
            "lat": lat,
            "lon": lon,
            "city_name": city_name,
            "state_code": state_code,
        })
    return pd.DataFrame(rows)

#### Function to get venue data

In [14]:
def get_osm_venues(city_name, state_code, timeout=180):
    query = f"""
    [out:json][timeout:{timeout}];

    // State area
    area["ISO3166-2"="US-{state_code}"]["boundary"="administrative"]["admin_level"=4]->.state;

    // City area inside that state
    area["name"="{city_name}"]["boundary"="administrative"]["admin_level"~"5|6|7|8"](area.state)->.a;

    (
      node["amenity"~"theatre|cinema|arts_centre|music_venue|bar|pub|nightclub|casino"](area.a);
      way["amenity"~"theatre|cinema|arts_centre|music_venue|bar|pub|nightclub|casino"](area.a);
      relation["amenity"~"theatre|cinema|arts_centre|music_venue|bar|pub|nightclub|casino"](area.a);

      node["leisure"~"stadium|sports_centre|fitness_centre"](area.a);
      way["leisure"~"stadium|sports_centre|fitness_centre"](area.a);
      relation["leisure"~"stadium|sports_centre|fitness_centre"](area.a);
    );
    out center tags;
    """

    data = run_overpass_query(query, timeout=timeout)
    els = data.get("elements", [])
    rows = []
    for el in els:
        tags = el.get("tags", {}) or {}
        if el["type"] == "node":
            coord = el
        else:
            coord = el.get("center") or {}
        rows.append({
            "name": tags.get("name"),
            "type": tags.get("amenity") or tags.get("leisure"),
            "address": tags.get("addr:full") or tags.get("addr:street"),
            "city_name": city_name,
            "state_code": state_code,
            "lat": coord.get("lat"),
            "lon": coord.get("lon"),
        })
    return pd.DataFrame(rows)

#### Get restaurant and venue data for each city and extract the features

In [17]:
FEATURE_PATH = "osm_city_features.csv"

In [19]:
# Build list of (city_name, state_code) pairs from your cities df
city_pairs = list(zaip(cities["city_name"], cities["state_code"]))

In [21]:
# If this is run before, load previous results and resume
if os.path.exists(FEATURE_PATH):
    city_features = pd.read_csv(FEATURE_PATH)

    # Use both city_name and state_code as key
    done_pairs = set(zip(city_features["city_name"], city_features["state_code"]))
    print(f"Loaded {len(done_pairs)} previously processed city/state pairs.")
else:
    city_features = pd.DataFrame()
    done_pairs = set()

In [23]:
for city_name, state_code in city_pairs:
    key = (city_name, state_code)
    if key in done_pairs:
        print(f"Skipping already processed city: {city_name}, {state_code}")
        continue

    print(f"Processing {city_name}, {state_code}...")

    # Fetch OSM data
    try:
        df_restaurant = get_osm_restaurants(city_name, state_code)
    except Exception as e:
        print(f"  Error getting restaurants for {city_name}, {state_code}: {e}")
        df_restaurant = pd.DataFrame()

    try:
        df_cafe = get_osm_cafes(city_name, state_code)
    except Exception as e:
        print(f"  Error getting cafes for {city_name}, {state_code}: {e}")
        df_cafe = pd.DataFrame()
    

    try:
        df_venue = get_osm_venues(city_name, state_code)
    except Exception as e:
        print(f"  Error getting venues for {city_name}, {state_code}: {e}")
        df_venue = pd.DataFrame()

    # Logging of counts
    n_restaurants = len(df_restaurant)
    n_cafes = len(df_cafe)
    n_venues = len(df_venue)

    if n_restaurants == 0:
        print(f"  WARNING: No restaurants found for {city_name}, {state_code}")
    if n_venues == 0:
        print(f"  WARNING: No venues found for {city_name}, {state_code}")

    # Build feature dict
    features = {
        "city_name": city_name,
        "state_code": state_code,
        "n_restaurants": int(n_restaurants),
        "n_venues": int(n_venues),
    }

    # Restaurants
    if n_restaurants > 0 and "cuisine" in df_restaurant.columns:
        cuisines = (
            df_restaurant["cuisine"]
            .dropna()
            .astype(str)
            .str.split(";")
            .explode()
            .str.strip()
        )

        cuisine_counts = cuisines.value_counts()
        features["n_cuisine"] = int(len(cuisine_counts))

        for cuisine_name, count in cuisine_counts.items():
            if not cuisine_name:
                continue
            col_name = f"n_cuisine_{cuisine_name.replace(' ', '_')}"
            features[col_name] = int(count)

    # Venues
    if n_venues > 0 and "type" in df_venue.columns:
        venue_types = df_venue["type"].dropna().astype(str).str.strip()
        venue_counts = venue_types.value_counts()
        features["n_venue_type"] = int(len(venue_counts))

        for vtype, count in venue_counts.items():
            if not vtype:
                continue
            col_name = f"n_venue_{vtype.replace(' ', '_')}"
            features[col_name] = int(count)

    # Append new row
    row_df = pd.DataFrame([features])
    city_features = pd.concat([city_features, row_df], ignore_index=True)
    city_features = city_features.fillna(0)

    # Save immediately for crash safety
    city_features.to_csv(FEATURE_PATH, index=False)

    # One-line summary
    print(
        f"  - Retrieved {n_restaurants} restaurants, {n_venues} venues "
        f"(Saved features for {city_name}, {state_code}. Current shape: {city_features.shape})"
    )

    # Overpass courtesy delay
    time.sleep(1)


Processing New York, NY...
  - Retrieved 0 restaurants, 0 venues (Saved features for New York, NY. Current shape: (1, 4))
Processing Los Angeles, CA...
  - Retrieved 2709 restaurants, 994 venues (Saved features for Los Angeles, CA. Current shape: (2, 182))
Processing Chicago, IL...
  - Retrieved 2316 restaurants, 1668 venues (Saved features for Chicago, IL. Current shape: (3, 239))
Processing Houston, TX...
  - Retrieved 1489 restaurants, 1018 venues (Saved features for Houston, TX. Current shape: (4, 262))
Processing Phoenix, AZ...
  - Retrieved 867 restaurants, 457 venues (Saved features for Phoenix, AZ. Current shape: (5, 270))
Processing Philadelphia, PA...
  - Retrieved 904 restaurants, 657 venues (Saved features for Philadelphia, PA. Current shape: (6, 285))
Processing San Antonio, TX...
  - Retrieved 1034 restaurants, 503 venues (Saved features for San Antonio, TX. Current shape: (7, 295))
Processing San Diego, CA...
  - Retrieved 1332 restaurants, 712 venues (Saved features for

#### Get NYC features

In [87]:
import pandas as pd
import time

nyc_aliases = ["Manhattan", "Brooklyn", "Queens", "Bronx", "Staten Island"]

def build_new_york_restaurant_venue_features(state_code="NY", timeout=180):
    """
    Aggregate restaurant & venue features for New York City by querying
    its boroughs: Manhattan, Brooklyn, Queens, Bronx, Staten Island.

    Returns a one-row DataFrame with columns:
      city_name, state_code,
      n_restaurants, n_venues,
      n_cuisine, n_venue_type,
      n_cuisine_*, n_venue_*
    """
    all_restaurants = []
    all_venues = []

    # --- gather data from each borough ---
    for borough in nyc_aliases:
        print(f"  Borough: {borough}, {state_code}...")

        # Restaurants
        try:
            df_r = get_osm_restaurants(borough, state_code)
        except Exception as e:
            print(f"    Error getting restaurants for {borough}, {state_code}: {e}")
            df_r = pd.DataFrame()

        if not df_r.empty:
            df_r = df_r.copy()
            df_r["borough"] = borough
            all_restaurants.append(df_r)

        # Venues
        try:
            df_v = get_osm_venues(borough, state_code)
        except Exception as e:
            print(f"    Error getting venues for {borough}, {state_code}: {e}")
            df_v = pd.DataFrame()

        if not df_v.empty:
            df_v = df_v.copy()
            df_v["borough"] = borough
            all_venues.append(df_v)

        # gentle delay for Overpass
        time.sleep(1)

    # --- concatenate across boroughs ---
    if all_restaurants:
        df_restaurant = pd.concat(all_restaurants, ignore_index=True)
    else:
        df_restaurant = pd.DataFrame()

    if all_venues:
        df_venue = pd.concat(all_venues, ignore_index=True)
    else:
        df_venue = pd.DataFrame()

    n_restaurants = len(df_restaurant)
    n_venues = len(df_venue)

    print(f"  Total NYC restaurants: {n_restaurants}")
    print(f"  Total NYC venues: {n_venues}")

    # --- build feature dict ---
    features = {
        "city_name": "New York",
        "state_code": state_code,
        "n_restaurants": int(n_restaurants),
        "n_venues": int(n_venues),
    }

    # ----- cuisine features (restaurants) -----
    if n_restaurants > 0 and "cuisine" in df_restaurant.columns:
        cuisines = (
            df_restaurant["cuisine"]
            .dropna()
            .astype(str)
            .str.split(";")
            .explode()
            .str.strip()
        )
        cuisine_counts = cuisines.value_counts()
        features["n_cuisine"] = int(len(cuisine_counts))

        for cuisine_name, count in cuisine_counts.items():
            if not cuisine_name:
                continue
            col = f"n_cuisine_{cuisine_name.replace(' ', '_')}"
            features[col] = int(count)
    else:
        features["n_cuisine"] = 0

    # ----- venue-type features (venues) -----
    if n_venues > 0 and "type" in df_venue.columns:
        venue_types = df_venue["type"].dropna().astype(str).str.strip()
        venue_counts = venue_types.value_counts()
        features["n_venue_type"] = int(len(venue_counts))

        for vtype, count in venue_counts.items():
            if not vtype:
                continue
            col = f"n_venue_{vtype.replace(' ', '_')}"
            features[col] = int(count)
    else:
        features["n_venue_type"] = 0

    # return as a one-row DataFrame
    new_york_features = pd.DataFrame([features])
    return new_york_features

In [89]:
# Build New York restaurant + venue features
new_york_features = build_new_york_restaurant_venue_features()

new_york_features.head()

  Borough: Manhattan, NY...
  Borough: Brooklyn, NY...
  Borough: Queens, NY...
    Overpass HTTP 429 on attempt 1, sleeping 3s then retrying...
    Overpass HTTP 429 on attempt 2, sleeping 6s then retrying...
  Borough: Bronx, NY...
    Overpass HTTP 429 on attempt 1, sleeping 3s then retrying...
    Overpass HTTP 429 on attempt 2, sleeping 6s then retrying...
    Overpass HTTP 504 on attempt 3, sleeping 9s then retrying...
  Borough: Staten Island, NY...
    Overpass HTTP 429 on attempt 1, sleeping 3s then retrying...
  Total NYC restaurants: 7580
  Total NYC venues: 3492


Unnamed: 0,city_name,state_code,n_restaurants,n_venues,n_cuisine,n_cuisine_chinese,n_cuisine_pizza,n_cuisine_italian,n_cuisine_mexican,n_cuisine_japanese,...,n_venue_dojo,n_venue_events_venue,n_venue_cafe;bar,n_venue_cafe,n_venue_juice_bar,n_venue_barber,n_venue_fast_food,n_venue_disused:bar,n_venue_casino,n_venue_toilets
0,New York,NY,7580,3492,292,695,642,627,498,451,...,2,2,1,1,1,1,1,1,1,1


####  Merge and clean

In [101]:
df = pd.read_csv('osm_city_features.csv')
df = df.drop_duplicates()
df.head()

Unnamed: 0,city_name,state_code,n_restaurants,n_venues,n_cuisine,n_cuisine_mexican,n_cuisine_pizza,n_cuisine_american,n_cuisine_japanese,n_cuisine_thai,...,"n_cuisine_american,seafood",n_venue_food_sharing;public_bookcase,n_cuisine__drinks,n_cuisine_brazilian_steakhouse,n_cuisine_south_californian,n_cuisine_Guyanese_Cuisines,n_venue_townhall,n_cuisine_trucker,n_cuisine_pizzeria,n_cuisine_Persian
0,New York,NY,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Los Angeles,CA,2709,994,163.0,278.0,164.0,160.0,134.0,131.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Chicago,IL,2316,1668,154.0,293.0,211.0,207.0,64.0,70.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Houston,TX,1489,1018,111.0,167.0,108.0,128.0,32.0,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Phoenix,AZ,867,457,74.0,157.0,94.0,102.0,17.0,21.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [113]:
# Identify the New York row in df
mask_df = (df["city_name"] == "New York")  # add state if needed

# Identify the New York row in new_york_features
mask_ny = (new_york_features["city_name"] == "New York")

# Sanity check
assert mask_df.sum() == 1, "New York should appear exactly once in df"
assert mask_ny.sum() == 1, "New York should appear exactly once in new_york_features"

# ------------------------------
#   Align columns across datasets
# ------------------------------
shared_cols = [
    c for c in df.columns
    if c in new_york_features.columns and c not in ["city_name", "state_code"]
]

# Extract NY rows aligned to shared columns
df_ny = df.loc[mask_df, shared_cols]
nyfix_ny = new_york_features.loc[mask_ny, shared_cols]

# ------------------------------
#   Build overwrite mask
#   df value == 0   AND   new_york_features value != 0
# ------------------------------
overwrite_mask = (df_ny == 0) & (nyfix_ny != 0)

# ------------------------------
#   Apply overwrite only where needed
# ------------------------------
df.loc[mask_df, shared_cols] = df_ny.where(~overwrite_mask, nyfix_ny)
df.head()


Unnamed: 0,city_name,state_code,n_restaurants,n_venues,n_cuisine,n_cuisine_mexican,n_cuisine_pizza,n_cuisine_american,n_cuisine_japanese,n_cuisine_thai,...,"n_cuisine_american,seafood",n_venue_food_sharing;public_bookcase,n_cuisine__drinks,n_cuisine_brazilian_steakhouse,n_cuisine_south_californian,n_cuisine_Guyanese_Cuisines,n_venue_townhall,n_cuisine_trucker,n_cuisine_pizzeria,n_cuisine_Persian
0,New York,NY,7580,3492,292.0,498.0,642.0,388.0,451.0,258.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Los Angeles,CA,2709,994,163.0,278.0,164.0,160.0,134.0,131.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Chicago,IL,2316,1668,154.0,293.0,211.0,207.0,64.0,70.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Houston,TX,1489,1018,111.0,167.0,108.0,128.0,32.0,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Phoenix,AZ,867,457,74.0,157.0,94.0,102.0,17.0,21.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [115]:
threshold = 0.9
keep_cols = (df == 0).mean() < threshold

df_reduced = df.loc[:, keep_cols]

print("Original columns:", df.shape[1])
print("Remaining columns:", df_reduced.shape[1])

Original columns: 1060
Remaining columns: 86


In [117]:
df_reduced.columns

Index(['city_name', 'state_code', 'n_restaurants', 'n_venues', 'n_cuisine',
       'n_cuisine_mexican', 'n_cuisine_pizza', 'n_cuisine_american',
       'n_cuisine_japanese', 'n_cuisine_thai', 'n_cuisine_sushi',
       'n_cuisine_italian', 'n_cuisine_chinese', 'n_cuisine_korean',
       'n_cuisine_burger', 'n_cuisine_mediterranean', 'n_cuisine_breakfast',
       'n_cuisine_seafood', 'n_cuisine_asian', 'n_cuisine_indian',
       'n_cuisine_barbecue', 'n_cuisine_sandwich', 'n_cuisine_steak_house',
       'n_cuisine_vietnamese', 'n_cuisine_noodle', 'n_cuisine_chicken',
       'n_cuisine_ramen', 'n_cuisine_french', 'n_cuisine_pasta',
       'n_cuisine_filipino', 'n_cuisine_coffee_shop', 'n_cuisine_salad',
       'n_cuisine_pancake', 'n_cuisine_peruvian', 'n_cuisine_hawaiian',
       'n_cuisine_greek', 'n_cuisine_regional', 'n_cuisine_poke',
       'n_cuisine_lebanese', 'n_cuisine_kebab', 'n_cuisine_fish',
       'n_cuisine_spanish', 'n_cuisine_diner', 'n_cuisine_tacos',
       'n_cuisine_de

In [119]:
df_reduced.to_csv('osm_city_cleaned.csv')

In [121]:
# Identify feature columns (all numeric except city and state)
feature_cols = df.columns.difference(["city_name", "state_code"])
# Rows where all feature columns are zero
df_zero = df[(df[feature_cols] == 0).all(axis=1)]
df_zero[['city_name','state_code']]

Unnamed: 0,city_name,state_code
54,Honolulu,HI
62,St Paul,MN
68,St Louis,MO
84,St Petersburg,FL
88,Winston Salem,NC
106,Yonkers,NY
113,Port St Lucie,FL
278,Ventura,CA
319,Lees Summit,MO
349,St George,UT
