DATA COLLECTION : Most dense cities

In [90]:
import pandas as pd

url = "https://en.wikipedia.org/wiki/List_of_cities_proper_by_population_density"
tables = pd.read_html(url)

demographics_per_cities = tables[3]

# Flatten MultiIndex columns by joining with underscore or take second level if exists
if isinstance(demographics_per_cities.columns, pd.MultiIndex):
    demographics_per_cities.columns = ['_'.join([str(i) for i in col]).strip('_') for col in demographics_per_cities.columns]

# Now rename columns accordingly (use the new names from flattening)
demographics_per_cities = demographics_per_cities.rename(columns={
    'City_City': 'city',
    'Country_Country': 'country',
    'Population_Population': 'population',
    'Area_km2': 'area_km2',
    'Density_/km2': 'density_km2',
})

# Convert columns to numeric after replacing commas
demographics_per_cities['population'] = pd.to_numeric(demographics_per_cities['population'].astype(str).str.replace(',', ''), errors='coerce')
demographics_per_cities['area_km2'] = pd.to_numeric(demographics_per_cities['area_km2'].astype(str).str.replace(',', ''), errors='coerce')
demographics_per_cities['density_km2'] = pd.to_numeric(demographics_per_cities['density_km2'].astype(str).str.replace(',', ''), errors='coerce')


# Drop rows with missing values
demographics_per_cities_clean = demographics_per_cities.dropna(subset=['population', 'area_km2', 'density_km2'])

# Sort by density and take top 100
top_100_dense_cities = demographics_per_cities_clean.sort_values(by='density_km2', ascending=False).head(100)

from geopy.geocoders import Nominatim
import time

geolocator = Nominatim(user_agent="city_density_script")

def get_lat_lon(city, country):
    try:
        location = geolocator.geocode(f"{city}, {country}", timeout=10)
        if location:
            return location.latitude, location.longitude
        else:
            return None, None
    except:
        return None, None

# Create new columns with lat/lon
top_100_dense_cities['latitude'] = None
top_100_dense_cities['longitude'] = None

for idx, row in top_100_dense_cities.iterrows():
    lat, lon = get_lat_lon(row['city'], row['country'])
    top_100_dense_cities.at[idx, 'latitude'] = lat
    top_100_dense_cities.at[idx, 'longitude'] = lon
    time.sleep(1)  # be nice to the API (1 request per second)

print(top_100_dense_cities[['city', 'country', 'latitude', 'longitude']])


print(top_100_dense_cities[['city', 'country', 'population', 'area_km2', 'density_km2']])


                  city      country   latitude   longitude
0                 Giza        Egypt  29.987075   31.211806
1               Manila  Philippines  14.590449  120.980362
2   Croix-des-Bouquets        Haiti  18.577544  -72.229622
10      Port-au-Prince        Haiti  18.547327  -72.339593
3          Mandaluyong  Philippines  14.577439  121.033897
..                 ...          ...        ...         ...
89           Gaza City    Palestine  31.506587   34.461552
90      Kiryat Motzkin       Israel  32.836375   35.075211
91              Geneva  Switzerland  46.201756    6.146601
92             Portici        Italy  40.818973   14.338745
93             Karachi     Pakistan  24.854684   67.020706

[94 rows x 4 columns]
                  city      country  population  area_km2  density_km2
0                 Giza        Egypt     4432915        98        45050
1               Manila  Philippines     1846513        43        43062
2   Croix-des-Bouquets        Haiti      231077         

DATA COLLECTION : Air quality measurement for top_100_dense_cities for 3 years back

In [101]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree

import requests
import pandas as pd

# Your API key
headers = {
    "X-API-Key": "56b6c0a8191ecdb239dbaed764c0e295e80fb1e3e8325d73b80c79a627613d78"
}

# API endpoint for latest PM2.5 measurements per location
url = "https://api.openaq.org/v3/parameters/2/latest"

# Parameters: PM2.5 parameter, max 1000 records
params = {
    "limit": 1000
}

# Request data
response = requests.get(url, headers=headers, params=params)

if response.status_code == 200:
    data = response.json()
    results = data.get('results', [])
    
    # Convert results to DataFrame
    df = pd.json_normalize(results)

    # Cache to avoid repeated requests for the same locationId
    location_cache = {}

# Convert lat/lon to radians (needed for haversine)
def to_radians(df, lat_col, lon_col):
    # Ensure columns are numeric
    df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce')
    df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce')
    df_clean = df.dropna(subset=[lat_col, lon_col])
    return np.radians(df_clean[[lat_col, lon_col]].values), df_clean.index, df_clean

# Prepare city coordinates
city_coords_rad, city_idx, clean_cities = to_radians(top_100_dense_cities, 'latitude', 'longitude')

# Build BallTree with haversine metric
tree = BallTree(city_coords_rad, metric='haversine')

# Prepare sensor coordinates
sensor_coords_rad, sensor_idx, df_clean = to_radians(df, 'coordinates.latitude', 'coordinates.longitude')

# Query nearest city for each sensor point
dist, idx = tree.query(sensor_coords_rad, k=1)

# Convert distance from radians to kilometers
dist_km = dist.flatten() * 6371

# Add nearest city info
df_clean['nearest_city'] = top_100_dense_cities.iloc[city_idx[idx.flatten()]]['city'].values
df_clean['distance_to_city_km'] = dist_km

# Optional: filter
df_filtered = df_clean[df_clean['distance_to_city_km'] < 50]

print(df_filtered.head())


         value  sensorsId  locationsId          datetime.utc  \
28   26.000000    8539667      2623591  2024-04-12T22:00:00Z   
29    4.000000       3587         2020  2023-06-30T13:00:00Z   
37   23.300000      36317        10851  2022-10-16T15:45:00Z   
88    0.900000       1103          628  2025-06-27T16:00:00Z   
102   2.269388    8433565      2577003  2024-10-06T12:00:00Z   

                datetime.local  coordinates.latitude  coordinates.longitude  \
28   2024-04-13T07:00:00+09:00             37.404167             126.726111   
29   2023-06-30T07:00:00-06:00             19.532900             -99.030300   
37   2022-10-16T21:15:00+05:30             22.581570              88.410025   
88   2025-06-27T12:00:00-04:00             40.726900             -73.893300   
102  2024-10-06T14:00:00+02:00             40.837158              14.219318   

     nearest_city  distance_to_city_km  
28          Seoul            28.664357  
29      Kathmandu            13.957253  
37       Caloocan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['nearest_city'] = top_100_dense_cities.iloc[city_idx[idx.flatten()]]['city'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['distance_to_city_km'] = dist_km


Public Transportation

In [110]:
import requests
import pandas as pd
import time

api_key = "k0AeovUG49nRbi2PBa40kjxZCT6xmoNV"
base_url = "https://transit.land/api/v2/rest/routes"

all_routes = []

# Correct way to iterate over rows
for _, row in top_100_dense_cities.iterrows():
    lat = row["latitude"]
    lon = row["longitude"]
    city_name = row["city"]

    # Skip if coordinates are missing
    if pd.isna(lat) or pd.isna(lon):
        print(f"Skipping {city_name} due to missing coordinates.")
        continue

    params = {
        "lat": lat,
        "lon": lon,
        "radius": 10000,  # meters
        "per_page": 100,
        "apikey": api_key
    }

    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        data = response.json()
        routes = data.get("routes", [])

        for r in routes:
            r["queried_city"] = city_name
            all_routes.append(r)
        
        print(f"Fetched {len(routes)} routes for {city_name}")
    else:
        print(f"Failed to fetch routes for {city_name} (status {response.status_code})")

    # Be nice to the API
    time.sleep(1)

# Convert to DataFrame
df_routes = pd.DataFrame(all_routes)

print(df_routes.head())
print(f"Total routes fetched: {len(df_routes)}")


Fetched 3 routes for Giza
Fetched 20 routes for Manila
Fetched 0 routes for Croix-des-Bouquets
Fetched 0 routes for Port-au-Prince
Fetched 20 routes for Mandaluyong
Fetched 0 routes for Malé
Fetched 0 routes for Dhaka
Fetched 20 routes for Bnei Brak
Fetched 20 routes for Caloocan
Fetched 0 routes for Kolkata
Fetched 20 routes for Makati
Fetched 2 routes for Neapoli, Thessaloniki
Fetched 20 routes for Levallois-Perret
Fetched 0 routes for Guédiawaye
Fetched 20 routes for Montrouge
Fetched 20 routes for Bogotá
Fetched 20 routes for Vincennes
Fetched 20 routes for Le Pré-Saint-Gervais
Fetched 20 routes for Pasig
Fetched 20 routes for Saint-Mandé
Fetched 2 routes for La Plata
Fetched 20 routes for Saint-Josse-ten-Noode
Fetched 0 routes for Guttenberg
Fetched 20 routes for Malabon
Fetched 20 routes for Pasay
Fetched 0 routes for Damascus
Fetched 20 routes for San Juan, Metro Manila
Fetched 20 routes for Navotas
Fetched 0 routes for Asmara
Fetched 20 routes for Mislata
Fetched 20 routes for 

Cultural Life

In [111]:
import requests
import pandas as pd
import time

API_KEY = "Xw5BJzVrGCt9FM1guInPANLtTG1GQPsN"
url = "https://app.ticketmaster.com/discovery/v2/events.json"

all_events = []

for _, row in top_100_dense_cities.iterrows():
    city = row['city']
    country = row['country']

    params = {
        "apikey": API_KEY,
        "city": city,
        "countryCode": "",  # Optional, leave empty or add country codes if needed
        "startDateTime": "2025-01-01T00:00:00Z",
        "endDateTime": "2025-12-31T23:59:59Z",
        "size": 100,
        "page": 0
    }

    response = requests.get(url, params=params)
    if response.status_code == 200:
        data = response.json()
        events = data.get('_embedded', {}).get('events', [])

        print(f"{len(events)} events retrieved for {city}")
        for event in events:
            event_info = {
                "city": city,
                "event_name": event.get("name"),
                "date": event.get("dates", {}).get("start", {}).get("localDate"),
                "venue": event.get("_embedded", {}).get("venues", [{}])[0].get("name"),
            }
            all_events.append(event_info)
    else:
        print(f"Failed to fetch events for {city}: {response.status_code} {response.text}")

    time.sleep(1)  # Rate limiting

# Convert all events to DataFrame
df_events = pd.DataFrame(all_events)
print(df_events.head())
print(f"Total events collected: {len(df_events)}")


0 events retrieved for Giza
0 events retrieved for Manila
0 events retrieved for Croix-des-Bouquets
0 events retrieved for Port-au-Prince
0 events retrieved for Mandaluyong
0 events retrieved for Malé
0 events retrieved for Dhaka
0 events retrieved for Bnei Brak
0 events retrieved for Caloocan
0 events retrieved for Kolkata
0 events retrieved for Makati
0 events retrieved for Neapoli, Thessaloniki
0 events retrieved for Levallois-Perret
0 events retrieved for Guédiawaye
0 events retrieved for Montrouge
1 events retrieved for Bogotá
0 events retrieved for Vincennes
0 events retrieved for Le Pré-Saint-Gervais
0 events retrieved for Pasig
0 events retrieved for Saint-Mandé
0 events retrieved for La Plata
0 events retrieved for Saint-Josse-ten-Noode
0 events retrieved for Guttenberg
0 events retrieved for Malabon
0 events retrieved for Pasay
0 events retrieved for Damascus
43 events retrieved for San Juan, Metro Manila
0 events retrieved for Navotas
0 events retrieved for Asmara
0 events r