In [1]:
import geopandas as gpd
import numpy as np

In [2]:
# Read file
city_proper = gpd.read_file('./data/city_proper_over_1000.geojson')
hsr_countries = [
    "China",
    "Spain",
    "France",
    "Germany",
    "Japan",
    "Italy",
    "United Kingdom",
    "Korea, Republic of",
    "Turkey",
    "Finland",
    "Sweden",
    "Uzbekistan",
    "United States",
    "Greece",
    "Russia",
    "Saudi Arabia",
    "Taiwan, China",
    "Austria",
    "Portugal",
    "Poland",
    "Belgium",
    "Morocco",
    "Switzerland",
    "Indonesia",
    "Norway",
    "Netherlands",
    "Serbia",
    "Denmark",
    "Hong Kong, China",
]

In [3]:
city_proper = city_proper[
    [
        "geoname_id",
        "name",
        "ascii_name",
        "country_code",
        "cou_name_en",
        "population",
        "geometry",
    ]
]

city_proper["geoname_id"] = city_proper["geoname_id"].astype("string")
city_proper["name"] = city_proper["name"].astype("string")
city_proper["ascii_name"] = city_proper["ascii_name"].astype("string")
city_proper["country_code"] = city_proper["country_code"].astype("string")
city_proper["cou_name_en"] = city_proper["cou_name_en"].astype("string")
city_proper["population"] = city_proper["population"].astype(int)

In [4]:
city_proper["cou_name_en"] = city_proper["cou_name_en"].fillna("")
city_proper = city_proper[city_proper["cou_name_en"].apply(lambda x: x in hsr_countries)].reset_index(drop=True)
city_proper.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 79439 entries, 0 to 79438
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   geoname_id    79439 non-null  string  
 1   name          79439 non-null  string  
 2   ascii_name    79439 non-null  string  
 3   country_code  79439 non-null  string  
 4   cou_name_en   79439 non-null  string  
 5   population    79439 non-null  int32   
 6   geometry      79439 non-null  geometry
dtypes: geometry(1), int32(1), string(5)
memory usage: 3.9 MB


In [5]:
# Take a subset of only cities with population greater than 50K
city_over_50 = city_proper[city_proper["population"] >= 75000]
city_below_50 = city_proper[(city_proper["population"] < 50000) & (city_proper["population"] > 5000)]

# Want to calculate dist between every city over 50K and every city below 50K
crossed = city_over_50.merge(
    city_below_50,
    how="cross",
)
crossed = gpd.GeoDataFrame(crossed)

In [8]:
crossed.set_geometry("geometry_x")
crossed["geometry_x"] = gpd.GeoSeries(crossed["geometry_x"]).to_crs("epsg:3857")
crossed["geometry_y"] = gpd.GeoSeries(crossed["geometry_y"]).to_crs("epsg:3857")
crossed["dist"] = crossed["geometry_x"].distance(crossed["geometry_y"])

In [13]:
# Save smaller dataset for later if needed
crossed[crossed["dist"] < 80467.2].to_csv("./data/under_50_mi.csv")

In [31]:
# Select for city pairs closer than 20 miles to main city (over 75K)
valid = crossed[crossed["dist"] < 32186.9]
# If the second (smaller) city is matched to multiple main cities, only keep match with largest main city
valid = valid.sort_values(by="population_x", ascending=False)
valid = valid.drop_duplicates(subset="name_y", keep="first")
valid.head(2)

Unnamed: 0,geoname_id_x,name_x,ascii_name_x,country_code_x,cou_name_en_x,population_x,geometry_x,geoname_id_y,name_y,ascii_name_y,country_code_y,cou_name_en_y,population_y,geometry_y,dist
23354071,1795565,Shenzhen,Shenzhen,CN,China,17494398,POINT (12698025.072 2577151.127),1819003,San Po Kong,San Po Kong,HK,"Hong Kong, China",18199,POINT (12712317.381 2551849.156),29059.591763
23351727,1795565,Shenzhen,Shenzhen,CN,China,17494398,POINT (12698025.072 2577151.127),1819326,Nai Chung,Nai Chung,HK,"Hong Kong, China",5001,POINT (12719090.059 2563370.484),25172.203195


In [32]:
# Groupby main city, sum population, agg list of names together for record and keep largest city as metro name
valid = valid.groupby("name_x").agg({"population_x": "first", "country_code_x": "first", "geometry_x": "first", "population_y": "sum", "name_y": list}).reset_index()
valid["population"] = valid["population_x"] + valid["population_y"]

In [33]:
valid.sort_values(by="population", ascending=False).head(10)

Unnamed: 0,name_x,population_x,country_code_x,geometry_x,population_y,name_y,population
1087,Shenzhen,17494398,CN,POINT (12698025.072 2577151.127),1426550,"[San Po Kong, Nai Chung, San Tin, Tai Mong Tsa...",18920948
521,Istanbul,14804116,TR,POINT (3222661.410 5014383.275),36146,"[Adalar, Yakuplu]",14840262
833,New York City,8804190,US,POINT (-8238306.896 4970287.468),2764156,"[Bergenfield, Cedar Grove, Midland Beach, Far ...",11568346
669,London,8961989,GB,POINT (-13997.313 6711744.580),2587696,"[St. Ann's, Holloway, Woodford Green, New Mald...",11549685
1182,Tianjin,11090314,CN,POINT (13044047.237 4742063.890),77740,"[Dongditou, Qingguang, Beicang, Huantuo, Hanji...",11168054
1080,Seoul,10349312,KR,POINT (14135170.830 4518296.290),12612,[Yongsan-dong],10361924
1188,Tokyo,8336599,JP,POINT (15550410.025 4257980.732),8629,[Hatsudai],8345228
489,Hong Kong,7491609,HK,POINT (12709868.352 2544973.839),538052,"[Laguna City, Shau Kei Wan, Mount Davis, Yau T...",8029661
1153,Taipei,7871900,TW,POINT (13528863.657 2881612.005),56152,"[Shimen, Shiding, Pinglin, Wulai, Sanzhi]",7928052
224,Chongqing,7457599,CN,POINT (11861950.017 3447149.679),439199,"[Mudong, Lijia, Longxing, Ersheng, Nanpeng, Hu...",7896798


In [36]:
valid[valid["name_x"].str.contains("Atlanta")]

Unnamed: 0,name_x,population_x,country_code_x,geometry_x,population_y,name_y,population
69,Atlanta,463878,US,POINT (-9394026.963 3995148.448),490682,"[Candler-McAfee, North Decatur, Doraville, Sco...",954560
