In [1]:
import json
import os
import re

import numpy as np
import requests
from requests.adapters import HTTPAdapter, Retry
from joblib import Parallel, delayed

# osgeo is deliberately kept ahead of geopandas, exactly as in the original cell order.
from osgeo import gdal
import geopandas as gpd
gdal.SetConfigOption('SHAPE_RESTORE_SHX', 'YES')

import pandas as pd
import foursquare as fsq
import folium
import shapely
from geopy import distance
from tqdm import tqdm

In [2]:
# Study area, option 1 (Shanghai): path of a shapefile describing the city.
# The grid cell uses this whenever it is truthy.
shape_file = "shanghai-provjson.shp"

# Study area, option 2 (Nanjing): bounding-box corners, each as (long, lat).
# Only consulted when `shape_file` is falsy.
top_left = (
    118.39246295229297,  # long
    32.56306709606652,   # lat
)
bottom_right = (
    119.22570190410435,  # long
    31.261649948659116,  # lat
)

In [3]:
r_earth = 6_371_000  # Earth radius used for the conversions below, in meters


def move_lat(lat, d_lat):
    """Return latitude `lat` (degrees) displaced by `d_lat` meters.

    The distance is divided by the Earth radius to get an angle in radians,
    which is then converted to degrees. Positive `d_lat` increases the latitude.
    """
    shift_deg = d_lat / r_earth * 180 / np.pi
    return lat + shift_deg


def move_long(long, d_long, lat):
    """Return longitude `long` (degrees) displaced by `d_long` meters.

    `lat` (degrees) is the latitude at which the displacement happens; the
    angular shift is divided by cos(lat), so the same distance spans more
    degrees of longitude as |lat| grows.
    """
    shift_deg = d_long / r_earth * 180 / np.pi / np.cos(lat * np.pi / 180)
    return long + shift_deg

In [4]:
# Edge length of one grid square, in meters.
GRID_CELL_SIZE_M = 500

if shape_file:
    geo_df = gpd.read_file(shape_file)

    polygons = geo_df.geometry.tolist()
    boundary = gpd.GeoSeries(shapely.ops.unary_union(polygons))
    # total_bounds is (minx, miny, maxx, maxy). Unlike `.exterior`, it also
    # works when the union is a MultiPolygon (e.g. a city with islands).
    min_long, min_lat, max_long, max_lat = boundary.total_bounds

else:
    if top_left is None or bottom_right is None:
        raise ValueError("Please provide either a shape file or the top left and bottom right coordinates.")

    # Both corners are (long, lat). The top-left corner carries the *largest*
    # latitude and the bottom-right one the smallest, so the latitudes must not
    # be unpacked as (min, max) in that order -- doing so yields
    # min_lat > max_lat and an empty grid. Sorting makes the result independent
    # of which corner is which.
    left_long, top_lat = top_left
    right_long, bottom_lat = bottom_right
    min_long, max_long = sorted((left_long, right_long))
    min_lat, max_lat = sorted((bottom_lat, top_lat))

long = min_long
lat = min_lat

columns = {"min_long": [], "min_lat": [], "max_long": [], "max_lat": []}

# Walk the bounding box row by row (south to north) and, inside each row,
# west to east, emitting one GRID_CELL_SIZE_M x GRID_CELL_SIZE_M square per step.
# The last row/column may extend past max_lat/max_long so the box is fully covered.
while lat <= max_lat:
    next_lat = move_lat(lat, GRID_CELL_SIZE_M)

    while long <= max_long:
        # The longitude step depends on the latitude of the current row.
        next_long = move_long(long, GRID_CELL_SIZE_M, lat)

        columns["min_long"].append(long)
        columns["min_lat"].append(lat)
        columns["max_long"].append(next_long)
        columns["max_lat"].append(next_lat)

        long = next_long

    lat = next_lat
    long = min_long  # restart the next row at the western edge

# One row per grid square; the default integer index identifies the square.
squares = pd.DataFrame(columns)

In [None]:
# Read the Foursquare API key from the environment so the secret never has to
# be typed into (and saved with) the notebook. Falls back to an empty string,
# which is what the notebook used before, when the variable is not set.
api_key = os.environ.get("FOURSQUARE_API_KEY", "")

# Place-search endpoint and the headers sent with every request.
url = "https://api.foursquare.com/v3/places/search"
headers = {"accept": "application/json", "Authorization": api_key}

def poi_scraper(coords, radius=50_000, query="7 Days Inn", category=19014):
    """Run one Foursquare place search and return the hits as a DataFrame.

    Parameters
    ----------
    coords : sequence of two numbers
        Search centre, sent as the ``ll`` request parameter in the order given.
        The callers in this notebook pass ``(latitude, longitude)``.
    radius : number
        Search radius, truncated to an int before it is sent
        (presumably meters -- the callers pass meter values).
    query : str
        Free-text query string.
    category : int, str or list
        Category id(s); a list is joined into a comma-separated string.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``name``, ``latitude``, ``longitude``, ``distance``,
        ``address`` and ``genre`` (list of category ids), or ``None`` when the
        response contains no results.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (e.g. a missing/invalid key),
        instead of the opaque ``KeyError: 'results'`` this used to produce.
    """
    if isinstance(category, list):
        category = ",".join(str(c) for c in category)

    # Add search parameters here
    params = {
        "ll": f"{coords[0]},{coords[1]}",
        # "near": "Shanghai",
        "query": query,
        "radius": int(radius),
        "limit": 50,
        "categories": category  # 19014 is the id for hotel
    }

    retries = Retry(backoff_factor=0.1, status_forcelist=[429, 500, 502, 503, 504])
    # The context manager closes the session (and its pooled connections) once
    # the request is done; the timeout keeps a stalled connection from hanging
    # a worker forever.
    with requests.Session() as s:
        s.mount('https://', HTTPAdapter(max_retries=retries))
        response = s.get(url, headers=headers, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    df = pd.DataFrame(data["results"])
    if len(df) == 0:
        return None

    # Flatten the nested fields that the rest of the notebook needs.
    df["latitude"] = df["geocodes"].apply(lambda x: x["main"]["latitude"])
    df["longitude"] = df["geocodes"].apply(lambda x: x["main"]["longitude"])
    df["address"] = df["location"].apply(lambda x: x["formatted_address"])
    df["genre"] = df["categories"].apply(lambda x: [item["id"] for item in x])
    df = df.loc[:, ["name", "latitude", "longitude", "distance", "address", "genre"]]

    return df

POI categories from the paper | Foursquare categories
--- | ---
education | 12009,Community and Government > Education
scenic spots | 16000,Landmarks and Outdoors
sports | 18000,Sports and Recreation
commercial spots | 17000,Retail
financial services | 11042,Business and Professional Services > Financial Service
transport facilities | 19030,Travel and Transportation > Transport Hub

In [None]:
# Category lookup table; the scraping code reads its `category_id` and
# `category_label` columns.
cat_df = pd.read_csv("foursquare_categories.csv")

# POI group name -> (Foursquare category id, category label).
# The insertion order matters: when a POI matches several groups, the first
# matching group in this order is the one assigned.
categories = dict(
    education=(12009, "Community and Government > Education"),
    scenic=(16000, "Landmarks and Outdoors"),
    sports=(18000, "Sports and Recreation"),
    commercial=(17000, "Retail"),
    financial=(11042, "Business and Professional Services > Financial Service"),
    transport=(19030, "Travel and Transportation > Transport Hub"),
)

# A POI is kept only if its address contains at least one of these names.
city_names = ["Shanghai", "上海"]

In [None]:
def parallel_scraping(center, radius, squares):
    """Collect the POIs of all configured categories around `center`.

    Dense areas are subdivided recursively: when a request comes back nearly
    full, the circle is replaced by four circles of half the radius.

    Parameters
    ----------
    center : tuple
        ``(latitude, longitude)`` of the search circle.
    radius : number
        Radius of the search circle (same unit as `move_lat`/`move_long`
        distances, i.e. meters).
    squares : pandas.DataFrame
        Grid with ``min_long``, ``max_long``, ``min_lat``, ``max_lat`` columns;
        its index labels are stored in the ``square`` column of the result.

    Returns
    -------
    list
        ``[]`` when nothing usable was found, ``[df]`` with one DataFrame for a
        leaf circle, or a list of four such (possibly nested) lists when the
        circle was subdivided.
    """
    # `categories` maps name -> (id, label); only the numeric ids may be sent
    # to the API. Passing the tuples themselves would put their string
    # representation into the request.
    category_ids = [cat_id for cat_id, _label in categories.values()]
    df = poi_scraper(center, radius, query="", category=category_ids)
    if df is None:
        return []

    # poi_scraper asks for at most 50 places, so a nearly full page most likely
    # means truncated results. Split into four half-radius circles, unless the
    # children would be smaller than 1 (their radius would be truncated to 0
    # and the recursion would never end for a very dense spot).
    min_child_radius = 1
    if len(df) > 45 and radius / 2 >= min_child_radius:
        offset = radius / (2 * np.sqrt(2))
        return [
            parallel_scraping(
                (move_lat(center[0], d_lat), move_long(center[1], d_long, center[0])),
                radius / 2,
                squares,
            )
            for d_lat in (-offset, offset)
            for d_long in (-offset, offset)
        ]

    # remove pois outside of the city
    in_city = pd.Series(False, index=df.index)
    for city_name in city_names:
        in_city |= df["address"].str.contains(city_name, na=False)
    df = df[in_city].reset_index(drop=True)

    if len(df) == 0:
        return []

    def find_square(poi):
        """Index label of the grid square containing `poi`, or None if outside the grid."""
        hits = squares.index[
            (squares["min_long"] <= poi["longitude"]) & (squares["max_long"] > poi["longitude"]) &
            (squares["min_lat"] <= poi["latitude"]) & (squares["max_lat"] > poi["latitude"])
        ]
        return hits[0] if len(hits) > 0 else None

    def find_category(poi):
        """First group in `categories` whose label pattern matches one of the POI's genres, or None."""
        labels = []
        for genre in poi["genre"]:
            found = cat_df.loc[cat_df["category_id"] == genre, "category_label"].values
            if len(found) > 0:  # ids missing from the lookup table are skipped
                labels.append(found[0])
        for name, (_cat_id, label_pattern) in categories.items():
            if any(re.match(label_pattern, label) for label in labels):
                return name
        return None

    # assign square; POIs that fall outside the grid cannot be counted anywhere
    df["square"] = df.apply(find_square, axis=1)
    df = df[df["square"].notna()].reset_index(drop=True)
    if len(df) == 0:
        return []
    df["square"] = df["square"].astype(squares.index.dtype)

    # assign category; POIs that match none of the configured groups are dropped
    df["category"] = df.apply(find_category, axis=1)
    df = df[df["category"].notna()].reset_index(drop=True)
    if len(df) == 0:
        return []

    return [df]

In [None]:
# Start every category counter at zero.
squares = squares.assign(**{cat: 0 for cat in categories.keys()})

# The bounding box is split into grid_size x grid_size search cells.
grid_size = 100
long_step = (max_long - min_long) / grid_size
lat_step = (max_lat - min_lat) / grid_size


def _search_circle(i, j):
    """Return ``((lat, long), radius)`` of the circle that covers search cell (i, j).

    The circle is centred on the middle of the cell and its radius is half the
    cell diagonal (rounded up), so the whole cell lies inside it. Centring the
    circles on the cell corners 1..grid_size instead leaves the strip along the
    min-lat / min-long edges under-covered and scans past the max edges.
    """
    south_west = (min_lat + i * lat_step, min_long + j * long_step)
    north_east = (min_lat + (i + 1) * lat_step, min_long + (j + 1) * long_step)
    center = (min_lat + (i + 0.5) * lat_step, min_long + (j + 0.5) * long_step)
    radius = int(np.ceil(distance.distance(south_west, north_east).m / 2))
    return center, radius


# One task per search cell; each task returns a (possibly nested) list of frames.
data = Parallel(n_jobs=16)(
    delayed(parallel_scraping)(*_search_circle(i, j), squares)
    for i in tqdm(range(grid_size))
    for j in range(grid_size)
)

100%|██████████| 33/33 [04:27<00:00,  8.11s/it]


In [32]:
def flatten(d):
    """Return the non-list leaves of the arbitrarily nested list `d` as one flat list.

    Every element is inspected on its own: nested lists are expanded
    recursively (empty ones contribute nothing) and any other value is kept
    as-is, in order. An empty `d` yields ``[]`` instead of raising.
    """
    flattened = []
    for item in d:
        if isinstance(item, list):
            flattened.extend(flatten(item))
        else:
            flattened.append(item)
    return flattened
# Run `flatten` over the nested scraping results collected in `data`.
flattened = flatten(data)

In [48]:
# Stack all result frames, renumber the rows, then keep one row per
# (name, latitude, longitude) combination.
df = (
    pd.concat(flattened)
    .reset_index(drop=True)
    .drop_duplicates(subset=["name", "latitude", "longitude"])
)

In [76]:
# `parallel_scraping` already stores a scalar square label, so indexing it
# unconditionally with [0] raises on a fresh run. Only unwrap values that are
# still sequence-like (results produced by an older version of the scraper),
# which also makes this cell safe to re-run.
df["square"] = df["square"].apply(
    lambda sq: sq[0] if isinstance(sq, (list, tuple, np.ndarray, pd.Index)) else sq
)

In [78]:
# `parallel_scraping` already stores the category name as a string, so an
# unconditional [0] would keep only its first character (and break the merge
# cell that expects full category names). Only unwrap list-like values.
df["category"] = df["category"].apply(
    lambda cat: cat[0] if isinstance(cat, (list, tuple)) else cat
)

In [86]:
# Number of POIs per (square, category). Reindexing the columns guarantees one
# column per configured category even when no POI of that category was found;
# without it the "<cat>_new" lookup below raises KeyError for such a category.
poi_counts = df.pivot_table(
    index="square", columns="category", values="name", aggfunc="count", fill_value=0
).reindex(columns=list(categories.keys()), fill_value=0)

# The category columns exist on both sides, so the counts arrive as "<cat>_new".
squares = pd.merge(squares, poi_counts, left_index=True, right_index=True, how="outer", suffixes=("", "_new"))

# Add the new counts to the running totals (squares without POIs get NaN from
# the merge, hence the fillna) and drop the helper columns again.
for cat in categories.keys():
    squares[cat] = squares[cat] + squares[cat + "_new"].fillna(0)
    squares = squares.drop(columns=cat + "_new")


In [97]:
# Export only the squares where at least one category count is non-zero.
(
    squares
    .loc[lambda sq: sq[list(categories)].ne(0).any(axis=1)]
    .to_csv("squares_filtered.csv", index=False)
)

In [88]:
# Export the complete grid (all squares); the index is not written.
squares.to_csv("squares.csv", index=False)