## Fetch brand locations

In [1]:
import requests
import pandas as pd
import time
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

API_KEY = "AIzaSyC1y0zKO0nnY6OOlUMs5ciN2EKygF8xAuM"

NEARBY_SEARCH_URL = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"

# List of Brands
brands = ["Starbucks", "McDonald's", "Walmart", "Target"]

# USA Boundaries
USA_BOUNDS = {
    "lat_min": 24.5, "lat_max": 49.5,
    "lon_min": -125.0, "lon_max": -66.9
}

LAT_STEP = 1.0
LON_STEP = 1.0
SEARCH_RADIUS = 75000

def get_pois(brand, lat, lon):
    params = {
        "location": f"{lat},{lon}",
        "radius": SEARCH_RADIUS,
        "keyword": brand,
        "key": API_KEY
    }

    all_results = []

    while True:
        response = requests.get(NEARBY_SEARCH_URL, params=params)
        data = response.json()

        if "results" in data:
            for place in data["results"]:
                all_results.append({
                    "brand": brand,
                    "name": place.get("name"),
                    "latitude": place["geometry"]["location"]["lat"],
                    "longitude": place["geometry"]["location"]["lng"],
                    "address": place.get("vicinity", "N/A"),
                    "place_id": place.get("place_id"),
                    "rating": place.get("rating", "N/A"),
                    "user_ratings_total": place.get("user_ratings_total", "N/A")
                })

        next_page_token = data.get("next_page_token")
        if not next_page_token:
            break

        params["pagetoken"] = next_page_token
        time.sleep(2)

    return all_results

def generate_grid(lat_min, lat_max, lon_min, lon_max):
    grid_points = []
    lat = lat_min
    while lat <= lat_max:
        lon = lon_min
        while lon <= lon_max:
            grid_points.append((lat, lon))
            lon += LON_STEP
        lat += LAT_STEP
    return grid_points

poi_data = []
grid_points = generate_grid(USA_BOUNDS["lat_min"], USA_BOUNDS["lat_max"], USA_BOUNDS["lon_min"], USA_BOUNDS["lon_max"])

def scrape_location(lat_lon):
    lat, lon = lat_lon
    local_poi_data = []
    for brand in brands:
        local_poi_data.extend(get_pois(brand, lat, lon))
    return local_poi_data

with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(tqdm(executor.map(scrape_location, grid_points), total=len(grid_points)))

for result in results:
    poi_data.extend(result)

df = pd.DataFrame(poi_data).drop_duplicates(subset=["place_id"])

# df.to_csv("data.csv", index=False)


  7%|██▋                                     | 104/1534 [00:56<13:02,  1.83it/s]


KeyboardInterrupt: 

In [3]:
df.to_csv("data.csv", index=False)

NameError: name 'df' is not defined

## Fetch restaurants and cafes

In [1]:
import requests
import pandas as pd
import time
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

API_KEY = "AIzaSyB22-hL86l-7Adt3Hf93k932Z_PnWhtBWQ"

NEARBY_SEARCH_URL = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"

USA_BOUNDS = {
    "lat_min": 24.5, "lat_max": 49.5,  # Continental USA latitude range
    "lon_min": -125.0, "lon_max": -66.9  # Continental USA longitude range
}

LAT_STEP = 0.5
LON_STEP = 0.5
SEARCH_RADIUS = 75000  # Increased from 50 km to 75 km to cover more area

# Function to get restaurants in a specific lat/lon area
def get_restaurants(lat, lon):
    params = {
        "location": f"{lat},{lon}",
        "radius": SEARCH_RADIUS,
        "type": "coffee",  # Search for restaurants
        "key": API_KEY
    }

    all_results = []

    while True:
        response = requests.get(NEARBY_SEARCH_URL, params=params)
        data = response.json()

        if "results" in data:
            for place in data["results"]:
                all_results.append({
                    "name": place.get("name"),
                    "latitude": place["geometry"]["location"]["lat"],
                    "longitude": place["geometry"]["location"]["lng"],
                    "address": place.get("vicinity", "N/A"),
                    "place_id": place.get("place_id"),
                    "rating": place.get("rating", "N/A"),
                    "user_ratings_total": place.get("user_ratings_total", "N/A")
                })

        next_page_token = data.get("next_page_token")
        if not next_page_token:
            break

        params["pagetoken"] = next_page_token
        time.sleep(2)

    return all_results

def generate_grid(lat_min, lat_max, lon_min, lon_max):
    grid_points = []
    lat = lat_min
    while lat <= lat_max:
        lon = lon_min
        while lon <= lon_max:
            grid_points.append((lat, lon))
            lon += LON_STEP
        lat += LAT_STEP
    return grid_points

In [2]:
# Fetch restaurants for all locations in the USA
poi_data = []
grid_points = generate_grid(USA_BOUNDS["lat_min"], USA_BOUNDS["lat_max"], USA_BOUNDS["lon_min"], USA_BOUNDS["lon_max"])

def scrape_location(lat_lon):
    lat, lon = lat_lon
    return get_restaurants(lat, lon)

with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(tqdm(executor.map(scrape_location, grid_points), total=len(grid_points)))

for result in results:
    poi_data.extend(result)

df = pd.DataFrame(poi_data).drop_duplicates(subset=["place_id"])


100%|██████████| 5967/5967 [1:07:08<00:00,  1.48it/s]


In [3]:
df.shape

(165157, 7)

In [4]:
df.to_csv('coffee.csv',index=False)