This is the first script that must be run; it outputs which gmap_ids correspond to restaurants.

In [2]:
import json
from collections import Counter

file_path = 'Data/meta-District_of_Columbia.json'


def iter_categories(lines):
    """Yield every category label found in an iterable of JSON-lines text.

    Each non-blank line is expected to hold one JSON object whose optional
    "category" field is a list of label strings.

    Args:
        lines: Iterable of text lines (for example an open file handle).

    Yields:
        Each label of each record's "category" list, in file order.
        Labels are repeated once per record that carries them.

    Lines that are blank, are not valid JSON, or do not decode to an object
    are skipped, matching how the later scanning cells treat bad lines.
    """
    for line in lines:
        if not line.strip():
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(record, dict):
            continue
        # `or []` covers both a missing key and an explicit JSON null.
        labels = record.get("category") or []
        # Only lists are treated as category collections; iterating a bare
        # string would count its individual characters as categories.
        if isinstance(labels, list):
            yield from labels


# JSON text is UTF-8; without an explicit encoding open() uses the platform
# locale (e.g. cp1252 on Windows), which garbles non-ASCII labels such as
# "Crêperie" and makes them impossible to match later on.
with open(file_path, 'r', encoding='utf-8') as f:
    categories = list(iter_categories(f))

# Calculate frequencies and print the results sorted by most common
print("Category Frequencies:")
for category, count in Counter(categories).most_common():
    print(f"- {category}: {count}")

Category Frequencies:
- Restaurant: 1283
- Bar: 642
- Coffee shop: 408
- Sandwich shop: 369
- American restaurant: 366
- Cafe: 359
- Fast food restaurant: 320
- Non-profit organization: 313
- Takeout Restaurant: 307
- Tourist attraction: 302
- Breakfast restaurant: 248
- Caterer: 246
- Park: 213
- Pizza restaurant: 194
- Beauty salon: 186
- Clothing store: 173
- Convenience store: 173
- Italian restaurant: 172
- Liquor store: 171
- Apartment building: 163
- Bakery: 162
- Hair salon: 159
- Grocery store: 156
- Brunch restaurant: 155
- Hotel: 143
- Event venue: 142
- Mexican restaurant: 142
- Nail salon: 141
- Church: 139
- Deli: 127
- Apartment complex: 125
- Seafood restaurant: 124
- ATM: 123
- Lounge: 121
- Cocktail bar: 120
- Parking garage: 119
- Hamburger restaurant: 114
- Barber shop: 111
- Chinese restaurant: 107
- Embassy: 106
- Condominium complex: 103
- Historical landmark: 102
- Gym: 102
- Bar & grill: 101
- Espresso bar: 101
- Women's clothing store: 101
- Asian restaurant: 

In [3]:
import json
import os
import sys

# --- Configuration ---
# Everything this cell generates is written beneath OUTPUT_DIR. Create the
# folder up front; exist_ok=True makes this a no-op when it is already there.
OUTPUT_DIR = 'Output_Data'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Input: the business metadata file that the scan below reads line by line.
META_FILE_PATH = 'Data/meta-District_of_Columbia.json'

# Output: JSON file that receives the list of matching gmap_ids.
OUTPUT_FILE = os.path.join(OUTPUT_DIR, 'restaurant_gmap_ids.json')

# --- Define Filter Categories ---
# Allow-list of category labels that mark a business as a food/drink place.
# The scan below keeps a record when its 'category' list shares at least one
# label with this set. Membership is an exact, case-sensitive string match, so
# each entry must be spelled exactly as it appears in the metadata file
# (note the mixed capitalisation, e.g. "Takeout Restaurant" vs "Pizza restaurant").
# Using a set for O(1) lookup speed
# NOTE(review): a few entries are only loosely food-service (e.g. "Internet cafe",
# "Karaoke", "BBQ area", "Wine wholesaler and importer") — confirm they are intended.
RESTAURANT_CATEGORIES = {
    "Restaurant",
    "Bar",
    "Coffee shop",
    "Sandwich shop",
    "American restaurant",
    "Cafe",
    "Fast food restaurant",
    "Takeout Restaurant",
    "Breakfast restaurant",
    "Pizza restaurant",
    "Italian restaurant",
    "Bakery",
    "Brunch restaurant",
    "Mexican restaurant",
    "Deli",
    "Seafood restaurant",
    "Lounge",
    "Cocktail bar",
    "Hamburger restaurant",
    "Chinese restaurant",
    "Bar & grill",
    "Espresso bar",
    "Asian restaurant",
    "Delivery Restaurant",
    "Salad shop",
    "Mediterranean restaurant",
    "Vegetarian restaurant",
    "Ice cream shop",
    "Wine bar",
    "Sports bar",
    "Vegan restaurant",
    "Sushi restaurant",
    "Internet cafe",
    "New American restaurant",
    "Pizza delivery",
    "Indian restaurant",
    "Japanese restaurant",
    "Night club",
    "Chicken restaurant",
    "Pizza Takeout",
    "Dessert shop",
    "Thai restaurant",
    "Health food restaurant",
    "French restaurant",
    "Family restaurant",
    "Steak house",
    "Asian fusion restaurant",
    "Juice shop",
    "Fine dining restaurant",
    "Latin American restaurant",
    "Bagel shop",
    "Chicken wings restaurant",
    "Pub",
    "Dessert restaurant",
    "Lunch restaurant",
    "Taco restaurant",
    "Barbecue restaurant",
    "Ethiopian restaurant",
    "Korean restaurant",
    "Bistro",
    "Donut shop",
    "Middle Eastern restaurant",
    "Diner",
    "Traditional American restaurant",
    "Pastry shop",
    "Greek restaurant",
    "Vietnamese restaurant",
    "Tea house",
    "Gluten-free restaurant",
    "Tex-Mex restaurant",
    "Southern restaurant (US)",
    "European restaurant",
    "Gastropub",
    "Hookah bar",
    "Spanish restaurant",
    "Cake shop",
    "Halal restaurant",
    "Soup restaurant",
    "Salvadoran restaurant",
    "Southeast Asian restaurant",
    "Caribbean restaurant",
    "Brewery",
    "Soup shop",
    "Oyster bar restaurant",
    "African restaurant",
    "Ramen restaurant",
    "Peruvian restaurant",
    "Cheesesteak restaurant",
    "Gay bar",
    "Bubble tea store",
    "Hoagie restaurant",
    "Club",
    "Frozen yogurt shop",
    "Hot dog restaurant",
    "Tapas restaurant",
    "Eclectic restaurant",
    "Beer garden",
    "Noodle shop",
    "Burrito restaurant",
    "Hot dog stand",
    "Food court",
    "Cookie shop",
    "Irish pub",
    "Pasta shop",
    "Jamaican restaurant",
    "Hawaiian restaurant",
    "South Asian restaurant",
    "Brewpub",
    "Cuban restaurant",
    "Lebanese restaurant",
    "Pakistani restaurant",
    "Food and drink",
    "Small plates restaurant",
    "Snack bar",
    "Afghani restaurant",
    "Soul food restaurant",
    "Cafeteria",
    "Tapas bar",
    "Buffet restaurant",
    "Beer hall",
    "Chinese takeaway",
    "Distillery",
    "Belgian restaurant",
    "Falafel restaurant",
    "Pho restaurant",
    "Chocolate shop",
    "Brasserie",
    "German restaurant",
    "Crêperie",
    "Nepalese restaurant",
    "Moroccan restaurant",
    "Brazilian restaurant",
    "Irish restaurant",
    "Eritrean restaurant",
    "Korean barbecue restaurant",
    "Jazz club",
    "Sushi takeaway",
    "Modern European restaurant",
    "Dim sum restaurant",
    "Neapolitan restaurant",
    "Dumpling restaurant",
    "Puerto Rican restaurant",
    "Venezuelan restaurant",
    "Box lunch supplier",
    "Karaoke",
    "Authentic Japanese restaurant",
    "Grill",
    "Organic restaurant",
    "Pie shop",
    "Continental restaurant",
    "Kosher restaurant",
    "Fried chicken takeaway",
    "Vegetarian cafe and deli",
    "Chocolate cafe",
    "Gay night club",
    "Argentinian restaurant",
    "Coffee roasters",
    "Central American restaurant",
    "Northern Italian restaurant",
    "Southern Italian restaurant",
    "Catering",
    "British restaurant",
    "East African restaurant",
    "Southwestern restaurant (US)",
    "Chinese noodle restaurant",
    "Laotian restaurant",
    "West African restaurant",
    "Wedding bakery",
    "Izakaya restaurant",
    "Scandinavian restaurant",
    "Cupcake shop",
    "Mid-Atlantic restaurant (US)",
    "Cider bar",
    "Sri Lankan restaurant",
    "Burmese restaurant",
    "Tiki bar",
    "French steakhouse restaurant",
    "Piano bar",
    "Raw food restaurant",
    "Sichuan restaurant",
    "Modern British restaurant",
    "New England restaurant",
    "Tuscan restaurant",
    "Modern Indian restaurant",
    "Fish & chips restaurant",
    "Ethnic restaurant",
    "Eastern European restaurant",
    "Russian restaurant",
    "Salsa bar",
    "BBQ area",
    "Meat dish restaurant",
    "Açaí shop",
    "Soft drinks shop",
    "Childrens cafe",
    "Mobile caterer",
    "Traditional restaurant",
    "Mongolian barbecue restaurant",
    "Wine wholesaler and importer",
    "Austrian restaurant",
    "Jewish restaurant",
    "Chophouse restaurant",
    "Gyro restaurant",
    "Restaurant or cafe",
    "Down home cooking restaurant",
    "Dog cafe",
    "Contemporary Louisiana restaurant",
    "Patisserie",
    "Serbian restaurant",
    "Dance restaurant",
    "Colombian restaurant",
    "Haute French restaurant",
    "Indian food",
    "Polynesian restaurant",
    "Nuevo Latino restaurant",
    "Czech restaurant",
    "Israeli restaurant",
    "Polish restaurant",
    "Fish and seafood restaurant",
    "Japanese food",
    "Native American restaurant",
    "Georgian restaurant",
    "Dominican restaurant",
    "Seafood donburi restaurant",
    "Tonkatsu restaurant",
    "Australian restaurant",
    "Guatemalan restaurant",
    "Syrian restaurant",
    "Yakitori restaurant",
    "Persian restaurant",
    "South American restaurant",
    "Portuguese restaurant",
    "Modern French restaurant",
    "Filipino restaurant",
    "South African restaurant",
    "Chicken shop"
}

# --- Execution ---
# Stream the metadata file (one JSON object per line) and collect the gmap_id
# of every business whose 'category' list contains at least one label from
# RESTAURANT_CATEGORIES, then save the collected IDs as a JSON array.
print(f"Scanning {META_FILE_PATH}...")
restaurant_ids = []
total_processed = 0   # non-blank lines seen, whether or not they parsed
skipped_lines = 0     # lines that were not a valid JSON object

try:
    # JSON text is UTF-8; without an explicit encoding open() uses the platform
    # locale (e.g. cp1252 on Windows), which garbles non-ASCII labels such as
    # "Crêperie" / "Açaí shop" so they never match RESTAURANT_CATEGORIES.
    with open(META_FILE_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue

            total_processed += 1
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                record = None

            if isinstance(record, dict):
                gmap_id = record.get('gmap_id')
                categories = record.get('category')

                # Check 1: Must have an ID and a non-empty categories list
                if gmap_id and categories and isinstance(categories, list):
                    # Check 2: keep the place if it has ANY restaurant category,
                    # i.e. the two collections are not disjoint.
                    if not RESTAURANT_CATEGORIES.isdisjoint(categories):
                        restaurant_ids.append(gmap_id)
            else:
                # Malformed JSON or a non-object value (null, number, list):
                # skip it, but keep count so data problems stay visible.
                skipped_lines += 1

            # Progress is reported for every line, including skipped ones.
            if total_processed % 10000 == 0:
                print(f"Processed {total_processed} businesses...", end='\r')

except FileNotFoundError:
    # Only the metadata read is guarded here, so this message can never be
    # triggered by a problem with the output file.
    print(f"Error: Metadata file not found at {META_FILE_PATH}")
    sys.exit(1)

print(f"\nScanning complete. Processed {total_processed} total businesses.")
if skipped_lines:
    print(f"Skipped {skipped_lines} lines that were not valid JSON objects.")
print(f"Found {len(restaurant_ids)} businesses matching restaurant categories.")

# Save to JSON
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    json.dump(restaurant_ids, f)

print(f"Saved restaurant IDs to: {OUTPUT_FILE}")

Scanning Data/meta-District_of_Columbia.json...
Processed 10000 businesses...
Scanning complete. Processed 11060 total businesses.
Found 3565 businesses matching restaurant categories.
Saved restaurant IDs to: Output_Data\restaurant_gmap_ids.json


In [5]:
#Category search with blacklist

import json
import os
import sys

# --- Configuration ---
# Everything this cell generates is written beneath OUTPUT_DIR. Create the
# folder up front; exist_ok=True makes this a no-op when it is already there.
OUTPUT_DIR = 'Output_Data'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Input: the business metadata file that the scan below reads line by line.
META_FILE_PATH = 'Data/meta-District_of_Columbia.json'

# Output: JSON file that receives the blacklist-filtered list of gmap_ids.
OUTPUT_FILE = os.path.join(OUTPUT_DIR, 'restaurant_gmap_ids_1215.json')

# --- Define Filter Categories ---

# 1. POSITIVE LIST: If a place has these, it MIGHT be a restaurant.
# A record passes this stage when its 'category' list shares at least one label
# with the set; matching is an exact, case-sensitive string comparison.
# This assignment replaces the RESTAURANT_CATEGORIES defined in the earlier
# cell. Compared with that version, it omits "Food and drink" and "Catering";
# all other labels are the same.
RESTAURANT_CATEGORIES = {
    "Restaurant", "Bar", "Coffee shop", "Sandwich shop", "American restaurant", "Cafe",
    "Fast food restaurant", "Takeout Restaurant", "Breakfast restaurant", "Pizza restaurant",
    "Italian restaurant", "Bakery", "Brunch restaurant", "Mexican restaurant", "Deli",
    "Seafood restaurant", "Lounge", "Cocktail bar", "Hamburger restaurant", "Chinese restaurant",
    "Bar & grill", "Espresso bar", "Asian restaurant", "Delivery Restaurant", "Salad shop",
    "Mediterranean restaurant", "Vegetarian restaurant", "Ice cream shop", "Wine bar",
    "Sports bar", "Vegan restaurant", "Sushi restaurant", "Internet cafe", "New American restaurant",
    "Pizza delivery", "Indian restaurant", "Japanese restaurant", "Night club", "Chicken restaurant",
    "Pizza Takeout", "Dessert shop", "Thai restaurant", "Health food restaurant", "French restaurant",
    "Family restaurant", "Steak house", "Asian fusion restaurant", "Juice shop", "Fine dining restaurant",
    "Latin American restaurant", "Bagel shop", "Chicken wings restaurant", "Pub", "Dessert restaurant",
    "Lunch restaurant", "Taco restaurant", "Barbecue restaurant", "Ethiopian restaurant",
    "Korean restaurant", "Bistro", "Donut shop", "Middle Eastern restaurant", "Diner",
    "Traditional American restaurant", "Pastry shop", "Greek restaurant", "Vietnamese restaurant",
    "Tea house", "Gluten-free restaurant", "Tex-Mex restaurant", "Southern restaurant (US)",
    "European restaurant", "Gastropub", "Hookah bar", "Spanish restaurant", "Cake shop",
    "Halal restaurant", "Soup restaurant", "Salvadoran restaurant", "Southeast Asian restaurant",
    "Caribbean restaurant", "Brewery", "Soup shop", "Oyster bar restaurant", "African restaurant",
    "Ramen restaurant", "Peruvian restaurant", "Cheesesteak restaurant", "Gay bar",
    "Bubble tea store", "Hoagie restaurant", "Club", "Frozen yogurt shop", "Hot dog restaurant",
    "Tapas restaurant", "Eclectic restaurant", "Beer garden", "Noodle shop", "Burrito restaurant",
    "Hot dog stand", "Food court", "Cookie shop", "Irish pub", "Pasta shop", "Jamaican restaurant",
    "Hawaiian restaurant", "South Asian restaurant", "Brewpub", "Cuban restaurant", "Lebanese restaurant",
    "Pakistani restaurant", "Small plates restaurant", "Snack bar", "Afghani restaurant",
    "Soul food restaurant", "Cafeteria", "Tapas bar", "Buffet restaurant", "Beer hall",
    "Chinese takeaway", "Distillery", "Belgian restaurant", "Falafel restaurant", "Pho restaurant",
    "Chocolate shop", "Brasserie", "German restaurant", "Crêperie", "Nepalese restaurant",
    "Moroccan restaurant", "Brazilian restaurant", "Irish restaurant", "Eritrean restaurant",
    "Korean barbecue restaurant", "Jazz club", "Sushi takeaway", "Modern European restaurant",
    "Dim sum restaurant", "Neapolitan restaurant", "Dumpling restaurant", "Puerto Rican restaurant",
    "Venezuelan restaurant", "Box lunch supplier", "Karaoke", "Authentic Japanese restaurant",
    "Grill", "Organic restaurant", "Pie shop", "Continental restaurant", "Kosher restaurant",
    "Fried chicken takeaway", "Vegetarian cafe and deli", "Chocolate cafe", "Gay night club",
    "Argentinian restaurant", "Coffee roasters", "Central American restaurant",
    "Northern Italian restaurant", "Southern Italian restaurant", "British restaurant",
    "East African restaurant", "Southwestern restaurant (US)", "Chinese noodle restaurant",
    "Laotian restaurant", "West African restaurant", "Wedding bakery", "Izakaya restaurant",
    "Scandinavian restaurant", "Cupcake shop", "Mid-Atlantic restaurant (US)", "Cider bar",
    "Sri Lankan restaurant", "Burmese restaurant", "Tiki bar", "French steakhouse restaurant",
    "Piano bar", "Raw food restaurant", "Sichuan restaurant", "Modern British restaurant",
    "New England restaurant", "Tuscan restaurant", "Modern Indian restaurant",
    "Fish & chips restaurant", "Ethnic restaurant", "Eastern European restaurant",
    "Russian restaurant", "Salsa bar", "BBQ area", "Meat dish restaurant", "Açaí shop",
    "Soft drinks shop", "Childrens cafe", "Mobile caterer", "Traditional restaurant",
    "Mongolian barbecue restaurant", "Wine wholesaler and importer", "Austrian restaurant",
    "Jewish restaurant", "Chophouse restaurant", "Gyro restaurant", "Restaurant or cafe",
    "Down home cooking restaurant", "Dog cafe", "Contemporary Louisiana restaurant",
    "Patisserie", "Serbian restaurant", "Dance restaurant", "Colombian restaurant",
    "Haute French restaurant", "Indian food", "Polynesian restaurant", "Nuevo Latino restaurant",
    "Czech restaurant", "Israeli restaurant", "Polish restaurant", "Fish and seafood restaurant",
    "Japanese food", "Native American restaurant", "Georgian restaurant", "Dominican restaurant",
    "Seafood donburi restaurant", "Tonkatsu restaurant", "Australian restaurant",
    "Guatemalan restaurant", "Syrian restaurant", "Yakitori restaurant", "Persian restaurant",
    "South American restaurant", "Portuguese restaurant", "Modern French restaurant",
    "Filipino restaurant", "South African restaurant", "Chicken shop"
}

# 2. NEGATIVE LIST (Blacklist): If a place has ANY of these, it is NOT a primary restaurant.
# This filters out government buildings, malls, cinemas, and gas stations that sell food.
# The scan below applies it only to records that already matched the positive
# list: a single shared label is enough to drop the record. Matching is an
# exact, case-sensitive string comparison.
# NOTE(review): broad labels such as "Tourist attraction" or "Historical landmark"
# will also drop genuine restaurants that carry them — confirm this trade-off is intended.
EXCLUDED_CATEGORIES = {
    # Government & Offices
    "Government office", "Federal government office", "State government office",
    "City government office", "Local government office", "Courthouse", "Association or organization",
    "Corporate office", "Office space agency", "Non-profit organization",

    # Entertainment Venues (Cinemas often serve food but aren't restaurants)
    "Movie theater", "Cinema", "Drive-in movie theater", "Performing arts theater",
    "Amusement park", "Tourist attraction", "Museum", "Art gallery", "Historical landmark",
    "Casino", "Bowling alley",

    # Retail & Shopping (Malls and stores often have food courts/cafes)
    "Shopping mall", "Shopping center", "Outlet mall", "Plaza", "Department store",
    "Convenience store", "Gas station", "Grocery store", "Supermarket", "Liquor store",
    "Pharmacy", "Drugstore", "Discount store", "Warehouse club",

    # Lodging & Transport (Hotels often list 'Restaurant' as a secondary category for their internal dining)
    # Note: Dedicated restaurants INSIDE hotels usually have their own separate GMap ID.
    # The GMap ID for the hotel building itself should be excluded.
    "Hotel", "Lodging", "Motel", "Hostel", "Airport", "Train station", "Bus station",

    # Education & Health
    "School", "University", "College", "Education center", "Hospital", "Medical center",
    "Doctor", "Dentist", "Gym", "Fitness center"
}

# --- Execution ---
# Stream the metadata file (one JSON object per line). A business is kept when
# its 'category' list matches at least one label in RESTAURANT_CATEGORIES and
# none in EXCLUDED_CATEGORIES; the kept gmap_ids are saved as a JSON array.
print(f"Scanning {META_FILE_PATH}...")
restaurant_ids = []
total_processed = 0   # non-blank lines seen, whether or not they parsed
excluded_count = 0    # positive matches that the blacklist rejected
skipped_lines = 0     # lines that were not a valid JSON object

try:
    # JSON text is UTF-8; without an explicit encoding open() uses the platform
    # locale (e.g. cp1252 on Windows), which garbles non-ASCII labels such as
    # "Crêperie" / "Açaí shop" so they never match RESTAURANT_CATEGORIES.
    with open(META_FILE_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue

            total_processed += 1
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                record = None

            if isinstance(record, dict):
                gmap_id = record.get('gmap_id')
                categories = record.get('category')

                if gmap_id and categories and isinstance(categories, list):
                    categories_set = set(categories)

                    # Step 1: Must match at least one POSITIVE category
                    if not RESTAURANT_CATEGORIES.isdisjoint(categories_set):

                        # Step 2: Must NOT match any NEGATIVE category
                        if EXCLUDED_CATEGORIES.isdisjoint(categories_set):
                            restaurant_ids.append(gmap_id)
                        else:
                            # Useful for debugging: count how many false positives we catch
                            excluded_count += 1
            else:
                # Malformed JSON or a non-object value (null, number, list):
                # skip it, but keep count so data problems stay visible.
                skipped_lines += 1

            # Progress is reported for every line, including skipped ones.
            if total_processed % 10000 == 0:
                print(f"Processed {total_processed} businesses...", end='\r')

except FileNotFoundError:
    # Only the metadata read is guarded here, so this message can never be
    # triggered by a problem with the output file.
    print(f"Error: Metadata file not found at {META_FILE_PATH}")
    sys.exit(1)

print(f"\nScanning complete. Processed {total_processed} total businesses.")
if skipped_lines:
    print(f"Skipped {skipped_lines} lines that were not valid JSON objects.")
print(f"Filtered out {excluded_count} businesses (e.g., cinemas, offices, gas stations).")
print(f"Found {len(restaurant_ids)} valid restaurants.")

# Save to JSON
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    json.dump(restaurant_ids, f)

print(f"Saved restaurant IDs to: {OUTPUT_FILE}")

Scanning Data/meta-District_of_Columbia.json...
Processed 10000 businesses...
Scanning complete. Processed 11060 total businesses.
Filtered out 68 businesses (e.g., cinemas, offices, gas stations).
Found 3492 valid restaurants.
Saved restaurant IDs to: Output_Data\restaurant_gmap_ids_1215.json


In [None]:
# NOTE(review): leftover scratch cell (never executed, per its `In [None]` marker).
# The bare literal below writes no files and defines no names — consider deleting the cell.
5