# Data Collection

**Category:**
1. Tourist attractions
2. Hotels
3. Restaurants

T&C: Filter data to 'user_ratings_total'>= 300


In [1]:
from dotenv import load_dotenv
import os

load_dotenv()  

API_KEY = os.getenv("API_KEY")

In [1]:
import requests
import time
import pandas as pd
import json
import os
from datetime import datetime

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Kuala Lumpur

## Tourist attractions

### Phase 1: Text search

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Kuala Lumpur, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/kl_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 KL complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: tourist attractions in Kuala Lumpur, Malaysia
  - Page 1 complete. Total IDs: 18
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 32
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 40
Finished search for tourist attractions.
Starting search for: museums in Kuala Lumpur, Malaysia
  - Page 1 complete. Total IDs: 51
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 54
Finished search for museums.
Starting search for: parks in Kuala Lumpur, Malaysia
  - Page 1 complete. Total IDs: 66
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 75
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 84
Finished search for parks.
Starting search for: historical sites in Kuala Lumpur, Malaysia
  - Page 1 complete. Total IDs: 85
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 85
  Waiting 3s for next_page_token...
  - Page 3 comple

### Phase 2: Place details

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/kl_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/kl_tourist_attractions_place_data'
batch_size = 100 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    # FIX 1: Removed ** from the fields string
    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        # This gives you a list of strings, like ["Monday: 9 AM - 5 PM", ...]
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 225
Total IDs already cached: 0
[1/225] Fetching details for ID: ChIJoxNlLcpLzDERzE5fun6q7v0
[2/225] Fetching details for ID: ChIJy--G_eg3zDER6tu3Sj_86eM
[3/225] Fetching details for ID: ChIJ3_8kzZlJzDERNUD82tloEXc
[4/225] Fetching details for ID: ChIJO4xxANGYyTERVBNRkTxDfWs
[5/225] Fetching details for ID: ChIJm1A1iSw2zDERtJ-nsFBslu0
[6/225] Fetching details for ID: ChIJJTHt3YhMzDERvM3wesdMZds
[7/225] Fetching details for ID: ChIJj2n_9tCYyTERh0_RwmJa6ZA
[8/225] Fetching details for ID: ChIJYYacASk2zDERh3Iqf0NTR1g
[9/225] Fetching details for ID: ChIJ26dohvpLzDERZYxkRNwXzqA
[10/225] Fetching details for ID: ChIJE5JTkCg2zDERhiN5NLiq2R8
[11/225] Fetching details for ID: ChIJqZg2wMVJzDERdw_L8rBtIfo
[12/225] Fetching details for ID: ChIJR-PQ0m84zDERfECwtBmV0Yk
[13/225] Fetching details for ID: ChIJVXXIZcU3zDERkpdEfCQBDT8
[14/225] Fetching details for ID: ChIJK8N_CJ82zDERqpN2Iubh148
[15/225] Fetching details for ID: ChIJFxT1luNLzDERBAD-5RkuVRM
[16/225] F

## Restaurant

### Phase 1: Text search

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Kuala Lumpur, Malaysia"
search_keywords =   [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/kl_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 KL complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Kuala Lumpur, Malaysia
  - Page 1 complete. Total IDs: 19
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 38
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 56
Finished search for restaurant.
Starting search for: cafe in Kuala Lumpur, Malaysia
  - Page 1 complete. Total IDs: 74
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 91
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 108
Finished search for cafe.
Starting search for: bar in Kuala Lumpur, Malaysia
  - Page 1 complete. Total IDs: 123
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 131
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 142
Finished search for bar.
Starting search for: food in Kuala Lumpur, Malaysia
  - Page 1 complete. Total IDs: 152
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 162
  Waiting 3s for

### Phase 2: Place details

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/kl_stage2_restaurants_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/kl_restaurants_place_data'
batch_size = 100 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    # FIX 1: Removed ** from the fields string
    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        # This gives you a list of strings, like ["Monday: 9 AM - 5 PM", ...]
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 549
Total IDs already cached: 0
[1/549] Fetching details for ID: ChIJsbWFbLVJzDEREWLEXfJ2Yqo
[2/549] Fetching details for ID: ChIJQWz3Xiw2zDERbEXEwVEsBto
[3/549] Fetching details for ID: ChIJg3SVybo3zDERADqEe4aL-VA
[4/549] Fetching details for ID: ChIJz8aGQgU3zDER4cmfVbrmVPw
[5/549] Fetching details for ID: ChIJzeuypKJJzDERHXhWd6KcFNU
[6/549] Fetching details for ID: ChIJT1_CUSw2zDERQ7_nbpyp41s
[7/549] Fetching details for ID: ChIJyZajZxw3zDERjATBdx3JelQ
[8/549] Fetching details for ID: ChIJXRIQbiU2zDERe0qOT6pSqJE
[9/549] Fetching details for ID: ChIJO46b3KRJzDER5gWHHPiswpk
[10/549] Fetching details for ID: ChIJ53qfq7Y3zDERipJ2QQuw140
[11/549] Fetching details for ID: ChIJZQTyPbM3zDERCM3DbnjRclQ
[12/549] Fetching details for ID: ChIJZToQyFQ2zDERJtoQvmFHc7g
[13/549] Fetching details for ID: ChIJ95uEytI5zDERaxfmDJdBZUI
[14/549] Fetching details for ID: ChIJ5__rVPFLzDER1rahBIC4JGI
[15/549] Fetching details for ID: ChIJETEpoT03zDER9pThuSDE2kY
[16/549] F

## Hotel

### Phase 1: Text search

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Kuala Lumpur, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
"motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/kl_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 KL complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: hotel in Kuala Lumpur, Malaysia
  - Page 1 complete. Total IDs: 18
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 32
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 46
Finished search for hotel.
Starting search for: resort in Kuala Lumpur, Malaysia
  - Page 1 complete. Total IDs: 51
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 52
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 57
Finished search for resort.
Starting search for: airbnb in Kuala Lumpur, Malaysia
  - Page 1 complete. Total IDs: 59
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 61
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 67
Finished search for airbnb.
Starting search for: homestay in Kuala Lumpur, Malaysia
Error in search for 'homestay': ZERO_RESULTS
Finished search for homestay.
Starting search for: guesthouse in Kuala Lumpur, 

### Phase 2: Place details

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/kl_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/kl_hotel_place_data'
batch_size = 100 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    # FIX 1: Removed ** from the fields string
    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 106
Total IDs already cached: 0
[1/106] Fetching details for ID: ChIJadCDLNBJzDER2ZPHt4dB9xY
[2/106] Fetching details for ID: ChIJh5ZUHM1JzDERGcRD2RkB0zs
[3/106] Fetching details for ID: ChIJSyjlVuZJzDER6FHsJ9Cd1oM
[4/106] Fetching details for ID: ChIJkxav4Sk2zDER0zfZeYXlwfY
[5/106] Fetching details for ID: ChIJy7bjBilIzDER-WVjAcqIOe0
[6/106] Fetching details for ID: ChIJi8FtadJJzDERR8Ay7h-xhag
[7/106] Fetching details for ID: ChIJgem-G9NJzDERyYZ0w6DIOuc
[8/106] Fetching details for ID: ChIJ34bYjtU3zDERVyctdbXli-I
[9/106] Fetching details for ID: ChIJk-_x18FJzDER0ATpcoL2uI0
[10/106] Fetching details for ID: ChIJOWfiUtBJzDERAr6WwsWxRnk
[11/106] Fetching details for ID: ChIJTf0--Cs2zDERFbJ1c8OQXYc
[12/106] Fetching details for ID: ChIJodR-Vi1IzDERzeOZjmQPB-w
[13/106] Fetching details for ID: ChIJFTWd-us9zDERBiJX7zbDTFk
[14/106] Fetching details for ID: ChIJB8FE89BJzDER_VzFuZAco3Y
[15/106] Fetching details for ID: ChIJ6WzDGFs3zDERhc-OCZdURVM
[16/106] F

# Selangor

## Tourist Attractions

### Phase 1: Text Search

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Selangor, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/selangor_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 KL complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: tourist attractions in Selangor, Malaysia
  - Page 1 complete. Total IDs: 15
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 25
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 37
Finished search for tourist attractions.
Starting search for: museums in Selangor, Malaysia
  - Page 1 complete. Total IDs: 42
Finished search for museums.
Starting search for: parks in Selangor, Malaysia
  - Page 1 complete. Total IDs: 57
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 72
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 82
Finished search for parks.
Starting search for: historical sites in Selangor, Malaysia
  - Page 1 complete. Total IDs: 85
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 90
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 92
Finished search for historical sites.
Starting search for: landmarks 

### Phase 2: Place details

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/selangor_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/selangor_tourist_attractions_place_data'
batch_size = 100 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    # FIX 1: Removed ** from the fields string
    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 279
Total IDs already cached: 0
[1/279] Fetching details for ID: ChIJNcwuNhpQzDERW96z_RhiPv4
[2/279] Fetching details for ID: ChIJb--aZrOszTERQRRydRmx2ms
[3/279] Fetching details for ID: ChIJNXQunadPzDERL2YwBNkgTSI
[4/279] Fetching details for ID: ChIJJ5m5fi9NzDERfJRxhABGfBs
[5/279] Fetching details for ID: ChIJmxJP_daKzDERkwyZ-OtL8VE
[6/279] Fetching details for ID: ChIJAz5nvLtLzDERbhlyAgF4bUY
[7/279] Fetching details for ID: ChIJc81a5GtJzDERXKXMtN51ZEg
[8/279] Fetching details for ID: ChIJEahtjIPLzTERUxMVeEyCFqc
[9/279] Fetching details for ID: ChIJ-Q7SA67KzTERHsCzroKfMwo
[10/279] Fetching details for ID: ChIJm1A1iSw2zDERtJ-nsFBslu0
[11/279] Fetching details for ID: ChIJ_Y02esvLzTERV1ypX4RaxlE
[12/279] Fetching details for ID: ChIJFSdoifZpzDER7y36LRwJbOU
[13/279] Fetching details for ID: ChIJXdl9VPm2zTERn0D4oXDHqEU
[14/279] Fetching details for ID: ChIJ_eQULrOszTERuISYlExE51Y
[15/279] Fetching details for ID: ChIJh49nFMpNzDER4qf_TF6n3tc
[16/279] F

## Restaurant

### Phase 1: Text Search

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Selangor, Malaysia"
search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/selangor_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 KL complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Selangor, Malaysia
  - Page 1 complete. Total IDs: 16
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 33
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 46
Finished search for restaurant.
Starting search for: cafe in Selangor, Malaysia
  - Page 1 complete. Total IDs: 63
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 79
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 91
Finished search for cafe.
Starting search for: bar in Selangor, Malaysia
  - Page 1 complete. Total IDs: 103
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 113
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 120
Finished search for bar.
Starting search for: food in Selangor, Malaysia
  - Page 1 complete. Total IDs: 133
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 143
  Waiting 3s for next_page_token.

### Phase 2: Place details

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/selangor_stage2_restaurants_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/selangor_restaurants_place_data'
batch_size = 100 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    # FIX 1: Removed ** from the fields string
    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 549
Total IDs already cached: 0
[1/549] Fetching details for ID: ChIJOS9HNEb4zDERkoNj5Lfav7s
[2/549] Fetching details for ID: ChIJoy-9TBhMzDERAIgST_KXm0c
[3/549] Fetching details for ID: ChIJhSocvf82zDERP78uvCxEons
[4/549] Fetching details for ID: ChIJX0QOZdFMzDER1Xi-vLXRhOU
[5/549] Fetching details for ID: ChIJvzcScan0zDERxMoGdF3PcnA
[6/549] Fetching details for ID: ChIJDV_2X5ZNzDER-hLL_-Be3no
[7/549] Fetching details for ID: ChIJ9wiBHGpNzDERfysi3fZdYQE
[8/549] Fetching details for ID: ChIJr5fucNNbzDERK4omjPY3WC4
[9/549] Fetching details for ID: ChIJIUxS0FBszDERjsqRD_bd8UU
[10/549] Fetching details for ID: ChIJ0b-R9tBZzDERuwOTmikgqFQ
[11/549] Fetching details for ID: ChIJQdzeQ8RMzDEROCVzLscgiX8
[12/549] Fetching details for ID: ChIJobrDichPzDERcI84P8Y2m-w
[13/549] Fetching details for ID: ChIJ-fpxZzwUzDER_uXVJSt51Xo
[14/549] Fetching details for ID: ChIJ-bcMtFZJzDERztua3I0FE2I
[15/549] Fetching details for ID: ChIJMVeo-RpTzDERmgbCMt5fqVA
[16/549] F

## Hotel

### Phase 1: Text Search

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Selangor, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/selangor_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 KL complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: hotel in Selangor, Malaysia
  - Page 1 complete. Total IDs: 16
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 31
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 45
Finished search for hotel.
Starting search for: resort in Selangor, Malaysia
  - Page 1 complete. Total IDs: 62
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 76
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 80
Finished search for resort.
Starting search for: airbnb in Selangor, Malaysia
  - Page 1 complete. Total IDs: 80
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 80
Finished search for airbnb.
Starting search for: homestay in Selangor, Malaysia
Error in search for 'homestay': ZERO_RESULTS
Finished search for homestay.
Starting search for: guesthouse in Selangor, Malaysia
  - Page 1 complete. Total IDs: 82
Finished search for guesthouse.
Starting search

### Phase 2: Place details

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/selangor_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/selangor_hotel_place_data'
batch_size = 100 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    # FIX 1: Removed ** from the fields string
    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 120
Total IDs already cached: 0
[1/120] Fetching details for ID: ChIJTzI4YJqQzDERbANK9ZHQKKY
[2/120] Fetching details for ID: ChIJu9CSAai1zTERbsMDWMe4u5w
[3/120] Fetching details for ID: ChIJE5fKo5pAzDER_ontLL235io
[4/120] Fetching details for ID: ChIJSb6RRk_HzTERl2TvIfvnD4Q
[5/120] Fetching details for ID: ChIJHRMe5Fg1zDERomenmJMhlls
[6/120] Fetching details for ID: ChIJK13X97ZOzDERECGk7wtI__o
[7/120] Fetching details for ID: ChIJUzwIa9jLzTERs-KeE_GSaDg
[8/120] Fetching details for ID: ChIJu4b_S5qQzDERX9SpmQqTJG8
[9/120] Fetching details for ID: ChIJxYkG-2XMzTERNFNN7sbKNcs
[10/120] Fetching details for ID: ChIJExRePxZqzDERt0dP0WbCEDc
[11/120] Fetching details for ID: ChIJTf0--Cs2zDERFbJ1c8OQXYc
[12/120] Fetching details for ID: ChIJFTWd-us9zDERBiJX7zbDTFk
[13/120] Fetching details for ID: ChIJpa-eNXzOzTERvR7MjczMJEI
[14/120] Fetching details for ID: ChIJJewp6FdSzDERWf_jKQuISFc
[15/120] Fetching details for ID: ChIJ9QvZLixLzDER43jMpS7IL5s
[16/120] F

# Penang

## Tourist Attractions

### Phase 1: Text Search

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Penang, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/penang_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: tourist attractions in Penang, Malaysia
  - Page 1 complete. Total IDs: 20
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 37
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 46
Finished search for tourist attractions.
Starting search for: museums in Penang, Malaysia
  - Page 1 complete. Total IDs: 50
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 52
Finished search for museums.
Starting search for: parks in Penang, Malaysia
  - Page 1 complete. Total IDs: 63
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 71
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 71
Finished search for parks.
Starting search for: historical sites in Penang, Malaysia
  - Page 1 complete. Total IDs: 72
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 72
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 74
Finish

### Phase 2: Place details

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/penang_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/penang_tourist_attractions_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 168
Total IDs already cached: 0
[1/168] Fetching details for ID: ChIJj9l3JL6_SjARpW6YLDIrtjk
[2/168] Fetching details for ID: ChIJ3XarOFbCSjAR6HHFOkU29lQ
[3/168] Fetching details for ID: ChIJ6ztb-fznSjAR4wsqPcykCuA
[4/168] Fetching details for ID: ChIJrzLcX_nISjAROoUcgOWPZAI
[5/168] Fetching details for ID: ChIJ7c98AMzCSjARJu_4dnJKMWw
[6/168] Fetching details for ID: ChIJJ6QQ59PrSjAR-WrOukGa4wA
[7/168] Fetching details for ID: ChIJF2TlbsPDSjAR6yFD9mKvX9U
[8/168] Fetching details for ID: ChIJu40ep5rDSjARuoH1NhjrY-E
[9/168] Fetching details for ID: ChIJxxFGL4TQSjARA3a5exMIv9s
[10/168] Fetching details for ID: ChIJd4X4hs3FSjARHHCpdB9aIkA
[11/168] Fetching details for ID: ChIJOQ-o1PLQSjARlacqP9jjLgg
[12/168] Fetching details for ID: ChIJJ--BzTXDSjARE4q6CZ1HiVA
[13/168] Fetching details for ID: ChIJ50W1D43DSjARlPqYV1MqscE
[14/168] Fetching details for ID: ChIJt9ZhIGHISjAR5UYGFyXsJsk
[15/168] Fetching details for ID: ChIJ3xaC-ru_SjARrMFeCiCAbvE
[16/168] F

## Restaurant

### Phase 1: Text Search

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Penang, Malaysia"
search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/penang_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Penang, Malaysia
  - Page 1 complete. Total IDs: 20
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 35
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 51
Finished search for restaurant.
Starting search for: cafe in Penang, Malaysia
  - Page 1 complete. Total IDs: 67
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 85
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 99
Finished search for cafe.
Starting search for: bar in Penang, Malaysia
  - Page 1 complete. Total IDs: 109
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 114
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 117
Finished search for bar.
Starting search for: food in Penang, Malaysia
  - Page 1 complete. Total IDs: 121
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 126
  Waiting 3s for next_page_token...
  - P

### Phase 2: Place details

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/penang_stage2_restaurants_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/penang_restaurants_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 317
Total IDs already cached: 0
[1/317] Fetching details for ID: ChIJ1QGfFdrDSjARjjI6AjHOZGE
[2/317] Fetching details for ID: ChIJKUUY68vDSjARTj0SVt2K-FI
[3/317] Fetching details for ID: ChIJH4AZp1LCSjARX0P8CxndSfM
[4/317] Fetching details for ID: ChIJw3J6CMHDSjARgwztFAojOyM
[5/317] Fetching details for ID: ChIJsyDSkLrDSjARSTPelqNUFQ4
[6/317] Fetching details for ID: ChIJsy71mY_DSjARmk89_jE9puU
[7/317] Fetching details for ID: ChIJ7_MpOLPDSjARniyNVCbpi1c
[8/317] Fetching details for ID: ChIJGVDmCo_DSjARUykhDgjSqb8
[9/317] Fetching details for ID: ChIJcw_4mI_DSjARqwqH3X4ZxGc
[10/317] Fetching details for ID: ChIJs4VZ_JTDSjARsJ0Q7w9uZ_o
[11/317] Fetching details for ID: ChIJGQu5kVHASjARXjq1pkrI6Ts
[12/317] Fetching details for ID: ChIJjynNe6fDSjARCb9NYyaZ76E
[13/317] Fetching details for ID: ChIJHxw7MdzDSjAR4k5E82lQtsk
[14/317] Fetching details for ID: ChIJQZ-6bd7DSjARmj3r5gTs_J0
[15/317] Fetching details for ID: ChIJEx1THgTDSjARUFk0feDse1U
[16/317] F

## Hotel

### Phase 1: Text Search

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Penang, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/penang_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: hotel in Penang, Malaysia
  - Page 1 complete. Total IDs: 17
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 32
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 43
Finished search for hotel.
Starting search for: resort in Penang, Malaysia
  - Page 1 complete. Total IDs: 50
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 56
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 56
Finished search for resort.
Starting search for: airbnb in Penang, Malaysia
  - Page 1 complete. Total IDs: 60
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 62
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 65
Finished search for airbnb.
Starting search for: homestay in Penang, Malaysia
Error in search for 'homestay': ZERO_RESULTS
Finished search for homestay.
Starting search for: guesthouse in Penang, Malaysia
  - Page 1 complete. 

### Phase 2: Place details

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/penang_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/penang_hotel_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 83
Total IDs already cached: 0
[1/83] Fetching details for ID: ChIJB7TT7c7HSjARA4Jf-xeTZ9U
[2/83] Fetching details for ID: ChIJqTSiR5rDSjAR1klafrVJENU
[3/83] Fetching details for ID: ChIJbyd3r1HDSjAR6qjH27ia8Y4
[4/83] Fetching details for ID: ChIJ5eWkLOPDSjARxt5S8drk5UQ
[5/83] Fetching details for ID: ChIJgSfAzpHDSjARzG5foFIkSOo
[6/83] Fetching details for ID: ChIJe83EeFTCSjARrG0Svr1Sn60
[7/83] Fetching details for ID: ChIJnx6DZ1LDSjARNnFRwbz05I0
[8/83] Fetching details for ID: ChIJT_-vTwHDSjAROYc7--YwMNw
[9/83] Fetching details for ID: ChIJU4SWvo3DSjARaF0-l7zks0E
[10/83] Fetching details for ID: ChIJ3yDHXr3DSjARsl_NphkAkzM
[11/83] Fetching details for ID: ChIJF_UWVZfDSjARrluQiFT_3Fs
[12/83] Fetching details for ID: ChIJuTvIHRyiSjARZfpR9MmnjYQ
[13/83] Fetching details for ID: ChIJM_JCPSjDSjARvLGywhi5ZUk
[14/83] Fetching details for ID: ChIJa8iPrrTCSjARmOt4cf2b_sE
[15/83] Fetching details for ID: ChIJX-KOSX3DSjARe0hUHzm8iTk
[16/83] Fetching details f

# Melaka

## Tourist Attractions

### Phase 1: Text Search

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Melaka, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/melaka_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: tourist attractions in Melaka, Malaysia
  - Page 1 complete. Total IDs: 18
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 23
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 31
Finished search for tourist attractions.
Starting search for: museums in Melaka, Malaysia
  - Page 1 complete. Total IDs: 32
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 33
Finished search for museums.
Starting search for: parks in Melaka, Malaysia
  - Page 1 complete. Total IDs: 44
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 48
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 48
Finished search for parks.
Starting search for: historical sites in Melaka, Malaysia
  - Page 1 complete. Total IDs: 50
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 51
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 52
Finish

### Phase 2: Place details

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/melaka_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/melaka_tourist_attractions_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 122
Total IDs already cached: 0
[1/122] Fetching details for ID: ChIJK1COAhDu0TERKWKNTxQWwy8
[2/122] Fetching details for ID: ChIJRUQNwJXv0TER_BU8-rdIuzM
[3/122] Fetching details for ID: ChIJLVcTMx3x0TERUyCKFp4Sz3g
[4/122] Fetching details for ID: ChIJQX8JdKnl0TERrH-_MJ4dnNI
[5/122] Fetching details for ID: ChIJU4mkkSDu0TER-gdTCZYf4Ss
[6/122] Fetching details for ID: ChIJnfW5-NLv0TER-qCgmTfKpac
[7/122] Fetching details for ID: ChIJI0_TZADx0TER9IjtqRxSASQ
[8/122] Fetching details for ID: ChIJTd7U3t7x0TERgaLEVI5AJ4s
[9/122] Fetching details for ID: ChIJ92jmQ23n0TERFMiZG7VzuFM
[10/122] Fetching details for ID: ChIJW_RdyQNHzDERrXyjH-ZwKWo
[11/122] Fetching details for ID: ChIJey7oxn3x0TERxfytdgU6C5U
[12/122] Fetching details for ID: ChIJq6463uDx0TERR2FjVOigvMg
[13/122] Fetching details for ID: ChIJ1cXIFU_70TER3yWOlxyTaeA
[14/122] Fetching details for ID: ChIJTea-Pg_l0TER38wU1pYkWSk
[15/122] Fetching details for ID: ChIJvSRsY3nv0TERpQOGwzaLR4s
[16/122] F

## Restaurant

### Phase 1: Text Search

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Melaka, Malaysia"
search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/melaka_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Melaka, Malaysia
  - Page 1 complete. Total IDs: 19
  Waiting 3s for next_page_token...
Error in search for 'restaurant': ZERO_RESULTS
Finished search for restaurant.
Starting search for: cafe in Melaka, Malaysia
  - Page 1 complete. Total IDs: 31
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 41
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 52
Finished search for cafe.
Starting search for: bar in Melaka, Malaysia
  - Page 1 complete. Total IDs: 61
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 64
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 65
Finished search for bar.
Starting search for: food in Melaka, Malaysia
  - Page 1 complete. Total IDs: 68
  Waiting 3s for next_page_token...
Error in search for 'food': ZERO_RESULTS
Finished search for food.
Starting search for: bistro in Melaka, Malaysia
  - Page 1 complete. Tota

### Phase 2: Place details

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/melaka_stage2_restaurants_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/melaka_restaurants_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 240
Total IDs already cached: 0
[1/240] Fetching details for ID: ChIJ2aiCUWXl0TERddGWVAjBdu0
[2/240] Fetching details for ID: ChIJB-oArRnx0TER3UCHA71A19E
[3/240] Fetching details for ID: ChIJ58JJfNzx0TERXqNFXRie9ss
[4/240] Fetching details for ID: ChIJER0CQSLx0TEROilQNLrGsvU
[5/240] Fetching details for ID: ChIJF9T3RMLx0TERjhUeOohp_qg
[6/240] Fetching details for ID: ChIJnYue3-Hx0TERjGgwg4hm0Lk
[7/240] Fetching details for ID: ChIJgRK6SNfx0TERp9RXem9pDFc
[8/240] Fetching details for ID: ChIJ1w6vcMHx0TER0DW9eJizQPQ
[9/240] Fetching details for ID: ChIJBWxED8Hx0TERsfM8-QdX9As
[10/240] Fetching details for ID: ChIJKcgqp-Lx0TERqvDz-JGmugw
[11/240] Fetching details for ID: ChIJ2a1RJQnx0TER95fj-xlzpig
[12/240] Fetching details for ID: ChIJEQC6LbDx0TERxl1CvJMHGz0
[13/240] Fetching details for ID: ChIJ04rEztnx0TERMDQQpxcb0pA
[14/240] Fetching details for ID: ChIJe_HzONzx0TERLIDBF4guNWE
[15/240] Fetching details for ID: ChIJUd6WMkbx0TERkMWmBi_7x18
[16/240] F

## Hotel

### Phase 1: Text Search

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Melaka, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/melaka_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: hotel in Melaka, Malaysia
  - Page 1 complete. Total IDs: 19
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 29
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 42
Finished search for hotel.
Starting search for: resort in Melaka, Malaysia
  - Page 1 complete. Total IDs: 49
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 53
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 56
Finished search for resort.
Starting search for: airbnb in Melaka, Malaysia
  - Page 1 complete. Total IDs: 57
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 58
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 58
Finished search for airbnb.
Starting search for: homestay in Melaka, Malaysia
Error in search for 'homestay': ZERO_RESULTS
Finished search for homestay.
Starting search for: guesthouse in Melaka, Malaysia
  - Page 1 complete. 

### Phase 2: Place details

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/melaka_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/melaka_hotel_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 70
Total IDs already cached: 0
[1/70] Fetching details for ID: ChIJ34mkeubx0TERhHh2JuVaJoI
[2/70] Fetching details for ID: ChIJc8IZUxvu0TERLqU6HlLK7P0
[3/70] Fetching details for ID: ChIJLSldXObx0TERUTYKwO-v7To
[4/70] Fetching details for ID: ChIJxTrBBdnx0TERokTk--oJyJk
[5/70] Fetching details for ID: ChIJvYKdENnx0TER7tJbefc3KJs
[6/70] Fetching details for ID: ChIJcX9QERvu0TERMWtBr5qVl-E
[7/70] Fetching details for ID: ChIJxaa1o-Hx0TERciNSToPVopA
[8/70] Fetching details for ID: ChIJ0RAVytvx0TER0sCy7PW0P8I
[9/70] Fetching details for ID: ChIJKwaGLRfu0TERm0lsKvMkq3w
[10/70] Fetching details for ID: ChIJOwJo-ePx0TERYzPt2Hrp8tg
[11/70] Fetching details for ID: ChIJZXpGe7jx0TERzPkuhJf2woA
[12/70] Fetching details for ID: ChIJ3XRkWB7u0TERHVAYIBSLkJE
[13/70] Fetching details for ID: ChIJazDtVAPu0TERpczA5tEQ_7k
[14/70] Fetching details for ID: ChIJz0ax0qXv0TERbFB4JfhhzX4
[15/70] Fetching details for ID: ChIJ04KM3ojx0TEROwc58zsYMpg
[16/70] Fetching details f

# Ipoh

## Tourist Attractions

### Phase 1: Text Search

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Ipoh, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/ipoh_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: tourist attractions in Ipoh, Malaysia
  - Page 1 complete. Total IDs: 13
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 20
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 31
Finished search for tourist attractions.
Starting search for: museums in Ipoh, Malaysia
  - Page 1 complete. Total IDs: 35
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 35
Finished search for museums.
Starting search for: parks in Ipoh, Malaysia
  - Page 1 complete. Total IDs: 37
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 40
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 41
Finished search for parks.
Starting search for: historical sites in Ipoh, Malaysia
  - Page 1 complete. Total IDs: 42
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 42
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 42
Finished searc

### Phase 2: Place details

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/ipoh_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/ipoh_tourist_attractions_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 107
Total IDs already cached: 0
[1/107] Fetching details for ID: ChIJ3666bnvsyjERKOtgyg_HNsU
[2/107] Fetching details for ID: ChIJQfpGEnzsyjERCu3-Y4MKrqY
[3/107] Fetching details for ID: ChIJXz0-PrviyjERAMs5w2S9g3I
[4/107] Fetching details for ID: ChIJgfqIa27syjERpVcxh0U8JoU
[5/107] Fetching details for ID: ChIJM7BCfWuTyjERGGeJBMh68bA
[6/107] Fetching details for ID: ChIJ35wnylvtyjERLbv82zwlikA
[7/107] Fetching details for ID: ChIJZdUaIIjsyjERzJhq3gdjZdU
[8/107] Fetching details for ID: ChIJ68dFc5-TyjERRX8bpKia0lM
[9/107] Fetching details for ID: ChIJC3xCqWPsyjERI3LE0TE58kU
[10/107] Fetching details for ID: ChIJIVR0x8vryjERfg2QXb6EqPg
[11/107] Fetching details for ID: ChIJMznr9GbsyjER81VrEgda4Ho
[12/107] Fetching details for ID: ChIJtzw37X7syjERC2q-yhmR3lA
[13/107] Fetching details for ID: ChIJq6qauI_syjERyBLMBFikH8Y
[14/107] Fetching details for ID: ChIJIRqTuJSuyjERGHkwbvpNT9I
[15/107] Fetching details for ID: ChIJWyheJuNntTERMmFrMJWf5xU
[16/107] F

## Restaurant

### Phase 1: Text Search

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Ipoh, Malaysia"
search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/ipoh_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Ipoh, Malaysia
  - Page 1 complete. Total IDs: 19
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 35
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 55
Finished search for restaurant.
Starting search for: cafe in Ipoh, Malaysia
  - Page 1 complete. Total IDs: 69
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 77
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 84
Finished search for cafe.
Starting search for: bar in Ipoh, Malaysia
  - Page 1 complete. Total IDs: 90
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 91
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 93
Finished search for bar.
Starting search for: food in Ipoh, Malaysia
  - Page 1 complete. Total IDs: 94
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 101
  Waiting 3s for next_page_token...
  - Page 3 comple

### Phase 2: Place details

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/ipoh_stage2_restaurants_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/ipoh_restaurants_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 277
Total IDs already cached: 0
[1/277] Fetching details for ID: ChIJJ2v3WsrtyjERqo_017O70s0
[2/277] Fetching details for ID: ChIJEZ1aF37syjERXN2NDdRoh1A
[3/277] Fetching details for ID: ChIJQfpGEnzsyjERCu3-Y4MKrqY
[4/277] Fetching details for ID: ChIJ3T6hBmDsyjERTqKz-Z6ZfDA
[5/277] Fetching details for ID: ChIJG3GXPADtyjERsYZkkw2x0S0
[6/277] Fetching details for ID: ChIJa03daVyTyjERzRSdN0A8DW8
[7/277] Fetching details for ID: ChIJQSeQpmHsyjERB4Pm33DjbLc
[8/277] Fetching details for ID: ChIJmWHkfGLsyjERav5S6SxCXvE
[9/277] Fetching details for ID: ChIJEXZR-5jsyjERGPNSMf49S6g
[10/277] Fetching details for ID: ChIJhzYz9FPtyjERrktQVlB8-L8
[11/277] Fetching details for ID: ChIJux3zgqjtyjERIInWxRZbYes
[12/277] Fetching details for ID: ChIJZzIOU-vryjERnTTrfidHcU8
[13/277] Fetching details for ID: ChIJ-87fX17tyjER7CELCoTy7V4
[14/277] Fetching details for ID: ChIJ747acGPsyjERpcu0nk7JboA
[15/277] Fetching details for ID: ChIJhamELw_tyjER5BJytgVPcp0
[16/277] F

## Hotel

### Phase 1: Text Search

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Ipoh, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/ipoh_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: hotel in Ipoh, Malaysia
  - Page 1 complete. Total IDs: 13
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 20
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 29
Finished search for hotel.
Starting search for: resort in Ipoh, Malaysia
  - Page 1 complete. Total IDs: 32
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 32
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 32
Finished search for resort.
Starting search for: airbnb in Ipoh, Malaysia
  - Page 1 complete. Total IDs: 35
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 35
Finished search for airbnb.
Starting search for: homestay in Ipoh, Malaysia
Error in search for 'homestay': ZERO_RESULTS
Finished search for homestay.
Starting search for: guesthouse in Ipoh, Malaysia
  - Page 1 complete. Total IDs: 35
Finished search for guesthouse.
Starting search for: motel in Ipoh,

### Phase 2: Place details

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/ipoh_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/ipoh_hotel_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 51
Total IDs already cached: 0
[1/51] Fetching details for ID: ChIJSaFzHWPsyjEReEtqk_bS5Ls
[2/51] Fetching details for ID: ChIJRRrQyyvsyjER-mx7Pu8zEMs
[3/51] Fetching details for ID: ChIJ8z8De1ntyjERzppTnh3hwp8
[4/51] Fetching details for ID: ChIJsf7zwCvsyjERZ33lm23Bw0E
[5/51] Fetching details for ID: ChIJp6E2ybLuyjERDtgnGuVsXxo
[6/51] Fetching details for ID: ChIJW07DCiDtyjERTwnKEBNlZAs
[7/51] Fetching details for ID: ChIJD92Ll2LsyjERoJW78iClB6Y
[8/51] Fetching details for ID: ChIJN9ZokYnsyjERsfFfZhF0oWk
[9/51] Fetching details for ID: ChIJbbcDtB3tyjERbturpi2QVDk
[10/51] Fetching details for ID: ChIJC3xCqWPsyjERI3LE0TE58kU
[11/51] Fetching details for ID: ChIJb3e2nqryyjER2Q6ex7ccnh4
[12/51] Fetching details for ID: ChIJV4YEIWXsyjERcHTQPisOXy4
[13/51] Fetching details for ID: ChIJJR9Vo3DsyjERqkLdmnWq-8o
[14/51] Fetching details for ID: ChIJE0s-JIfryjERuDc8GE_NHNc
[15/51] Fetching details for ID: ChIJsRWj-GTsyjERIxRJszdqO2M
[16/51] Fetching details f

# Langkawi

## Tourist Attractions

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Langkawi, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/langkawi_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: tourist attractions in Langkawi, Malaysia
  - Page 1 complete. Total IDs: 5
Finished search for tourist attractions.
Starting search for: museums in Langkawi, Malaysia
  - Page 1 complete. Total IDs: 10
Finished search for museums.
Starting search for: parks in Langkawi, Malaysia
  - Page 1 complete. Total IDs: 16
Finished search for parks.
Starting search for: historical sites in Langkawi, Malaysia
  - Page 1 complete. Total IDs: 20
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 29
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 30
Finished search for historical sites.
Starting search for: landmarks in Langkawi, Malaysia
  - Page 1 complete. Total IDs: 30
Finished search for landmarks.
Starting search for: shopping mall in Langkawi, Malaysia
  - Page 1 complete. Total IDs: 44
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 45
Finished search for shopping mall.

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/langkawi_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/langkawi_tourist_attractions_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 57
Total IDs already cached: 0
[1/57] Fetching details for ID: ChIJ0SB8KHp_TDARdSKacEeUu6E
[2/57] Fetching details for ID: ChIJ6xJ0xNB5TDAR83Yq6y7lReI
[3/57] Fetching details for ID: ChIJNyWRtoGASzARm9-4bmN18WU
[4/57] Fetching details for ID: ChIJR6YfEEeESzARvG2BSXuqmSg
[5/57] Fetching details for ID: ChIJ_6BX9dCHSzARjJbT_GzTo8k
[6/57] Fetching details for ID: ChIJAd9TL-F2TDARfrojfQscVeI
[7/57] Fetching details for ID: ChIJw2-THMR2TDARwykmuG158jY
[8/57] Fetching details for ID: ChIJSV9bFcd8TDAR1VNW4V90W-Q
[9/57] Fetching details for ID: ChIJ-aKjt96HSzARzJkIu36wYoQ
[10/57] Fetching details for ID: ChIJ__-PsHZ_TDAR-_smdirb69A
[11/57] Fetching details for ID: ChIJP0Cua-GHSzARfhiA7b24v44
[12/57] Fetching details for ID: ChIJd_EoYH5_TDARnBjvtKUG-K8
[13/57] Fetching details for ID: ChIJ-bYYMst5TDARb7ibMWwoMRY
[14/57] Fetching details for ID: ChIJXW0EGBi2zTERiKBASs5S8lM
[15/57] Fetching details for ID: ChIJe9jMqX5_TDARRHCGeNqKMLA
[16/57] Fetching details f

## Restaurant

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Langkawi, Malaysia"
search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/langkawi_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Langkawi, Malaysia
  - Page 1 complete. Total IDs: 15
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 29
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 44
Finished search for restaurant.
Starting search for: cafe in Langkawi, Malaysia
  - Page 1 complete. Total IDs: 50
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 52
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 59
Finished search for cafe.
Starting search for: bar in Langkawi, Malaysia
  - Page 1 complete. Total IDs: 63
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 65
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 66
Finished search for bar.
Starting search for: food in Langkawi, Malaysia
  - Page 1 complete. Total IDs: 67
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 70
  Waiting 3s for next_page_token...
  

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/langkawi_stage2_restaurants_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/langkawi_restaurants_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 122
Total IDs already cached: 0
[1/122] Fetching details for ID: ChIJRS0Pyet4TDARiovfSSWk9EU
[2/122] Fetching details for ID: ChIJrVR_-t-HSzARVidwxeP6OUU
[3/122] Fetching details for ID: ChIJwQoTI9KHSzARau3bCYV8mPA
[4/122] Fetching details for ID: ChIJv16Qi_WHSzARud7tKch-B8k
[5/122] Fetching details for ID: ChIJSacGVtCHSzARTs6HPZLhzwg
[6/122] Fetching details for ID: ChIJ60BjXjN_TDARTSGl-gM_eAs
[7/122] Fetching details for ID: ChIJ31oZnmh3TDARmt7S1TvTHIg
[8/122] Fetching details for ID: ChIJHxijZwCHSzARkMBYbQEPaFg
[9/122] Fetching details for ID: ChIJTdycAK-HSzARvZCwcgtgsGY
[10/122] Fetching details for ID: ChIJhettVY-ASzAR_pSsALPhhLs
[11/122] Fetching details for ID: ChIJCyUGp9GHSzAR5GFWD7Q6mck
[12/122] Fetching details for ID: ChIJ78WIUteHSzARtR_8X4OnbVk
[13/122] Fetching details for ID: ChIJl9uBa4h_TDARPhUIlOO5aCM
[14/122] Fetching details for ID: ChIJsd3UdXaHSzAR0j4gduFYnMU
[15/122] Fetching details for ID: ChIJccMmA-CHSzAR23mIyaegX9o
[16/122] F

## Hotel

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Langkawi, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/langkawi_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: hotel in Langkawi, Malaysia
  - Page 1 complete. Total IDs: 18
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 32
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 47
Finished search for hotel.
Starting search for: resort in Langkawi, Malaysia
  - Page 1 complete. Total IDs: 52
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 55
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 56
Finished search for resort.
Starting search for: airbnb in Langkawi, Malaysia
  - Page 1 complete. Total IDs: 56
Finished search for airbnb.
Starting search for: homestay in Langkawi, Malaysia
Error in search for 'homestay': ZERO_RESULTS
Finished search for homestay.
Starting search for: guesthouse in Langkawi, Malaysia
  - Page 1 complete. Total IDs: 56
Finished search for guesthouse.
Starting search for: motel in Langkawi, Malaysia
  - Page 1 complete. Total IDs: 57
Fi

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/langkawi_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/langkawi_hotel_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 66
Total IDs already cached: 0
[1/66] Fetching details for ID: ChIJWcVWoaqHSzARtuRDsSJUMwU
[2/66] Fetching details for ID: ChIJ0zLzCeGHSzAR1XWCbg3_LQM
[3/66] Fetching details for ID: ChIJfxZ9mgqISzARukHx7V-BnoA
[4/66] Fetching details for ID: ChIJuTIUwdKHSzARChMP9E2rTNs
[5/66] Fetching details for ID: ChIJN9KZ0cSHSzARfbKQFFJfEL4
[6/66] Fetching details for ID: ChIJ5Tw9AN-HSzAR6m1YrKJ_SPw
[7/66] Fetching details for ID: ChIJF0Aw9xR4TDAR315MVtiVvkM
[8/66] Fetching details for ID: ChIJf0JJETuGSzAR8jTD6JWZpsc
[9/66] Fetching details for ID: ChIJE9CizK9_TDARfneqyZMkSJs
[10/66] Fetching details for ID: ChIJ756UWNCHSzAR6ta9vNKx2K8
[11/66] Fetching details for ID: ChIJLQEbc2x4TDAROHBIkwTsGec
[12/66] Fetching details for ID: ChIJb4mT6X9_TDARtQyVdsXR_V0
[13/66] Fetching details for ID: ChIJ78O_yd6HSzAR6RazLZWJGMk
[14/66] Fetching details for ID: ChIJW9achPB2TDAR5xxxfaE9uz4
[15/66] Fetching details for ID: ChIJm6lXBLGJSzARGrsf5ulHNP0
[16/66] Fetching details f

# Sabah

## Tourist Attractions

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Sabah, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/sabah_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: tourist attractions in Sabah, Malaysia
  - Page 1 complete. Total IDs: 13
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 22
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 34
Finished search for tourist attractions.
Starting search for: museums in Sabah, Malaysia
  - Page 1 complete. Total IDs: 39
Finished search for museums.
Starting search for: parks in Sabah, Malaysia
  - Page 1 complete. Total IDs: 42
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 48
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 52
Finished search for parks.
Starting search for: historical sites in Sabah, Malaysia
  - Page 1 complete. Total IDs: 53
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 57
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 62
Finished search for historical sites.
Starting search for: landmarks in Sabah, Ma

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/sabah_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/sabah_tourist_attractions_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 146
Total IDs already cached: 0
[1/146] Fetching details for ID: ChIJL0Z-z7BpOzIRIEUWpx2lhNw
[2/146] Fetching details for ID: ChIJ4W53uxw7PzIREADa5so7Zn8
[3/146] Fetching details for ID: ChIJ3beGk1mmOzIR5z7EtkStg-k
[4/146] Fetching details for ID: ChIJJyx2xN2eMDIRp2ExyIPTdHo
[5/146] Fetching details for ID: ChIJXdLunvLYODIRkkLYNdHQOF0
[6/146] Fetching details for ID: ChIJj_DzVyZBOzIRSKNNmzmgmBE
[7/146] Fetching details for ID: ChIJbTsNMTZCOzIR7u32rdFQQmo
[8/146] Fetching details for ID: ChIJ9Ten7QuwOzIRPgO0ApYiNEs
[9/146] Fetching details for ID: ChIJT3ozCCCHPDIRJi2j8x70wXI
[10/146] Fetching details for ID: ChIJ2b8hoIhpOzIRJfuSascwQ3Q
[11/146] Fetching details for ID: ChIJPz75LimfOzIR42t6Q2NF8vQ
[12/146] Fetching details for ID: ChIJN2sNPSloOzIRrPTPP23Jjxc
[13/146] Fetching details for ID: ChIJwWRVFDmkPzIRCIQA8ltjqPQ
[14/146] Fetching details for ID: ChIJb-NKQ0lCOzIRXCpiVlW-YUY
[15/146] Fetching details for ID: ChIJZzdhHIZpOzIRr_9CABl11w8
[16/146] F

## Restaurant

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Sabah, Malaysia"
search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/sabah_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Sabah, Malaysia
  - Page 1 complete. Total IDs: 13
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 25
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 37
Finished search for restaurant.
Starting search for: cafe in Sabah, Malaysia
  - Page 1 complete. Total IDs: 47
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 52
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 59
Finished search for cafe.
Starting search for: bar in Sabah, Malaysia
  - Page 1 complete. Total IDs: 62
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 64
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 66
Finished search for bar.
Starting search for: food in Sabah, Malaysia
  - Page 1 complete. Total IDs: 72
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 82
  Waiting 3s for next_page_token...
  - Page 3 com

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/sabah_stage2_restaurants_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/sabah_restaurants_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 239
Total IDs already cached: 0
[1/239] Fetching details for ID: ChIJO-BbgjJoOzIRtZcnw-fl_WA
[2/239] Fetching details for ID: ChIJYYNgHFtpOzIR9UC07KfWnvE
[3/239] Fetching details for ID: ChIJg04hML1pOzIRTuah53Q9fKg
[4/239] Fetching details for ID: ChIJb0WDyyZpOzIRO20JwxKEfl8
[5/239] Fetching details for ID: ChIJv20DM5JpOzIRxLhiiQySdgM
[6/239] Fetching details for ID: ChIJvWAi4gZpOzIRxRM7isJQyOk
[7/239] Fetching details for ID: ChIJa9ghjLhrOzIRNUVck6byCQY
[8/239] Fetching details for ID: ChIJ4d6vjI9pOzIRDUVH5gLGGBw
[9/239] Fetching details for ID: ChIJ7VOQFo9pOzIRaMCgHDAniEg
[10/239] Fetching details for ID: ChIJqbd-G49pOzIRq_y0ZzgKnFY
[11/239] Fetching details for ID: ChIJE1mUkwZrOzIRxqucappq3zQ
[12/239] Fetching details for ID: ChIJfQw4U_9oOzIRRpBjGMmps14
[13/239] Fetching details for ID: ChIJdayIIw9pOzIRS8GCvtAdk-c
[14/239] Fetching details for ID: ChIJTZOrW6xpOzIRPiYR2f8HpNA
[15/239] Fetching details for ID: ChIJDXprDzRoOzIRNPmEIuQIQ-o
[16/239] F

## Hotel

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Sabah, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/langkawi_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Loading existing Place IDs from cache: /content/drive/MyDrive/tourism_data/langkawi_hotel_stage1_place_ids.json
Loaded 66 IDs from cache. Skipping search.
Starting Stage 1 Search. Current ID count: 66
Starting search for: hotel in Sabah, Malaysia
  - Page 1 complete. Total IDs: 81
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 92
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 102
Finished search for hotel.
Starting search for: resort in Sabah, Malaysia
  - Page 1 complete. Total IDs: 111
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 120
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 123
Finished search for resort.
Starting search for: airbnb in Sabah, Malaysia
  - Page 1 complete. Total IDs: 128
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 128
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 130
Finished search for airbnb.
Starting search for: homestay in Sabah

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/sabah_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/sabah_hotel_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 142
Total IDs already cached: 0
[1/142] Fetching details for ID: ChIJ2Y4g5IVpOzIRxxzT8E2zM2Q
[2/142] Fetching details for ID: ChIJuTIUwdKHSzARChMP9E2rTNs
[3/142] Fetching details for ID: ChIJ3QxlSoZpOzIROrNTZlm6eqM
[4/142] Fetching details for ID: ChIJ2_0KP3trOzIRfzX1Kcn_Vmc
[5/142] Fetching details for ID: ChIJE9CizK9_TDARfneqyZMkSJs
[6/142] Fetching details for ID: ChIJa4hGS15oOzIRfX_MVSnkLPI
[7/142] Fetching details for ID: ChIJLQEbc2x4TDAROHBIkwTsGec
[8/142] Fetching details for ID: ChIJXUrSpYhpOzIRR4kDCMuEYpw
[9/142] Fetching details for ID: ChIJw6ZAvAGaOzIRnXX7G9ExtUE
[10/142] Fetching details for ID: ChIJn-NtQNmHSzARiztVMQ-ujxQ
[11/142] Fetching details for ID: ChIJI70C-tTEODIRLbJrnPEd3jw
[12/142] Fetching details for ID: ChIJH96IUcRpOzIRuh9hinGQ1yM
[13/142] Fetching details for ID: ChIJ71KcbgmISzARRFiTw7js3fw
[14/142] Fetching details for ID: ChIJ4yfVhIdpOzIRzdm897yNwwY
[15/142] Fetching details for ID: ChIJ72QNdpWHSzAR2RhdWBAINp4
[16/142] F

# Johor

## Tourist Attractions

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Johor, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/johor_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: tourist attractions in Johor, Malaysia
  - Page 1 complete. Total IDs: 15
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 23
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 33
Finished search for tourist attractions.
Starting search for: museums in Johor, Malaysia
  - Page 1 complete. Total IDs: 37
Finished search for museums.
Starting search for: parks in Johor, Malaysia
  - Page 1 complete. Total IDs: 42
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 50
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 51
Finished search for parks.
Starting search for: historical sites in Johor, Malaysia
  - Page 1 complete. Total IDs: 53
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 55
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 55
Finished search for historical sites.
Starting search for: landmarks in Johor, Ma

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/johor_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/johor_tourist_attractions_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 151
Total IDs already cached: 0
[1/151] Fetching details for ID: ChIJJRc_dkxt2jERY0I4IA8BxlI
[2/151] Fetching details for ID: ChIJDyOtkBc82jERkMqi4Nb1gpo
[3/151] Fetching details for ID: ChIJodTOKK1G0DERtOADLXHUWe8
[4/151] Fetching details for ID: ChIJE4UtmEkz2jERq5SyMrqecxg
[5/151] Fetching details for ID: ChIJUQZO_Qpt2jERvaZBB7spZ_E
[6/151] Fetching details for ID: ChIJB8Il_wCb0DERWgRo6Qqck8s
[7/151] Fetching details for ID: ChIJjXEVnRkN2jERXZsLstIuSUo
[8/151] Fetching details for ID: ChIJ6_HO_yxr2jERxqzWeWTIjJ8
[9/151] Fetching details for ID: ChIJ40mnQFNx2jERG02S3S2H3Bk
[10/151] Fetching details for ID: ChIJIyOvUFNZ0DERHTYOOWnX4As
[11/151] Fetching details for ID: ChIJrxY5OInhzzERc5XEoxHjJOc
[12/151] Fetching details for ID: ChIJM-12PIwL2jERSzVEGdIIW4E
[13/151] Fetching details for ID: ChIJk7KOOe9s2jERXBm5PIqxCX8
[14/151] Fetching details for ID: ChIJVzJNappM2jERuRs20xlleRg
[15/151] Fetching details for ID: ChIJ3WMd2MIS2jERTTvd4lA3aVw
[16/151] F

## Restaurant

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Johor, Malaysia"
search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/johor_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Johor, Malaysia
  - Page 1 complete. Total IDs: 18
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 31
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 40
Finished search for restaurant.
Starting search for: cafe in Johor, Malaysia
  - Page 1 complete. Total IDs: 55
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 70
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 82
Finished search for cafe.
Starting search for: bar in Johor, Malaysia
  - Page 1 complete. Total IDs: 82
Finished search for bar.
Starting search for: food in Johor, Malaysia
  - Page 1 complete. Total IDs: 88
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 101
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 106
Finished search for food.
Starting search for: bistro in Johor, Malaysia
  - Page 1 complete. Total IDs: 110
  Waiting 

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/johor_stage2_restaurants_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/johor_restaurants_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 384
Total IDs already cached: 0
[1/384] Fetching details for ID: ChIJfdbvj0Jt0DER0fAtCVwndCc
[2/384] Fetching details for ID: ChIJk_idN3oU2jEReqhHxnv3lgI
[3/384] Fetching details for ID: ChIJ84UVJw9t2jERqzqE46C3l1c
[4/384] Fetching details for ID: ChIJa3ztiMZt2jERU3RU0Z13DkI
[5/384] Fetching details for ID: ChIJm7ZJTXpz2jERcS1XL40MR0o
[6/384] Fetching details for ID: ChIJEcljAGoU2jERqtf7wNTnNlg
[7/384] Fetching details for ID: ChIJc1S5gHNt2jERm0h5RUeuduo
[8/384] Fetching details for ID: ChIJHex3bztt2jERC5u6EBLSJFw
[9/384] Fetching details for ID: ChIJ4XcN9Qpt2jERgh9CVBPHyC8
[10/384] Fetching details for ID: ChIJozdKhzNz2jERfQI0JvliqQI
[11/384] Fetching details for ID: ChIJw0l8mg932jERMPjzKDLj8ps
[12/384] Fetching details for ID: ChIJ6zMcEiZt2jERxw8SYxNsPQ0
[13/384] Fetching details for ID: ChIJU2CakYhu2jER54rxe6lnaCo
[14/384] Fetching details for ID: ChIJ-VPKH79y2jEReuFPjX49Y-s
[15/384] Fetching details for ID: ChIJLymmgIhu2jERHt0jskDOY3o
[16/384] F

## Hotel

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Johor, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/johor_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: hotel in Johor, Malaysia
  - Page 1 complete. Total IDs: 16
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 34
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 48
Finished search for hotel.
Starting search for: resort in Johor, Malaysia
  - Page 1 complete. Total IDs: 64
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 74
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 81
Finished search for resort.
Starting search for: airbnb in Johor, Malaysia
  - Page 1 complete. Total IDs: 82
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 84
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 84
Finished search for airbnb.
Starting search for: homestay in Johor, Malaysia
Error in search for 'homestay': ZERO_RESULTS
Finished search for homestay.
Starting search for: guesthouse in Johor, Malaysia
  - Page 1 complete. Total

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/johor_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/johor_hotel_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 115
Total IDs already cached: 0
[1/115] Fetching details for ID: ChIJde0NdlNt2jER0X1yE2PO68o
[2/115] Fetching details for ID: ChIJe1qqMN0S2jERMWriREjO7O8
[3/115] Fetching details for ID: ChIJZSH_iOls2jERBRob2gWcYYo
[4/115] Fetching details for ID: ChIJE4UtmEkz2jERq5SyMrqecxg
[5/115] Fetching details for ID: ChIJJW1g_Gdt2jERFtm4qwGJNOg
[6/115] Fetching details for ID: ChIJY9gQOTJt2jERpjZ63XIRx5s
[7/115] Fetching details for ID: ChIJd8W3_bdt2jERXM31dloaSPI
[8/115] Fetching details for ID: ChIJyx-EE5YT2jER2iWAnXr2uSU
[9/115] Fetching details for ID: ChIJ-xLxjcET2jERZ3Y3wiv9u3A
[10/115] Fetching details for ID: ChIJRWQSgrlt2jERdhUhYRaoUwE
[11/115] Fetching details for ID: ChIJvYBp6y4T2jERd_8w15yQiLo
[12/115] Fetching details for ID: ChIJE3ufsz9DxTERUpfjOfKbMvs
[13/115] Fetching details for ID: ChIJ6ZjbdH1u2jER-__DZdAw8RQ
[14/115] Fetching details for ID: ChIJAf_zOhZDxTERURVNN68RiOw
[15/115] Fetching details for ID: ChIJeeHflJIz2jERvnYtVkGtRF4
[16/115] F

# Genting Highlands

## Tourist Attractions

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Genting Highlands, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/genting_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: tourist attractions in Genting Highlands, Malaysia
  - Page 1 complete. Total IDs: 5
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 7
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 8
Finished search for tourist attractions.
Starting search for: museums in Genting Highlands, Malaysia
  - Page 1 complete. Total IDs: 13
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 18
Finished search for museums.
Starting search for: parks in Genting Highlands, Malaysia
  - Page 1 complete. Total IDs: 18
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 20
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 22
Finished search for parks.
Starting search for: historical sites in Genting Highlands, Malaysia
  - Page 1 complete. Total IDs: 23
Finished search for historical sites.
Starting search for: landmarks in Genting Highlands, Malaysia
  - Pag

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/genting_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/genting_tourist_attractions_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 111
Total IDs already cached: 0
[1/111] Fetching details for ID: ChIJ3frksplJzDER5USO8zLKNs0
[2/111] Fetching details for ID: ChIJEQAAAAEUzDERZlhQ3bHMCGY
[3/111] Fetching details for ID: ChIJW7yTH2zIyDEReXMJW6PB-DI
[4/111] Fetching details for ID: ChIJ93UrkwAUzDER73qDqGABE8w
[5/111] Fetching details for ID: ChIJP4nhRdYWzDERIBTC12w_cmk
[6/111] Fetching details for ID: ChIJAxj0sByLzDERrlwJ1iSwq54
[7/111] Fetching details for ID: ChIJ67ktsREVzDERqyfhH8BKfOY
[8/111] Fetching details for ID: ChIJp7mMpJ03zDERQFYGwK_Qnbs
[9/111] Fetching details for ID: ChIJO9cemPUQ2jERvlh8KtwhtAc
[10/111] Fetching details for ID: ChIJlSyJwCtPzDERZF0AU9hYoL8
[11/111] Fetching details for ID: ChIJB9ROJgEUzDER527txVqj4uQ
[12/111] Fetching details for ID: ChIJ9_6dH8dJzDERC7k4SHbqU3o
[13/111] Fetching details for ID: ChIJh22BVdYTzDERY2geTslH0f8
[14/111] Fetching details for ID: ChIJWcg_AbAVzDERXtNYufMRoVc
[15/111] Fetching details for ID: ChIJK2yW5NMVzDERIB5d1vv5p5o
[16/111] F

## Restaurant

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Genting Highlands, Malaysia"
search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/genting_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Genting Highlands, Malaysia
  - Page 1 complete. Total IDs: 18
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 33
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 40
Finished search for restaurant.
Starting search for: cafe in Genting Highlands, Malaysia
  - Page 1 complete. Total IDs: 48
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 52
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 56
Finished search for cafe.
Starting search for: bar in Genting Highlands, Malaysia
  - Page 1 complete. Total IDs: 58
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 63
Finished search for bar.
Starting search for: food in Genting Highlands, Malaysia
  - Page 1 complete. Total IDs: 63
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 64
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 65
Fi

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/genting_stage2_restaurant_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/genting_restaurant_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 107
Total IDs already cached: 0
[1/107] Fetching details for ID: ChIJv31OWr0VzDERwHKNy7PrqZ4
[2/107] Fetching details for ID: ChIJuTCO-gYUzDERHAOLpAsGVdU
[3/107] Fetching details for ID: ChIJx5dffaAVzDERb2yCZhy_V5A
[4/107] Fetching details for ID: ChIJd9T5GAEUzDER5qaPrUZS7mA
[5/107] Fetching details for ID: ChIJu-YtpTQVzDER57L6XBqYs-s
[6/107] Fetching details for ID: ChIJ19XVWS0VzDERNR1KS6BmLp4
[7/107] Fetching details for ID: ChIJE8MGjDwUzDERTqZs_C7y2mI
[8/107] Fetching details for ID: ChIJTSyCYgsVzDERiFgUd2vJjaQ
[9/107] Fetching details for ID: ChIJb58mlxUVzDERBQc4GusvyWU
[10/107] Fetching details for ID: ChIJnS8ZEgEUzDERPCZ8T9g2rvs
[11/107] Fetching details for ID: ChIJESyqEf4VzDERXySMWxpXoAE
[12/107] Fetching details for ID: ChIJzaqZHQEUzDER9mPYZ0nG6mw
[13/107] Fetching details for ID: ChIJ4VZMEQEUzDERMkGqV2KfdiY
[14/107] Fetching details for ID: ChIJ_7uDTE8UzDERzacbRF9s3uc
[15/107] Fetching details for ID: ChIJcb7Ma-4VzDERfT72-Wv_6GE
[16/107] F

## Hotel

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Genting Highland, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/genting_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: hotel in Genting Highland, Malaysia
  - Page 1 complete. Total IDs: 15
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 17
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 18
Finished search for hotel.
Starting search for: resort in Genting Highland, Malaysia
  - Page 1 complete. Total IDs: 21
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 21
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 22
Finished search for resort.
Starting search for: airbnb in Genting Highland, Malaysia
  - Page 1 complete. Total IDs: 23
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 24
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 24
Finished search for airbnb.
Starting search for: homestay in Genting Highland, Malaysia
Error in search for 'homestay': ZERO_RESULTS
Finished search for homestay.
Starting search for: guesthouse i

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/genting_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/genting_hotel_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 24
Total IDs already cached: 0
[1/24] Fetching details for ID: ChIJF7vjwkUUzDERjJ7rpbbBq2Q
[2/24] Fetching details for ID: ChIJSbPpRKQVzDERdeJAbLLjqXk
[3/24] Fetching details for ID: ChIJZ-TdNrUVzDERKx2YQypIe4c
[4/24] Fetching details for ID: ChIJEQAAAAEUzDERZlhQ3bHMCGY
[5/24] Fetching details for ID: ChIJdT5P3AAUzDERy1xSx2I8xB4
[6/24] Fetching details for ID: ChIJTXw61DwTzDERlflQ1d0QngE
[7/24] Fetching details for ID: ChIJ5-K7zDsUzDERUkoXtVFogIU
[8/24] Fetching details for ID: ChIJYaub50UUzDERk3fsvLbK5HA
[9/24] Fetching details for ID: ChIJ1x0DQrUVzDER6dv7s-oWYjI
[10/24] Fetching details for ID: ChIJgw4DsDITzDERKWQySzc5fwo
[11/24] Fetching details for ID: ChIJh22BVdYTzDERY2geTslH0f8
[12/24] Fetching details for ID: ChIJ5UxErGYVzDERUiIW3dJeHt8
[13/24] Fetching details for ID: ChIJZxpNRrUVzDERlZ8icTRA-kk
[14/24] Fetching details for ID: ChIJWcg_AbAVzDERXtNYufMRoVc
[15/24] Fetching details for ID: ChIJmV3YR8kVzDER3pRF5F_lZnk
[16/24] Fetching details f

# Negeri Sembilan

## Tourist Attractions

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Negeri Sembilan, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/n9_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: tourist attractions in Negeri Sembilan, Malaysia
  - Page 1 complete. Total IDs: 13
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 24
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 35
Finished search for tourist attractions.
Starting search for: museums in Negeri Sembilan, Malaysia
  - Page 1 complete. Total IDs: 36
Finished search for museums.
Starting search for: parks in Negeri Sembilan, Malaysia
  - Page 1 complete. Total IDs: 40
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 45
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 46
Finished search for parks.
Starting search for: historical sites in Negeri Sembilan, Malaysia
  - Page 1 complete. Total IDs: 49
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 51
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 51
Finished search for historical sites.
Sta

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/n9_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/n9_tourist_attractions_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 162
Total IDs already cached: 0
[1/162] Fetching details for ID: ChIJdVIcudrZzTERNXBZu0ghPlk
[2/162] Fetching details for ID: ChIJ6XoyyYXdzTERbAXOaNFMchM
[3/162] Fetching details for ID: ChIJO_kf_2kDzjERqc9-CQvPaoc
[4/162] Fetching details for ID: ChIJG7HaBMD3zTERd__kj6KsEz8
[5/162] Fetching details for ID: ChIJgZFEus3FzTERY64AF7SkF7s
[6/162] Fetching details for ID: ChIJacj_ZAj3zTERklsXnUNTflg
[7/162] Fetching details for ID: ChIJ4XZDOqb3zTER4q3Mr_tkJ_I
[8/162] Fetching details for ID: ChIJI9nKAfbxzTER9e-Fpvkc0B8
[9/162] Fetching details for ID: ChIJd673u5bnzTERDPtjpfmLtoY
[10/162] Fetching details for ID: ChIJ66_dAS_HzTERCsqE42E13uY
[11/162] Fetching details for ID: ChIJYxoOx09LzDERyRhep7whOUY
[12/162] Fetching details for ID: ChIJe__NkKHgzTER4wDl-fS-ZJ8
[13/162] Fetching details for ID: ChIJM5cwioHdzTERrun_Tkdtn7k
[14/162] Fetching details for ID: ChIJGfoDjCL5zTERwi3bGlv-o7Q
[15/162] Fetching details for ID: ChIJbRYZZgjezTERJmXBxdvrlo8
[16/162] F

## Restaurant

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Negeri Sembilan, Malaysia"
search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/n9_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Negeri Sembilan, Malaysia
  - Page 1 complete. Total IDs: 16
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 34
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 49
Finished search for restaurant.
Starting search for: cafe in Negeri Sembilan, Malaysia
  - Page 1 complete. Total IDs: 60
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 67
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 72
Finished search for cafe.
Starting search for: bar in Negeri Sembilan, Malaysia
  - Page 1 complete. Total IDs: 75
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 76
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 77
Finished search for bar.
Starting search for: food in Negeri Sembilan, Malaysia
  - Page 1 complete. Total IDs: 81
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 84
  Waiting 

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/n9_stage2_restaurant_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/n9_restaurant_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 260
Total IDs already cached: 0
[1/260] Fetching details for ID: ChIJZbNS64j3zTERXpK9qoBaDBQ
[2/260] Fetching details for ID: ChIJy_yZBnTDzTERpuiYNeOiaek
[3/260] Fetching details for ID: ChIJ5eFSFY_dzTEROqYFr1CkE1s
[4/260] Fetching details for ID: ChIJ96VnERLdzTER1yURWH69iP8
[5/260] Fetching details for ID: ChIJWRCc7cDnzTERxDm8dJyidf4
[6/260] Fetching details for ID: ChIJWQiWPFnrzTERb2OpN_Ic5iQ
[7/260] Fetching details for ID: ChIJFaq3qVvdzTERhchhWQRa6UU
[8/260] Fetching details for ID: ChIJs_FW4c3gzTERxf1RR5V0epM
[9/260] Fetching details for ID: ChIJn_WD5ELnzTER9Aw4m-AIkXQ
[10/260] Fetching details for ID: ChIJu130oernzTERZgag2yx1wSc
[11/260] Fetching details for ID: ChIJa3t00FvdzTERXz42MrPE0VA
[12/260] Fetching details for ID: ChIJU71ILZvxzTERFCwytHgbecg
[13/260] Fetching details for ID: ChIJoSitS2THzTERfs78SJV1AWg
[14/260] Fetching details for ID: ChIJ1VTI4Y33zTERMPZ_MkegSRc
[15/260] Fetching details for ID: ChIJ__hT-43nzTER8fZOJgWk0_0
[16/260] F

## Hotel

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Negeri Sembilan, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/n9_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: hotel in Negeri Sembilan, Malaysia
  - Page 1 complete. Total IDs: 13
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 23
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 35
Finished search for hotel.
Starting search for: resort in Negeri Sembilan, Malaysia
  - Page 1 complete. Total IDs: 35
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 38
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 43
Finished search for resort.
Starting search for: airbnb in Negeri Sembilan, Malaysia
  - Page 1 complete. Total IDs: 43
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 44
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 44
Finished search for airbnb.
Starting search for: homestay in Negeri Sembilan, Malaysia
Error in search for 'homestay': ZERO_RESULTS
Finished search for homestay.
Starting search for: guesthouse in Ne

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/n9_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/n9_hotel_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 57
Total IDs already cached: 0
[1/57] Fetching details for ID: ChIJPYJDn0IDzjEROQ-wmKtchJE
[2/57] Fetching details for ID: ChIJGyT3LH3dzTERhj3HfLS-aDg
[3/57] Fetching details for ID: ChIJm3aGrA3xzTER9yDTqX1QOHo
[4/57] Fetching details for ID: ChIJXbH1Ep9pzjERGzfmr0QrlMI
[5/57] Fetching details for ID: ChIJbQMsi37xzTERadcfJ0yWtdY
[6/57] Fetching details for ID: ChIJRY-2MsrezTERyzgZckutE_M
[7/57] Fetching details for ID: ChIJpaFR1ePxzTERZhMd8yuzrcY
[8/57] Fetching details for ID: ChIJ_ea-PYnxzTERP6COm-8PBXs
[9/57] Fetching details for ID: ChIJb56Uq9LnzTERyjAiyEEIYZ0
[10/57] Fetching details for ID: ChIJ97DjdjLZzTERzre6JrfXDDU
[11/57] Fetching details for ID: ChIJD2rjFJ7xzTER2nmdUkGzdZk
[12/57] Fetching details for ID: ChIJZzgBionnzTERZGEfVvhkS4s
[13/57] Fetching details for ID: ChIJI9nKAfbxzTER9e-Fpvkc0B8
[14/57] Fetching details for ID: ChIJQcbPxOpyzjERnQTApMBl37Y
[15/57] Fetching details for ID: ChIJZyWx_IfnzTER1itwrJSTygI
[16/57] Fetching details f

# Port Dickson

## Tourist Attractions

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Port Dickson, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/port_dickson_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: tourist attractions in Port Dickson, Malaysia
  - Page 1 complete. Total IDs: 12
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 17
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 17
Finished search for tourist attractions.
Starting search for: museums in Port Dickson, Malaysia
  - Page 1 complete. Total IDs: 19
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 19
Finished search for museums.
Starting search for: parks in Port Dickson, Malaysia
  - Page 1 complete. Total IDs: 21
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 24
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 26
Finished search for parks.
Starting search for: historical sites in Port Dickson, Malaysia
  - Page 1 complete. Total IDs: 26
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 28
  Waiting 3s for next_page_token...
  - Page 3 comple

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/port_dickson_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/port_dickson_tourist_attractions_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 109
Total IDs already cached: 0
[1/109] Fetching details for ID: ChIJdVIcudrZzTERNXBZu0ghPlk
[2/109] Fetching details for ID: ChIJ_5Behn1TzDER1kQ9SX-7tS4
[3/109] Fetching details for ID: ChIJ_ea-PYnxzTERP6COm-8PBXs
[4/109] Fetching details for ID: ChIJG7HaBMD3zTERd__kj6KsEz8
[5/109] Fetching details for ID: ChIJacj_ZAj3zTERklsXnUNTflg
[6/109] Fetching details for ID: ChIJC6g5nH_xzTERiuNSh_uLEwk
[7/109] Fetching details for ID: ChIJHWIvewW3zTERWbv1f7DoL6g
[8/109] Fetching details for ID: ChIJI9nKAfbxzTER9e-Fpvkc0B8
[9/109] Fetching details for ID: ChIJ4XZDOqb3zTER4q3Mr_tkJ_I
[10/109] Fetching details for ID: ChIJd673u5bnzTERDPtjpfmLtoY
[11/109] Fetching details for ID: ChIJ66_dAS_HzTERCsqE42E13uY
[12/109] Fetching details for ID: ChIJe__NkKHgzTER4wDl-fS-ZJ8
[13/109] Fetching details for ID: ChIJM5cwioHdzTERrun_Tkdtn7k
[14/109] Fetching details for ID: ChIJMe6e8J_3zTERO1AKougsZsw
[15/109] Fetching details for ID: ChIJGfoDjCL5zTERwi3bGlv-o7Q
[16/109] F

## Restaurant

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Port Dickson, Malaysia"
search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/port_dickson_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Port Dickson, Malaysia
  - Page 1 complete. Total IDs: 17
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 32
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 42
Finished search for restaurant.
Starting search for: cafe in Port Dickson, Malaysia
  - Page 1 complete. Total IDs: 45
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 46
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 49
Finished search for cafe.
Starting search for: bar in Port Dickson, Malaysia
  - Page 1 complete. Total IDs: 49
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 55
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 55
Finished search for bar.
Starting search for: food in Port Dickson, Malaysia
  - Page 1 complete. Total IDs: 55
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 56
  Waiting 3s for next_

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/port_dickson_stage2_restaurant_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/port_dickson_restaurant_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 98
Total IDs already cached: 0
[1/98] Fetching details for ID: ChIJZbNS64j3zTERXpK9qoBaDBQ
[2/98] Fetching details for ID: ChIJmV7KWgD3zTER9eg7ORCIIj8
[3/98] Fetching details for ID: ChIJWQiWPFnrzTERb2OpN_Ic5iQ
[4/98] Fetching details for ID: ChIJn_WD5ELnzTER9Aw4m-AIkXQ
[5/98] Fetching details for ID: ChIJ_ea-PYnxzTERP6COm-8PBXs
[6/98] Fetching details for ID: ChIJU71ILZvxzTERFCwytHgbecg
[7/98] Fetching details for ID: ChIJ1VTI4Y33zTERMPZ_MkegSRc
[8/98] Fetching details for ID: ChIJrXNnGejtzTERnAUNyUG0tQQ
[9/98] Fetching details for ID: ChIJq08iNQ_3zTERI_hNrY04Jug
[10/98] Fetching details for ID: ChIJ7dDrzuX3zTERP6oNmpX5h0M
[11/98] Fetching details for ID: ChIJS33shRHxzTER7lm6qlJawPg
[12/98] Fetching details for ID: ChIJT5SmqP_xzTERQ83FLXWpnBU
[13/98] Fetching details for ID: ChIJ7zcyUS_vzTERQcW7pLjjh6o
[14/98] Fetching details for ID: ChIJL3dWfjfnzTERJId4MiDkkQQ
[15/98] Fetching details for ID: ChIJr8eODOPxzTER1y4yk3uJRKE
[16/98] Fetching details f

## Hotel

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Port Dickson, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/port_dickson_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: hotel in Port Dickson, Malaysia
  - Page 1 complete. Total IDs: 13
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 21
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 30
Finished search for hotel.
Starting search for: resort in Port Dickson, Malaysia
  - Page 1 complete. Total IDs: 31
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 33
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 35
Finished search for resort.
Starting search for: airbnb in Port Dickson, Malaysia
  - Page 1 complete. Total IDs: 35
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 35
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 35
Finished search for airbnb.
Starting search for: homestay in Port Dickson, Malaysia
Error in search for 'homestay': ZERO_RESULTS
Finished search for homestay.
Starting search for: guesthouse in Port Dickson, 

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/port_dickson_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/port_dickson_hotel_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 39
Total IDs already cached: 0
[1/39] Fetching details for ID: ChIJm3aGrA3xzTER9yDTqX1QOHo
[2/39] Fetching details for ID: ChIJbQMsi37xzTERadcfJ0yWtdY
[3/39] Fetching details for ID: ChIJma_odZ3xzTERNL1u84IrO8k
[4/39] Fetching details for ID: ChIJpaFR1ePxzTERZhMd8yuzrcY
[5/39] Fetching details for ID: ChIJ_ea-PYnxzTERP6COm-8PBXs
[6/39] Fetching details for ID: ChIJFS03xzr3zTERpyct8m5-SCc
[7/39] Fetching details for ID: ChIJD2rjFJ7xzTER2nmdUkGzdZk
[8/39] Fetching details for ID: ChIJcyZdhznzzTERWZ-FkXdfNiI
[9/39] Fetching details for ID: ChIJI9nKAfbxzTER9e-Fpvkc0B8
[10/39] Fetching details for ID: ChIJH9kKznTxzTEREPWaiRxu08A
[11/39] Fetching details for ID: ChIJe9GAozz3zTERjeUlLVW7F8k
[12/39] Fetching details for ID: ChIJ-TSXjGvxzTER7-imn1Lzd-o
[13/39] Fetching details for ID: ChIJbbeHbgn3zTERMbKzdmVdvrM
[14/39] Fetching details for ID: ChIJPS5CtDH3zTERXociCEWgEkY
[15/39] Fetching details for ID: ChIJbQteVnTxzTERW3YtpQMeB5Y
[16/39] Fetching details f

# Cameron Highlands

## Tourist Attractions

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Cameron Highlands, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/cameron_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Loading existing Place IDs from cache: /content/drive/MyDrive/tourism_data/genting_tourist_attractions_stage1_place_ids.json
Loaded 111 IDs from cache. Skipping search.
Starting Stage 1 Search. Current ID count: 111
Starting search for: tourist attractions in Cameron Highlands, Malaysia
  - Page 1 complete. Total IDs: 127
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 138
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 146
Finished search for tourist attractions.
Starting search for: museums in Cameron Highlands, Malaysia
  - Page 1 complete. Total IDs: 146
Finished search for museums.
Starting search for: parks in Cameron Highlands, Malaysia
  - Page 1 complete. Total IDs: 147
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 149
Finished search for parks.
Starting search for: historical sites in Cameron Highlands, Malaysia
  - Page 1 complete. Total IDs: 150
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/cameron_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/cameron_tourist_attractions_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 185
Total IDs already cached: 0
[1/185] Fetching details for ID: ChIJ3frksplJzDER5USO8zLKNs0
[2/185] Fetching details for ID: ChIJEQAAAAEUzDERZlhQ3bHMCGY
[3/185] Fetching details for ID: ChIJW7yTH2zIyDEReXMJW6PB-DI
[4/185] Fetching details for ID: ChIJ93UrkwAUzDER73qDqGABE8w
[5/185] Fetching details for ID: ChIJP4nhRdYWzDERIBTC12w_cmk
[6/185] Fetching details for ID: ChIJA5Zl7oJXyjERDeHhhah-_EE
[7/185] Fetching details for ID: ChIJz1qi3KNZyjERV7mouGpSdog
[8/185] Fetching details for ID: ChIJAxj0sByLzDERrlwJ1iSwq54
[9/185] Fetching details for ID: ChIJq3UYLkpYyjEROxx6pDrQEaw
[10/185] Fetching details for ID: ChIJ2_bfFyhYyjERoyMRcMR_Suo
[11/185] Fetching details for ID: ChIJ67ktsREVzDERqyfhH8BKfOY
[12/185] Fetching details for ID: ChIJp7mMpJ03zDERQFYGwK_Qnbs
[13/185] Fetching details for ID: ChIJO9cemPUQ2jERvlh8KtwhtAc
[14/185] Fetching details for ID: ChIJi8lunVpZyjER51L_2ucy3E4
[15/185] Fetching details for ID: ChIJq134zyJZyjERWA8uabv785s
[16/185] F

## Restaurant

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Cameron Highlands, Malaysia"
search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/cameron_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Cameron Highlands, Malaysia
  - Page 1 complete. Total IDs: 19
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 33
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 51
Finished search for restaurant.
Starting search for: cafe in Cameron Highlands, Malaysia
  - Page 1 complete. Total IDs: 55
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 58
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 65
Finished search for cafe.
Starting search for: bar in Cameron Highlands, Malaysia
  - Page 1 complete. Total IDs: 66
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 68
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 74
Finished search for bar.
Starting search for: food in Cameron Highlands, Malaysia
  - Page 1 complete. Total IDs: 74
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 77
  

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/cameron_stage2_restaurant_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/cameron_restaurant_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 109
Total IDs already cached: 0
[1/109] Fetching details for ID: ChIJCaTISaNZyjERxtXWDtU9f3o
[2/109] Fetching details for ID: ChIJ2_bfFyhYyjERoyMRcMR_Suo
[3/109] Fetching details for ID: ChIJq3UYLkpYyjEROxx6pDrQEaw
[4/109] Fetching details for ID: ChIJfeIL6c1ZyjER14V3f1JETp4
[5/109] Fetching details for ID: ChIJq134zyJZyjERWA8uabv785s
[6/109] Fetching details for ID: ChIJq6qqqppXyjERB4VSGj3_EaM
[7/109] Fetching details for ID: ChIJk47mgShYyjERRmv4uOHwWMw
[8/109] Fetching details for ID: ChIJRxRZfYFXyjERlmm7nwkDMVQ
[9/109] Fetching details for ID: ChIJu-RDJGDsyjERf46cskdr9S0
[10/109] Fetching details for ID: ChIJPeQoNhntyjERBs-VKPR2Ph4
[11/109] Fetching details for ID: ChIJz2FCHoxXyjERdTbrUsh3hn0
[12/109] Fetching details for ID: ChIJ20W8gqFZyjERxpUMVqPdzGo
[13/109] Fetching details for ID: ChIJ1eu7VwtZyjERkA_QYQYcUFo
[14/109] Fetching details for ID: ChIJifZPaRxXyjEROfSjNkydvIE
[15/109] Fetching details for ID: ChIJZW2egdRXyjERhWVHT45HSHU
[16/109] F

## Hotel

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Cameron Highlands, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/cameron_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: hotel in Cameron Highlands, Malaysia
  - Page 1 complete. Total IDs: 9
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 13
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 13
Finished search for hotel.
Starting search for: resort in Cameron Highlands, Malaysia
  - Page 1 complete. Total IDs: 14
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 14
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 14
Finished search for resort.
Starting search for: airbnb in Cameron Highlands, Malaysia
  - Page 1 complete. Total IDs: 14
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 16
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 18
Finished search for airbnb.
Starting search for: homestay in Cameron Highlands, Malaysia
Error in search for 'homestay': ZERO_RESULTS
Finished search for homestay.
Starting search for: guesthous

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/cameron_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/cameron_hotel_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 21
Total IDs already cached: 0
[1/21] Fetching details for ID: ChIJJ8UrKolXyjERtduqhQs20yc
[2/21] Fetching details for ID: ChIJSY3MuAFZyjERRqVpWo0aJdA
[3/21] Fetching details for ID: ChIJ45FWtxFZyjER5G-IXiS2xxk
[4/21] Fetching details for ID: ChIJq6qqqppXyjERB4VSGj3_EaM
[5/21] Fetching details for ID: ChIJa0im2DZYyjER7F0SVJpPVLw
[6/21] Fetching details for ID: ChIJR7gzx1dWyjERwGAfnVfXfb8
[7/21] Fetching details for ID: ChIJ59uE6ohXyjERn991tfMGcCc
[8/21] Fetching details for ID: ChIJPYlFv4hXyjERWZfPfUq6Ajs
[9/21] Fetching details for ID: ChIJpYkeXYlXyjERnqGrZHwoJeg
[10/21] Fetching details for ID: ChIJK5z2xrVZyjER2suZzemth4I
[11/21] Fetching details for ID: ChIJL8j0FMtZyjER4FJAx4-LDVs
[12/21] Fetching details for ID: ChIJS6X0hy9YyjERKtt6ihEXlB4
[13/21] Fetching details for ID: ChIJm-bFTYZXyjER2WdkWPLkj88
[14/21] Fetching details for ID: ChIJt421YpxXyjER1fOSDc33cmw
[15/21] Fetching details for ID: ChIJY_MEtUpYyjERIvbutRSbuq8
[16/21] Fetching details f

# Pahang

## Tourist Attractions

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Pahang, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/pahang_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: tourist attractions in Pahang, Malaysia
  - Page 1 complete. Total IDs: 9
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 23
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 31
Finished search for tourist attractions.
Starting search for: museums in Pahang, Malaysia
  - Page 1 complete. Total IDs: 35
Finished search for museums.
Starting search for: parks in Pahang, Malaysia
  - Page 1 complete. Total IDs: 41
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 48
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 50
Finished search for parks.
Starting search for: historical sites in Pahang, Malaysia
  - Page 1 complete. Total IDs: 54
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 57
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 58
Finished search for historical sites.
Starting search for: landmarks in Pahang

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/pahang_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/pahang_tourist_attractions_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 175
Total IDs already cached: 0
[1/175] Fetching details for ID: ChIJW7yTH2zIyDEReXMJW6PB-DI
[2/175] Fetching details for ID: ChIJg3PYtmBGzDERdVqsZizorps
[3/175] Fetching details for ID: ChIJ93UrkwAUzDER73qDqGABE8w
[4/175] Fetching details for ID: ChIJA5Zl7oJXyjERDeHhhah-_EE
[5/175] Fetching details for ID: ChIJz1qi3KNZyjERV7mouGpSdog
[6/175] Fetching details for ID: ChIJZ49o_NNJzDER52kKLYt5jk8
[7/175] Fetching details for ID: ChIJM6n6R0WlyDERRBXeQdponTM
[8/175] Fetching details for ID: ChIJq3UYLkpYyjEROxx6pDrQEaw
[9/175] Fetching details for ID: ChIJ0dNo8FClyDERCcvm9THOrrM
[10/175] Fetching details for ID: ChIJy8cSt126yDER1YaoolBRaJ8
[11/175] Fetching details for ID: ChIJi_u9YK-6yDERNdUyj4MidBs
[12/175] Fetching details for ID: ChIJq134zyJZyjERWA8uabv785s
[13/175] Fetching details for ID: ChIJOdJM-ZIbxTERjuIB_Djkt7c
[14/175] Fetching details for ID: ChIJRxRZfYFXyjERlmm7nwkDMVQ
[15/175] Fetching details for ID: ChIJk47mgShYyjERRmv4uOHwWMw
[16/175] F

## Restaurant

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Pahang, Malaysia"
search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/pahang_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Pahang, Malaysia
  - Page 1 complete. Total IDs: 17
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 31
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 37
Finished search for restaurant.
Starting search for: cafe in Pahang, Malaysia
  - Page 1 complete. Total IDs: 43
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 49
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 50
Finished search for cafe.
Starting search for: bar in Pahang, Malaysia
  - Page 1 complete. Total IDs: 51
Finished search for bar.
Starting search for: food in Pahang, Malaysia
  - Page 1 complete. Total IDs: 55
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 61
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 65
Finished search for food.
Starting search for: bistro in Pahang, Malaysia
  - Page 1 complete. Total IDs: 66
  Waitin

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/pahang_stage2_restaurant_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/pahang_restaurant_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 232
Total IDs already cached: 0
[1/232] Fetching details for ID: ChIJVVVVVbG6yDERfDF9FcD5QyI
[2/232] Fetching details for ID: ChIJocwhRnJFzDERw8RKSj_OpW4
[3/232] Fetching details for ID: ChIJaYUrESEWzDERP2aCud2B7Bg
[4/232] Fetching details for ID: ChIJO593_KqxyDER0QHe91YkYJI
[5/232] Fetching details for ID: ChIJY8hBYb66yDER--nexlMl_58
[6/232] Fetching details for ID: ChIJL93u-8-HyzERN-S6DQ65TUw
[7/232] Fetching details for ID: ChIJV09OSZXUyDERd9jnnLYarIU
[8/232] Fetching details for ID: ChIJ-f8Gu5O6yDERWO-xE3qyuuY
[9/232] Fetching details for ID: ChIJxxk9aZy6yDERYAy4nqVjotQ
[10/232] Fetching details for ID: ChIJqWhqqZq6yDERkcKDU6vyBL0
[11/232] Fetching details for ID: ChIJQeA_1oNPzDERFM0ou8cRYL4
[12/232] Fetching details for ID: ChIJkTwSLPS5zjERh7EPOl7UWSY
[13/232] Fetching details for ID: ChIJhTUk1TOFyDERCAV1JZZwUsk
[14/232] Fetching details for ID: ChIJiSdt6oewyDERZz19CXB9EMI
[15/232] Fetching details for ID: ChIJt9EXYiQWzDERb1ppjvv7Y78
[16/232] F

## Hotel

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Pahang, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/pahang_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Loading existing Place IDs from cache: /content/drive/MyDrive/tourism_data/cameron_hotel_stage1_place_ids.json
Loaded 21 IDs from cache. Skipping search.
Starting Stage 1 Search. Current ID count: 21
Starting search for: hotel in Pahang, Malaysia
  - Page 1 complete. Total IDs: 29
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 38
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 46
Finished search for hotel.
Starting search for: resort in Pahang, Malaysia
  - Page 1 complete. Total IDs: 55
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 64
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 72
Finished search for resort.
Starting search for: airbnb in Pahang, Malaysia
  - Page 1 complete. Total IDs: 72
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 72
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 72
Finished search for airbnb.
Starting search for: homestay in Pahang, Ma

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/pahang_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/pahang_hotel_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 91
Total IDs already cached: 0
[1/91] Fetching details for ID: ChIJARooyJy6yDERhTRtQZTo9Do
[2/91] Fetching details for ID: ChIJq6qqqppXyjERB4VSGj3_EaM
[3/91] Fetching details for ID: ChIJWcg_AbAVzDERXtNYufMRoVc
[4/91] Fetching details for ID: ChIJOQy3yIlexTER_B0YMxnMWEw
[5/91] Fetching details for ID: ChIJE1W0MyWFyDERW-OHQxskHm0
[6/91] Fetching details for ID: ChIJ4VONO8CayDEREvgJqEawBOQ
[7/91] Fetching details for ID: ChIJ59uE6ohXyjERn991tfMGcCc
[8/91] Fetching details for ID: ChIJ6-HeNitIzDER6dHUj8EzUfA
[9/91] Fetching details for ID: ChIJa-1X0VG6yDERUlWpiGkv6ZI
[10/91] Fetching details for ID: ChIJL8j0FMtZyjER4FJAx4-LDVs
[11/91] Fetching details for ID: ChIJ0cMqXteiyDERYiS-bQQt5I4
[12/91] Fetching details for ID: ChIJZzxcc1KxyDERaFPli9pd6rg
[13/91] Fetching details for ID: ChIJS6X0hy9YyjERKtt6ihEXlB4
[14/91] Fetching details for ID: ChIJm_k5xwnDyDERQryziitL8QU
[15/91] Fetching details for ID: ChIJRedOlNGYyTERHnqlFX2KwYU
[16/91] Fetching details f

# Perak

## Tourist Attractions

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Perak, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/perak_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: tourist attractions in Perak, Malaysia
  - Page 1 complete. Total IDs: 17
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 28
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 41
Finished search for tourist attractions.
Starting search for: museums in Perak, Malaysia
  - Page 1 complete. Total IDs: 47
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 48
Finished search for museums.
Starting search for: parks in Perak, Malaysia
  - Page 1 complete. Total IDs: 57
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 59
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 66
Finished search for parks.
Starting search for: historical sites in Perak, Malaysia
  - Page 1 complete. Total IDs: 68
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 71
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 72
Finished s

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/perak_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/perak_tourist_attractions_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 185
Total IDs already cached: 0
[1/185] Fetching details for ID: ChIJA5Zl7oJXyjERDeHhhah-_EE
[2/185] Fetching details for ID: ChIJz1qi3KNZyjERV7mouGpSdog
[3/185] Fetching details for ID: ChIJAxj0sByLzDERrlwJ1iSwq54
[4/185] Fetching details for ID: ChIJU5kQBUvpyjERpfSNSvTa05s
[5/185] Fetching details for ID: ChIJLXS42xT7yjERos4Leo07sKw
[6/185] Fetching details for ID: ChIJ6WTax5XiyjER8McbWzN8tNc
[7/185] Fetching details for ID: ChIJBTPVzbAVyzERYbsFaaSouVg
[8/185] Fetching details for ID: ChIJOYBWpGHsyjER9Z1JqBVHHUg
[9/185] Fetching details for ID: ChIJJ41JT0vhyjER6TeoCVWsi6Y
[10/185] Fetching details for ID: ChIJ27oxe8tZyjEReyqcu98Bj7Y
[11/185] Fetching details for ID: ChIJ20W8gqFZyjERxpUMVqPdzGo
[12/185] Fetching details for ID: ChIJZzWuCL1ftTERM3Cc0bO_3Ag
[13/185] Fetching details for ID: ChIJe1kWgkPDSjAR7tWBfFiAYwk
[14/185] Fetching details for ID: ChIJi8PB_q1VtTERsmXM-I4CJik
[15/185] Fetching details for ID: ChIJTyFXAn-UyjERTHEWBgUnFDs
[16/185] F

## Restaurant

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Perak, Malaysia"
search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/perak_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Perak, Malaysia
  - Page 1 complete. Total IDs: 18
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 33
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 48
Finished search for restaurant.
Starting search for: cafe in Perak, Malaysia
  - Page 1 complete. Total IDs: 60
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 70
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 78
Finished search for cafe.
Starting search for: bar in Perak, Malaysia
  - Page 1 complete. Total IDs: 84
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 85
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 89
Finished search for bar.
Starting search for: food in Perak, Malaysia
  - Page 1 complete. Total IDs: 95
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 103
  Waiting 3s for next_page_token...
  - Page 3 co

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/perak_stage2_restaurant_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/perak_restaurant_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 351
Total IDs already cached: 0
[1/351] Fetching details for ID: ChIJiQdXvlPtyjER2BguidEOAEQ
[2/351] Fetching details for ID: ChIJVYyd42TsyjERDs-7vOXoFgw
[3/351] Fetching details for ID: ChIJqc8xcn7syjERAkHcRz0NE1Y
[4/351] Fetching details for ID: ChIJaWjwIwDtyjER1ylzjCOPexo
[5/351] Fetching details for ID: ChIJKVfCian6yjERUCVp-ONcZAY
[6/351] Fetching details for ID: ChIJwU0FulzASjARSHTfbr4Le9M
[7/351] Fetching details for ID: ChIJl8_wyrntyjERVICYVdAqgk8
[8/351] Fetching details for ID: ChIJe5vz57nDSjARK8XK6RWTeSE
[9/351] Fetching details for ID: ChIJEXZR-5jsyjERGPNSMf49S6g
[10/351] Fetching details for ID: ChIJA8WBDubsyjERX84WxHZpCX0
[11/351] Fetching details for ID: ChIJGQu5kVHASjARXjq1pkrI6Ts
[12/351] Fetching details for ID: ChIJx3DznyFKyzER0gS_canWx4E
[13/351] Fetching details for ID: ChIJJ2v3WsrtyjERqo_017O70s0
[14/351] Fetching details for ID: ChIJTahJomLsyjER68fEFiH1-RA
[15/351] Fetching details for ID: ChIJAV3TB0-uyjERLafh8qWEPBk
[16/351] F

## Hotel

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Perak, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/perak_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: hotel in Perak, Malaysia
  - Page 1 complete. Total IDs: 13
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 26
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 34
Finished search for hotel.
Starting search for: resort in Perak, Malaysia
  - Page 1 complete. Total IDs: 43
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 48
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 57
Finished search for resort.
Starting search for: airbnb in Perak, Malaysia
  - Page 1 complete. Total IDs: 57
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 58
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 58
Finished search for airbnb.
Starting search for: homestay in Perak, Malaysia
Error in search for 'homestay': ZERO_RESULTS
Finished search for homestay.
Starting search for: guesthouse in Perak, Malaysia
  - Page 1 complete. Total

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/perak_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/perak_hotel_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 82
Total IDs already cached: 0
[1/82] Fetching details for ID: ChIJV4YEIWXsyjERcHTQPisOXy4
[2/82] Fetching details for ID: ChIJXfYaP2DtyjER1UubTtrMg8E
[3/82] Fetching details for ID: ChIJdStMx2LsyjEREtIJcJtMWh0
[4/82] Fetching details for ID: ChIJ0wYnlX8GyzER-I6899OxqN4
[5/82] Fetching details for ID: ChIJsVRI8dfSNDARJRH6x6j6to4
[6/82] Fetching details for ID: ChIJr5AQqfDVNDARjSWhorUMxaQ
[7/82] Fetching details for ID: ChIJqzcpcFEsyzERCyXjFL952gs
[8/82] Fetching details for ID: ChIJ1TsY4-3TNDARP92FLPTxaIs
[9/82] Fetching details for ID: ChIJQaUDfQmqtTERO0MUaQW1ilc
[10/82] Fetching details for ID: ChIJD7fjEInuyjERrKyoHosvGf8
[11/82] Fetching details for ID: ChIJP_Fas7SvSjARO5nsVRl3RsM
[12/82] Fetching details for ID: ChIJOVWMGSDTNDAR1uXEheMfDKM
[13/82] Fetching details for ID: ChIJKwDVQkLtyjERt_cmG8g7mhA
[14/82] Fetching details for ID: ChIJI5PoYXeuyjERNEwaxvvbJKY
[15/82] Fetching details for ID: ChIJT1mw4e3TNDARzjGgZ3VUUhw
[16/82] Fetching details f

# Terengganu

## Tourist Attractions

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Terengganu, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/terengganu_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: tourist attractions in Terengganu, Malaysia
  - Page 1 complete. Total IDs: 7
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 11
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 19
Finished search for tourist attractions.
Starting search for: museums in Terengganu, Malaysia
  - Page 1 complete. Total IDs: 20
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 25
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 25
Finished search for museums.
Starting search for: parks in Terengganu, Malaysia
  - Page 1 complete. Total IDs: 29
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 32
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 33
Finished search for parks.
Starting search for: historical sites in Terengganu, Malaysia
  - Page 1 complete. Total IDs: 33
  Waiting 3s for next_page_token...
  - Page 2 complete. Total

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/terengganu_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/terengganu_tourist_attractions_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 104
Total IDs already cached: 0
[1/104] Fetching details for ID: ChIJW7yTH2zIyDEReXMJW6PB-DI
[2/104] Fetching details for ID: ChIJaz6h3-y-tzER5DjXUMYjhlI
[3/104] Fetching details for ID: ChIJ7XDzWfdbtjERxk18yFdsQJw
[4/104] Fetching details for ID: ChIJUbbg8CC9tzERbO-EnBSEKuw
[5/104] Fetching details for ID: ChIJOZlLomt9yDEROYibmcR6exc
[6/104] Fetching details for ID: ChIJ0dNo8FClyDERCcvm9THOrrM
[7/104] Fetching details for ID: ChIJn79er9e9tzERMgcpYnw7Z0Q
[8/104] Fetching details for ID: ChIJlzZFfmh8tjEReMiaC_SIDq0
[9/104] Fetching details for ID: ChIJ9_6dH8dJzDERC7k4SHbqU3o
[10/104] Fetching details for ID: ChIJkeUhT1CRtjER61YLTgs4CTI
[11/104] Fetching details for ID: ChIJAZI_P228tzERypW5fMYgExo
[12/104] Fetching details for ID: ChIJjexkO6C8tzER2hNmnE6GHk4
[13/104] Fetching details for ID: ChIJ0fjClCJ-yDER8Yk3ofLpqvg
[14/104] Fetching details for ID: ChIJ6aG3TQSwtjER21bYmRfyKgA
[15/104] Fetching details for ID: ChIJ2R7lHxO8tzERyX-TSBViH4s
[16/104] F

## Restaurant

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Terengganu, Malaysia"
search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/terengganu_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Terengganu, Malaysia
  - Page 1 complete. Total IDs: 16
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 24
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 29
Finished search for restaurant.
Starting search for: cafe in Terengganu, Malaysia
  - Page 1 complete. Total IDs: 32
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 38
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 43
Finished search for cafe.
Starting search for: bar in Terengganu, Malaysia
  - Page 1 complete. Total IDs: 47
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 47
Finished search for bar.
Starting search for: food in Terengganu, Malaysia
  - Page 1 complete. Total IDs: 49
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 52
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 57
Finished search for food.
Star

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/terengganu_stage2_restaurant_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/terengganu_restaurant_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 128
Total IDs already cached: 0
[1/128] Fetching details for ID: ChIJ__jZ9Ni9tzERyhXfR4D2jj8
[2/128] Fetching details for ID: ChIJQZNUm3m-tzERgTlgKEALKfA
[3/128] Fetching details for ID: ChIJu13hBY6-tzERWKA0h2_vk9A
[4/128] Fetching details for ID: ChIJNWTrgQa_tzERPd1b6H4X9uU
[5/128] Fetching details for ID: ChIJheG3uou-tzERUjgnwV0vG0M
[6/128] Fetching details for ID: ChIJt2en_hG9tzERIzwQjmbTaSk
[7/128] Fetching details for ID: ChIJB-EXE4-_tzERp-IzIAWacVc
[8/128] Fetching details for ID: ChIJb1VhSQC9tzER0dc4Ii6tXdY
[9/128] Fetching details for ID: ChIJ_yFvk1RxyDERUseYV8evlm0
[10/128] Fetching details for ID: ChIJ71653H6-tzERqvq5-hqy2g8
[11/128] Fetching details for ID: ChIJ4_x5JYG_tzER8vDSglK6zCk
[12/128] Fetching details for ID: ChIJcwmm4hi8tzER8Zk2yPF7mGc
[13/128] Fetching details for ID: ChIJaXgX1tbrtzERb1s25GUNiTk
[14/128] Fetching details for ID: ChIJFePUTQS9tzERAnyYWQC64CY
[15/128] Fetching details for ID: ChIJERFvOquVtzERhcStp07Mc5I
[16/128] F

## Hotel

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Terengganu, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/terengganu_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: hotel in Terengganu, Malaysia
  - Page 1 complete. Total IDs: 14
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 24
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 29
Finished search for hotel.
Starting search for: resort in Terengganu, Malaysia
  - Page 1 complete. Total IDs: 42
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 55
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 64
Finished search for resort.
Starting search for: airbnb in Terengganu, Malaysia
  - Page 1 complete. Total IDs: 64
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 64
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 64
Finished search for airbnb.
Starting search for: homestay in Terengganu, Malaysia
Error in search for 'homestay': ZERO_RESULTS
Finished search for homestay.
Starting search for: guesthouse in Terengganu, Malaysia
 

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/terengganu_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/terengganu_hotel_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 73
Total IDs already cached: 0
[1/73] Fetching details for ID: ChIJqSS6sPO-tzERax4yQHnF3Ww
[2/73] Fetching details for ID: ChIJ00UyWXm-tzERwNmlDLmyZMk
[3/73] Fetching details for ID: ChIJdViwRXi-tzERlEL7j-WwUi8
[4/73] Fetching details for ID: ChIJf8JD5BfbtzERb-GRXNkI9Hk
[5/73] Fetching details for ID: ChIJz4gyK-GGyDERgJU28s3T1UI
[6/73] Fetching details for ID: ChIJDRJqURa8tzERHpMysb4dih8
[7/73] Fetching details for ID: ChIJO22tROuGyDERslLTK8rwNbM
[8/73] Fetching details for ID: ChIJ55cFy6jgtjERHsWEOzUqrWI
[9/73] Fetching details for ID: ChIJH6QNNN-9tzERTYVZZNkh0vc
[10/73] Fetching details for ID: ChIJY1xrPQnvtjER99I3u7F3Zfg
[11/73] Fetching details for ID: ChIJ6we-oLHgtjERvSYLGGwmgFY
[12/73] Fetching details for ID: ChIJP1u_a3O-tzERSxLXcKENymY
[13/73] Fetching details for ID: ChIJ992wjALetjER-4G1AFn8U9E
[14/73] Fetching details for ID: ChIJHRApB00QtzERLWOkar1BCD8
[15/73] Fetching details for ID: ChIJX1DWd-3htjER7BLwyX2V6aY
[16/73] Fetching details f

# Kedah

## Tourist Attractions

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Kedah, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/kedah_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: tourist attractions in Kedah, Malaysia
  - Page 1 complete. Total IDs: 11
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 18
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 24
Finished search for tourist attractions.
Starting search for: museums in Kedah, Malaysia
  - Page 1 complete. Total IDs: 29
Finished search for museums.
Starting search for: parks in Kedah, Malaysia
  - Page 1 complete. Total IDs: 39
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 40
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 43
Finished search for parks.
Starting search for: historical sites in Kedah, Malaysia
  - Page 1 complete. Total IDs: 43
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 47
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 48
Finished search for historical sites.
Starting search for: landmarks in Kedah, Ma

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/kedah_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/kedah_tourist_attractions_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 142
Total IDs already cached: 0
[1/142] Fetching details for ID: ChIJVVVVlXUmSzARq65UehcIo3c
[2/142] Fetching details for ID: ChIJ-aKjt96HSzARzJkIu36wYoQ
[3/142] Fetching details for ID: ChIJz0_5PcYxSzARpRWT5CPHv8s
[4/142] Fetching details for ID: ChIJSeztzSZFSzAR0s2SRKy4CYU
[5/142] Fetching details for ID: ChIJBZnKhzwnSzARtIgfx76cWn0
[6/142] Fetching details for ID: ChIJ2Rx4voCASzARDUgXoCXXD-o
[7/142] Fetching details for ID: ChIJo8WWQCJ5TDARSGOIfiqZAlU
[8/142] Fetching details for ID: ChIJ0eh8jeR2TDARHMbm07pxat0
[9/142] Fetching details for ID: ChIJZ5hm4FslSzARyVX70660Qpc
[10/142] Fetching details for ID: ChIJLWgHftZZSzARkLb_OwpvSNg
[11/142] Fetching details for ID: ChIJXQ3fPY5wTDARA58oZmoQfms
[12/142] Fetching details for ID: ChIJe1kWgkPDSjAR7tWBfFiAYwk
[13/142] Fetching details for ID: ChIJ36e7SjVYSzARzYYwpAKfgNg
[14/142] Fetching details for ID: ChIJafqpes5-TDARTN-nv29KXrE
[15/142] Fetching details for ID: ChIJfQGqUJd8TDARhI4UPy9SCrY
[16/142] F

## Restaurant

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Kedah, Malaysia"
search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/kedah_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Kedah, Malaysia
  - Page 1 complete. Total IDs: 14
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 21
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 32
Finished search for restaurant.
Starting search for: cafe in Kedah, Malaysia
  - Page 1 complete. Total IDs: 38
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 41
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 48
Finished search for cafe.
Starting search for: bar in Kedah, Malaysia
  - Page 1 complete. Total IDs: 49
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 51
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 52
Finished search for bar.
Starting search for: food in Kedah, Malaysia
  - Page 1 complete. Total IDs: 57
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 67
  Waiting 3s for next_page_token...
  - Page 3 com

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/kedah_stage2_restaurant_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/kedah_restaurant_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 230
Total IDs already cached: 0
[1/230] Fetching details for ID: ChIJee70v-9GSzARtmqFiwB8IrI
[2/230] Fetching details for ID: ChIJa1HvBisoSzAREPvNgQi0Bek
[3/230] Fetching details for ID: ChIJS-wj1VVbSzARHvX3ztUPvgg
[4/230] Fetching details for ID: ChIJR5V6ZpvLSjAR5W2PqWLUDGQ
[5/230] Fetching details for ID: ChIJJaRJd1FFSzARXnDEAvJkFTY
[6/230] Fetching details for ID: ChIJ03FIwTZaSzAR3eupQWsfYLE
[7/230] Fetching details for ID: ChIJ07jvZQQxSzAR4X9xeM9F0lQ
[8/230] Fetching details for ID: ChIJhf0KmoCASzARmwjbMGp_iv0
[9/230] Fetching details for ID: ChIJJQYtN93DSjARfUfNraU9iXw
[10/230] Fetching details for ID: ChIJR5QtTJZESzARcw92_N3rjjI
[11/230] Fetching details for ID: ChIJg0PrESjDSjAR4hrqIFtU5Bs
[12/230] Fetching details for ID: ChIJraCE5CUpSzAR3gZCEqvwrJg
[13/230] Fetching details for ID: ChIJFX3LmJRFSzARW68GTPHWkjg
[14/230] Fetching details for ID: ChIJo7_4_8NESzARjyhVjOqd8io
[15/230] Fetching details for ID: ChIJn5zHmZlFSzARzGeVzyOv3N4
[16/230] F

## Hotel

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Kedah, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/kedah_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: hotel in Kedah, Malaysia
  - Page 1 complete. Total IDs: 13
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 23
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 32
Finished search for hotel.
Starting search for: resort in Kedah, Malaysia
  - Page 1 complete. Total IDs: 37
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 46
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 59
Finished search for resort.
Starting search for: airbnb in Kedah, Malaysia
  - Page 1 complete. Total IDs: 59
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 59
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 59
Finished search for airbnb.
Starting search for: homestay in Kedah, Malaysia
Error in search for 'homestay': ZERO_RESULTS
Finished search for homestay.
Starting search for: guesthouse in Kedah, Malaysia
  - Page 1 complete. Total

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/kedah_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/kedah_hotel_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 74
Total IDs already cached: 0
[1/74] Fetching details for ID: ChIJt0FGgcrXSjARQtOO0KhcO7k
[2/74] Fetching details for ID: ChIJSYKYuMSHSzARGMjMTwf6p5g
[3/74] Fetching details for ID: ChIJBZnKhzwnSzARtIgfx76cWn0
[4/74] Fetching details for ID: ChIJH13Ayax4TDARor6ALrakgjw
[5/74] Fetching details for ID: ChIJH9c0u7lESzAR-GLxcnM9jRc
[6/74] Fetching details for ID: ChIJT7XpZjXWSjARNOPPb7To4gk
[7/74] Fetching details for ID: ChIJDczn9HpESzARJfAuUTbgLSs
[8/74] Fetching details for ID: ChIJk2geTHx_TDARKB5Ng93--SU
[9/74] Fetching details for ID: ChIJta1minh3TDARbXXN55lTbdw
[10/74] Fetching details for ID: ChIJqVvW0bgpSzARyhxvn20E8xs
[11/74] Fetching details for ID: ChIJHXsfzwV4TDARfvc-x9uORXw
[12/74] Fetching details for ID: ChIJf2b36sspSzARqB3eUl93gOA
[13/74] Fetching details for ID: ChIJ53L8HwpbSzARAJCPoAwjQ5E
[14/74] Fetching details for ID: ChIJb-RpRy6GSzARmsjQOhCALfU
[15/74] Fetching details for ID: ChIJwVzl08pZSzARGVt823FkWTI
[16/74] Fetching details f

# Kelantan

## Tourist Attractions

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Kelantan, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/kelantan_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: tourist attractions in Kelantan, Malaysia
  - Page 1 complete. Total IDs: 14
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 25
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 31
Finished search for tourist attractions.
Starting search for: museums in Kelantan, Malaysia
  - Page 1 complete. Total IDs: 32
Finished search for museums.
Starting search for: parks in Kelantan, Malaysia
  - Page 1 complete. Total IDs: 36
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 37
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 39
Finished search for parks.
Starting search for: historical sites in Kelantan, Malaysia
  - Page 1 complete. Total IDs: 41
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 44
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 44
Finished search for historical sites.
Starting search for: landmarks 

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/kelantan_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/kelantan_tourist_attractions_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 119
Total IDs already cached: 0
[1/119] Fetching details for ID: ChIJ7WVPJRcxtDERqyk5GeRyiO4
[2/119] Fetching details for ID: ChIJXTeveuWvtjERVAQzxGipLJQ
[3/119] Fetching details for ID: ChIJn8LPxtq2tjERJPt-NMDvZcE
[4/119] Fetching details for ID: ChIJz1qi3KNZyjERV7mouGpSdog
[5/119] Fetching details for ID: ChIJt571H8-vtjERTPvte-nQKh4
[6/119] Fetching details for ID: ChIJ0dNo8FClyDERCcvm9THOrrM
[7/119] Fetching details for ID: ChIJQxJqABTAtjER0J87zxbM0LM
[8/119] Fetching details for ID: ChIJ8ze8ApaNtjERr9rqTmAryvo
[9/119] Fetching details for ID: ChIJOZlLomt9yDEROYibmcR6exc
[10/119] Fetching details for ID: ChIJg_561emvtjEReNrqDAkDBF8
[11/119] Fetching details for ID: ChIJlzZFfmh8tjEReMiaC_SIDq0
[12/119] Fetching details for ID: ChIJkeUhT1CRtjER61YLTgs4CTI
[13/119] Fetching details for ID: ChIJSwPckAystjERus8_BB4A4Vw
[14/119] Fetching details for ID: ChIJ6aG3TQSwtjER21bYmRfyKgA
[15/119] Fetching details for ID: ChIJY6gfio-RtjERg9LJmwvzdQI
[16/119] F

## Restaurant

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Kelantan, Malaysia"
search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/kelantan_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Kelantan, Malaysia
  - Page 1 complete. Total IDs: 14
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 24
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 30
Finished search for restaurant.
Starting search for: cafe in Kelantan, Malaysia
  - Page 1 complete. Total IDs: 42
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 50
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 57
Finished search for cafe.
Starting search for: bar in Kelantan, Malaysia
  - Page 1 complete. Total IDs: 61
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 68
Finished search for bar.
Starting search for: food in Kelantan, Malaysia
  - Page 1 complete. Total IDs: 68
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 70
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 70
Finished search for food.
Starting sea

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/kelantan_stage2_restaurant_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/kelantan_restaurant_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 133
Total IDs already cached: 0
[1/133] Fetching details for ID: ChIJXQ3NRmywtjERuQTx0oXNSlk
[2/133] Fetching details for ID: ChIJx4VPRtCvtjEROw5m6g645NM
[3/133] Fetching details for ID: ChIJE4U3mnSwtjERX2pMHZ9NMDk
[4/133] Fetching details for ID: ChIJXRja1gOwtjERh6r7eOhwtlU
[5/133] Fetching details for ID: ChIJxa4Q5FultjERRiNuMZzXNiY
[6/133] Fetching details for ID: ChIJtzQbVsSxtjERBwgN6T5CIHE
[7/133] Fetching details for ID: ChIJ-17AeSewtjERK7mxOu0PltI
[8/133] Fetching details for ID: ChIJfc7vrkKvtjERnSzhq5Vc-aM
[9/133] Fetching details for ID: ChIJeQ6BvR8AtjERIAKTBlRQlAY
[10/133] Fetching details for ID: ChIJiRDDucivtjER6yQtJptWMg4
[11/133] Fetching details for ID: ChIJE0cvA72ctjERm1tnPaepzvA
[12/133] Fetching details for ID: ChIJ4aN4OsavtjERzq7hZe66knE
[13/133] Fetching details for ID: ChIJifZPaRxXyjEROfSjNkydvIE
[14/133] Fetching details for ID: ChIJURg2BqK6tjER8TS6t6N7RAk
[15/133] Fetching details for ID: ChIJA8WBDubsyjERX84WxHZpCX0
[16/133] F

## Hotel

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Kelantan, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/kelantan_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: hotel in Kelantan, Malaysia
  - Page 1 complete. Total IDs: 8
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 16
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 19
Finished search for hotel.
Starting search for: resort in Kelantan, Malaysia
  - Page 1 complete. Total IDs: 21
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 24
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 25
Finished search for resort.
Starting search for: airbnb in Kelantan, Malaysia
  - Page 1 complete. Total IDs: 25
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 26
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 26
Finished search for airbnb.
Starting search for: homestay in Kelantan, Malaysia
Error in search for 'homestay': ZERO_RESULTS
Finished search for homestay.
Starting search for: guesthouse in Kelantan, Malaysia
  - Page 1 c

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/kelantan_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/kelantan_hotel_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 32
Total IDs already cached: 0
[1/32] Fetching details for ID: ChIJ3WaZvJXrtjER8biUkWxg_Ec
[2/32] Fetching details for ID: ChIJ7WVPJRcxtDERqyk5GeRyiO4
[3/32] Fetching details for ID: ChIJE4U3mnSwtjERX2pMHZ9NMDk
[4/32] Fetching details for ID: ChIJ_4fyxUuwtjERnSxScwBlOds
[5/32] Fetching details for ID: ChIJldBtrcevtjERDarW3X1gbWQ
[6/32] Fetching details for ID: ChIJQxJqABTAtjER0J87zxbM0LM
[7/32] Fetching details for ID: ChIJIxfWWHMxtDERaYvzZ_v25dU
[8/32] Fetching details for ID: ChIJlS9kYp_rtjERyvXO74rii2g
[9/32] Fetching details for ID: ChIJV1eg_fqltjER18ViHQbLFlU
[10/32] Fetching details for ID: ChIJxbXPuMWvtjERmaVq7WcUGWs
[11/32] Fetching details for ID: ChIJKYbgwN-xtjERUQNzA1jPZpc
[12/32] Fetching details for ID: ChIJ4aN4OsavtjERzq7hZe66knE
[13/32] Fetching details for ID: ChIJmWMFEWm_tjERAe6DqMqhBGA
[14/32] Fetching details for ID: ChIJkdzTqrYGyjERkYlRcZQmNB0
[15/32] Fetching details for ID: ChIJd5ZjK86vtjERJJ8orgXDMvo
[16/32] Fetching details f

# Perlis

## Tourist Attractions

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Perlis, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/perlis_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Loading existing Place IDs from cache: /content/drive/MyDrive/tourism_data/kelantan_tourist_attractions_stage1_place_ids.json
Loaded 119 IDs from cache. Skipping search.
Starting Stage 1 Search. Current ID count: 119
Starting search for: tourist attractions in Perlis, Malaysia
  - Page 1 complete. Total IDs: 130
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 131
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 131
Finished search for tourist attractions.
Starting search for: museums in Perlis, Malaysia
  - Page 1 complete. Total IDs: 132
Finished search for museums.
Starting search for: parks in Perlis, Malaysia
  - Page 1 complete. Total IDs: 134
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 134
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 134
Finished search for parks.
Starting search for: historical sites in Perlis, Malaysia
  - Page 1 complete. Total IDs: 134
  Waiting 3s for next_page_token...
  -

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/perlis_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/perlis_tourist_attractions_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 161
Total IDs already cached: 0
[1/161] Fetching details for ID: ChIJ7WVPJRcxtDERqyk5GeRyiO4
[2/161] Fetching details for ID: ChIJXTeveuWvtjERVAQzxGipLJQ
[3/161] Fetching details for ID: ChIJn8LPxtq2tjERJPt-NMDvZcE
[4/161] Fetching details for ID: ChIJH_v_L1uRTDAR_B_6G__BnrE
[5/161] Fetching details for ID: ChIJz1qi3KNZyjERV7mouGpSdog
[6/161] Fetching details for ID: ChIJ2Rx4voCASzARDUgXoCXXD-o
[7/161] Fetching details for ID: ChIJt571H8-vtjERTPvte-nQKh4
[8/161] Fetching details for ID: ChIJ0dNo8FClyDERCcvm9THOrrM
[9/161] Fetching details for ID: ChIJQxJqABTAtjER0J87zxbM0LM
[10/161] Fetching details for ID: ChIJ8ze8ApaNtjERr9rqTmAryvo
[11/161] Fetching details for ID: ChIJOZlLomt9yDEROYibmcR6exc
[12/161] Fetching details for ID: ChIJBxHYjnB_TDAR6cE0tHah7_s
[13/161] Fetching details for ID: ChIJg_561emvtjEReNrqDAkDBF8
[14/161] Fetching details for ID: ChIJlzZFfmh8tjEReMiaC_SIDq0
[15/161] Fetching details for ID: ChIJkeUhT1CRtjER61YLTgs4CTI
[16/161] F

## Restaurant

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Perlis, Malaysia"
search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/perlis_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Perlis, Malaysia
  - Page 1 complete. Total IDs: 8
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 19
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 29
Finished search for restaurant.
Starting search for: cafe in Perlis, Malaysia
  - Page 1 complete. Total IDs: 34
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 34
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 34
Finished search for cafe.
Starting search for: bar in Perlis, Malaysia
  - Page 1 complete. Total IDs: 34
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 34
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 34
Finished search for bar.
Starting search for: food in Perlis, Malaysia
  - Page 1 complete. Total IDs: 34
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 35
  Waiting 3s for next_page_token...
  - Page 3 

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/perlis_stage2_restaurant_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/perlis_restaurant_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 72
Total IDs already cached: 0
[1/72] Fetching details for ID: ChIJoVqo0urATDARijmqg6IN3h0
[2/72] Fetching details for ID: ChIJw3b_r7eZTDAR_1CFCOFvATw
[3/72] Fetching details for ID: ChIJByetZbCiTDARDez-LO7PWtg
[4/72] Fetching details for ID: ChIJhTsX74ebTDARsA5ug9Kx7dU
[5/72] Fetching details for ID: ChIJoXLx-2uiTDAR3q0OXRWs7Xw
[6/72] Fetching details for ID: ChIJQf46UJxhSzARWtjYtW5iyNU
[7/72] Fetching details for ID: ChIJo8bkzC6aTDAReTlkOn9CdVA
[8/72] Fetching details for ID: ChIJvzCMtaOZTDARLLfkpqv2uAA
[9/72] Fetching details for ID: ChIJHVxGe5mXTDARqxaWFkjqmNQ
[10/72] Fetching details for ID: ChIJF-bQNciZTDARUErnknBfhL8
[11/72] Fetching details for ID: ChIJkfHPor-ZTDARFkESFovQjUc
[12/72] Fetching details for ID: ChIJH_v_L1uRTDAR_B_6G__BnrE
[13/72] Fetching details for ID: ChIJVXZuWISZTDARgxzPRkmkMt0
[14/72] Fetching details for ID: ChIJA6Tg97CZTDARrLf_05XOSZQ
[15/72] Fetching details for ID: ChIJWeiA7p6bTDAR5lwbFQolCZA
[16/72] Fetching details f

## Hotel

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Perlis, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/perlis_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: hotel in Perlis, Malaysia
  - Page 1 complete. Total IDs: 8
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 12
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 15
Finished search for hotel.
Starting search for: resort in Perlis, Malaysia
  - Page 1 complete. Total IDs: 15
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 15
Finished search for resort.
Starting search for: airbnb in Perlis, Malaysia
  - Page 1 complete. Total IDs: 15
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 15
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 15
Finished search for airbnb.
Starting search for: homestay in Perlis, Malaysia
Error in search for 'homestay': ZERO_RESULTS
Finished search for homestay.
Starting search for: guesthouse in Perlis, Malaysia
  - Page 1 complete. Total IDs: 15
Finished search for guesthouse.
Starting search for: motel

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/perlis_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/perlis_hotel_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 15
Total IDs already cached: 0
[1/15] Fetching details for ID: ChIJ5y2xX3icTDAR-oYg2E6UIcI
[2/15] Fetching details for ID: ChIJn2xGbYGbTDAR5uVSWRS6jro
[3/15] Fetching details for ID: ChIJ0Qrk7oObTDARAAIgUaO6-KU
[4/15] Fetching details for ID: ChIJM8DwfDqbTDARGWUOOuxKLq0
[5/15] Fetching details for ID: ChIJIw0GEbuZTDAReNOhXkYG4R4
[6/15] Fetching details for ID: ChIJf_NFGn-fTDAR5SLoHiaGAL8
[7/15] Fetching details for ID: ChIJ-2VZZGOiTDARAIKS6Ty025Q
[8/15] Fetching details for ID: ChIJ_31lo0-YTDARN4aVpMZbQn4
[9/15] Fetching details for ID: ChIJRd2ke6SZTDARJ6-91pCtJlI
[10/15] Fetching details for ID: ChIJuZz1nbGZTDARA9OdEhoSFEM
[11/15] Fetching details for ID: ChIJ6aR_zqmZTDARi_uMv8OdIqg
[12/15] Fetching details for ID: ChIJI4gLxN2VTDARUnFoUT5yjjc
[13/15] Fetching details for ID: ChIJhXDbCX6bTDAR3VfkG6HH8Pg
[14/15] Fetching details for ID: ChIJIdpM6KebTDARa8P5t_ksiGo
[15/15] Fetching details for ID: ChIJ_Xce8vWYTDARNSA0NAKIaJ4

*** FINAL SAVE: 15 remain

# Putrajaya

## Tourist Attractions

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Putrajaya, Malaysia"
search_keywords =  [
    "tourist attractions", "museums", "parks", "historical sites",
    "landmarks", "shopping mall", "temple", "mosque", "church",
    "island", "beach", "theme park", "natural preserve",
    "forest", "mountain", "waterfall", "zoo", "aquarium"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/putrajaya_tourist_attractions_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: tourist attractions in Putrajaya, Malaysia
  - Page 1 complete. Total IDs: 16
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 21
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 24
Finished search for tourist attractions.
Starting search for: museums in Putrajaya, Malaysia
  - Page 1 complete. Total IDs: 27
Finished search for museums.
Starting search for: parks in Putrajaya, Malaysia
  - Page 1 complete. Total IDs: 30
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 32
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 36
Finished search for parks.
Starting search for: historical sites in Putrajaya, Malaysia
  - Page 1 complete. Total IDs: 36
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 39
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 40
Finished search for historical sites.
Starting search for: landma

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/putrajaya_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/putrajaya_tourist_attractions_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 121
Total IDs already cached: 0
[1/121] Fetching details for ID: ChIJs3w_chS2zTERU10F1-BrA3w
[2/121] Fetching details for ID: ChIJW7yTH2zIyDEReXMJW6PB-DI
[3/121] Fetching details for ID: ChIJHWIvewW3zTERWbv1f7DoL6g
[4/121] Fetching details for ID: ChIJ2xqE4T22zTERWLus9fYLJwU
[5/121] Fetching details for ID: ChIJvwv8fxI1zDERRB1idU9EsJE
[6/121] Fetching details for ID: ChIJTYDynzE_zDERbZiCWRohO3M
[7/121] Fetching details for ID: ChIJXQ3fPY5wTDARA58oZmoQfms
[8/121] Fetching details for ID: ChIJse2diXG1zTERFGvnQDQdDpM
[9/121] Fetching details for ID: ChIJidjpUw4zzDERyC4NcpL7Adg
[10/121] Fetching details for ID: ChIJq8YZlTRKzDERW1gNLhIdBpw
[11/121] Fetching details for ID: ChIJL-Qvn9JOzDERBLmeTTrrgf0
[12/121] Fetching details for ID: ChIJa3p9Gun_yjERV5pjRH-Xs5Q
[13/121] Fetching details for ID: ChIJietkC5u3zTERrOGDEn7NC_0
[14/121] Fetching details for ID: ChIJI5vw85W3zTERSCVGvYTT-sY
[15/121] Fetching details for ID: ChIJiXI-WgnKzTERYwHEk3fHJU0
[16/121] F

## Restaurant

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Putrajaya, Malaysia"
search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/putrajaya_restaurants_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Putrajaya, Malaysia
  - Page 1 complete. Total IDs: 16
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 28
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 42
Finished search for restaurant.
Starting search for: cafe in Putrajaya, Malaysia
  - Page 1 complete. Total IDs: 48
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 51
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 52
Finished search for cafe.
Starting search for: bar in Putrajaya, Malaysia
  - Page 1 complete. Total IDs: 62
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 68
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 71
Finished search for bar.
Starting search for: food in Putrajaya, Malaysia
  - Page 1 complete. Total IDs: 71
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 73
  Waiting 3s for next_page_token..

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/putrajaya_stage2_restaurant_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/putrajaya_restaurant_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Total IDs to process (not in cache): 200
Total IDs already cached: 0
[1/200] Fetching details for ID: ChIJy4_3pki2zTER8dNLkzEExVw
[2/200] Fetching details for ID: ChIJU3enXW_JzTERDUWvuTXjq-k
[3/200] Fetching details for ID: ChIJncID1hG2zTERhbffI2UZqrU
[4/200] Fetching details for ID: ChIJa9NcGMRMzDERxIxdhef0g70
[5/200] Fetching details for ID: ChIJda4G-9JJzDERpIrfCT5tfuI
[6/200] Fetching details for ID: ChIJW-N4qUe2zTERJjcwlbZjhsE
[7/200] Fetching details for ID: ChIJU8s2yfLLzTEROmealplgyy4
[8/200] Fetching details for ID: ChIJfbA_c5zLzTERitIcZgPBmyc
[9/200] Fetching details for ID: ChIJ16p1b6TLzTERPhKN4iyXaTA
[10/200] Fetching details for ID: ChIJT1Xlqg7KzTER3IEzp9WPFy8
[11/200] Fetching details for ID: ChIJf-9-L2-2zTERKqjbgPZoFRU
[12/200] Fetching details for ID: ChIJHdefH0G2zTER-7WJgCWxzdA
[13/200] Fetching details for ID: ChIJIc0oNybIzTERcrNPSWgqOeA
[14/200] Fetching details for ID: ChIJBWhDk2W2zTERgiLKfI8tNcM
[15/200] Fetching details for ID: ChIJb0eWhvi2zTER-p6qkkYu_38
[16/200] F

## Hotel

In [None]:
# --- Configuration ---
api_key = API_KEY
target_region = "Putrajaya, Malaysia"
search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
]
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
cache_file = '/content/drive/MyDrive/tourism_data/putrajaya_hotel_stage1_place_ids.json'

# --- Caching and Setup ---
collected_place_ids = set()

if os.path.exists(cache_file):
    print(f"Loading existing Place IDs from cache: {cache_file}")
    try:
        with open(cache_file, 'r') as f:
            collected_place_ids = set(json.load(f))
        print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
        # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
    except json.JSONDecodeError:
        print("Error reading cache file. Starting fresh search.")

print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
# ----------------------------------------------------

# --- Iteration Loop  ---
for keyword in search_keywords:
  current_query = f"{keyword} in {target_region}"
  params = {
      'query': current_query,
      'key': api_key,
      'region': 'my'
  }

  print(f"Starting search for: {current_query}")
  next_page_token = True # Initialize for the while loop

  # Pagination Loop (up to 3 pages)
  page_count = 0
  while next_page_token and page_count < 3:

      # THROTTLING: Delay for pagetoken or between quick calls
      if isinstance(next_page_token, str):
          # Crucial delay for the next_page_token to become valid
          print("  Waiting 3s for next_page_token...")
          time.sleep(3)
          params = {
              'pagetoken': next_page_token,
              'key': api_key
          }

      # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
      else: # For the very first call
          time.sleep(1)

      # Make the API call
      response = requests.get(base_url, params=params).json()

      # Extract results
      if response.get('status') == 'OK':
          for place in response['results']:
              # Filter: only keep popular places with user_ratings_total greater than 300
              if place.get('user_ratings_total', 0) >= 300:
                  collected_place_ids.add(place['place_id'])

          # Check for next page
          next_page_token = response.get('next_page_token', None)
          page_count += 1
          print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

      else:
          print(f"Error in search for '{keyword}': {response.get('status')}")
          next_page_token = None  # Stop pagination

  print(f"Finished search for {keyword}.")
  time.sleep(1) # Short break between main keyword searches

# --- Final Output and Caching ---
print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

# Save the final set of IDs to the cache file
with open(cache_file, 'w') as f:
    json.dump(list(collected_place_ids), f)
print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

Starting Stage 1 Search. Current ID count: 0
Starting search for: hotel in Putrajaya, Malaysia
  - Page 1 complete. Total IDs: 17
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 21
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 21
Finished search for hotel.
Starting search for: resort in Putrajaya, Malaysia
  - Page 1 complete. Total IDs: 22
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 22
Finished search for resort.
Starting search for: airbnb in Putrajaya, Malaysia
  - Page 1 complete. Total IDs: 22
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 23
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 24
Finished search for airbnb.
Starting search for: homestay in Putrajaya, Malaysia
Error in search for 'homestay': ZERO_RESULTS
Finished search for homestay.
Starting search for: guesthouse in Putrajaya, Malaysia
  - Page 1 complete. Total IDs: 24
Finished search for guesthouse.
Starting s

In [None]:
# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = '/content/drive/MyDrive/tourism_data/putrajaya_stage2_hotel_details_cache.json'
csv_output_prefix = '/content/drive/MyDrive/tourism_data/putrajaya_hotel_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

# East Malaysia

## Tourist Attractions

In [None]:
regions =[
    "Semporna, Malaysia", "Sepilok, Malaysia", "Kinabatangan, Malaysia", "Kuching, Sarawak, Malaysia", "Labuan, Malaysia"
]
api_key = API_KEY

for target_region in regions:
  search_keywords =  [
      "tourist attractions", "museums", "parks", "historical sites",
      "landmarks", "shopping mall", "temple", "mosque", "church",
      "island", "beach", "theme park", "natural preserve",
      "forest", "mountain", "waterfall", "zoo", "aquarium"
  ]
  base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
  cache_file = f'/content/drive/MyDrive/tourism_data/{target_region}_tourist_attractions_stage1_place_ids.json'

  # --- Caching and Setup ---
  collected_place_ids = set()

  if os.path.exists(cache_file):
      print(f"Loading existing Place IDs from cache: {cache_file}")
      try:
          with open(cache_file, 'r') as f:
              collected_place_ids = set(json.load(f))
          print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
          # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
      except json.JSONDecodeError:
          print("Error reading cache file. Starting fresh search.")

  print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
  # ----------------------------------------------------

  # --- Iteration Loop  ---
  for keyword in search_keywords:
    current_query = f"{keyword} in {target_region}"
    params = {
        'query': current_query,
        'key': api_key,
        'region': 'my'
    }

    print(f"Starting search for: {current_query}")
    next_page_token = True # Initialize for the while loop

    # Pagination Loop (up to 3 pages)
    page_count = 0
    while next_page_token and page_count < 3:

        # THROTTLING: Delay for pagetoken or between quick calls
        if isinstance(next_page_token, str):
            # Crucial delay for the next_page_token to become valid
            print("  Waiting 3s for next_page_token...")
            time.sleep(3)
            params = {
                'pagetoken': next_page_token,
                'key': api_key
            }

        # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
        else: # For the very first call
            time.sleep(1)

        # Make the API call
        response = requests.get(base_url, params=params).json()

        # Extract results
        if response.get('status') == 'OK':
            for place in response['results']:
                # Filter: only keep popular places with user_ratings_total greater than 300
                if place.get('user_ratings_total', 0) >= 300:
                    collected_place_ids.add(place['place_id'])

            # Check for next page
            next_page_token = response.get('next_page_token', None)
            page_count += 1
            print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

        else:
            print(f"Error in search for '{keyword}': {response.get('status')}")
            next_page_token = None  # Stop pagination

    print(f"Finished search for {keyword}.")
    time.sleep(1) # Short break between main keyword searches

  # --- Final Output and Caching ---
  print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

  # Save the final set of IDs to the cache file
  with open(cache_file, 'w') as f:
      json.dump(list(collected_place_ids), f)
  print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = f'/content/drive/MyDrive/tourism_data/{target_region}_stage2_tourist_attractions_details_cache.json'
csv_output_prefix = f'/content/drive/MyDrive/tourism_data/{target_region}_tourist_attractions_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Starting Stage 1 Search. Current ID count: 0
Starting search for: tourist attractions in Semporna, Malaysia
  - Page 1 complete. Total IDs: 0
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 1
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 3
Finished search for tourist attractions.
Starting search for: museums in Semporna, Malaysia
  - Page 1 complete. Total IDs: 3
Finished search for museums.
Starting search for: parks in Semporna, Malaysia
  - Page 1 complete. Total IDs: 4
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 9
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 13
Finished search for parks.
Starting search for: historical sites in Semporna, Malaysia
  - Page 1 complete. Total IDs: 14
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 14
Finished search for historical sites.
Starting search for: landmarks in Semporna, Malaysia
  - Page 1 complete. Total IDs: 14
  Waiting 3s for nex

## Restaurant

In [None]:
regions =[
    "Semporna, Malaysia", "Sepilok, Malaysia", "Kinabatangan, Malaysia", "Kuching, Sarawak, Malaysia", "Labuan, Malaysia"
]
api_key = API_KEY

for target_region in regions:
  search_keywords =  [
    "restaurant", "cafe", "bar", "food", "bistro","hawker centre",
    "food court", "seafood restaurant", "mamak restaurant",
    "malay restaurant", "indian restaurant", "japanese food",
    "western food","korean food", "chinese restaurant"
  ]
  base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
  cache_file = f'/content/drive/MyDrive/tourism_data/{target_region}_restaurant_stage1_place_ids.json'

  # --- Caching and Setup ---
  collected_place_ids = set()

  if os.path.exists(cache_file):
      print(f"Loading existing Place IDs from cache: {cache_file}")
      try:
          with open(cache_file, 'r') as f:
              collected_place_ids = set(json.load(f))
          print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
          # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
      except json.JSONDecodeError:
          print("Error reading cache file. Starting fresh search.")

  print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
  # ----------------------------------------------------

  # --- Iteration Loop  ---
  for keyword in search_keywords:
    current_query = f"{keyword} in {target_region}"
    params = {
        'query': current_query,
        'key': api_key,
        'region': 'my'
    }

    print(f"Starting search for: {current_query}")
    next_page_token = True # Initialize for the while loop

    # Pagination Loop (up to 3 pages)
    page_count = 0
    while next_page_token and page_count < 3:

        # THROTTLING: Delay for pagetoken or between quick calls
        if isinstance(next_page_token, str):
            # Crucial delay for the next_page_token to become valid
            print("  Waiting 3s for next_page_token...")
            time.sleep(3)
            params = {
                'pagetoken': next_page_token,
                'key': api_key
            }

        # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
        else: # For the very first call
            time.sleep(1)

        # Make the API call
        response = requests.get(base_url, params=params).json()

        # Extract results
        if response.get('status') == 'OK':
            for place in response['results']:
                # Filter: only keep popular places with user_ratings_total greater than 300
                if place.get('user_ratings_total', 0) >= 300:
                    collected_place_ids.add(place['place_id'])

            # Check for next page
            next_page_token = response.get('next_page_token', None)
            page_count += 1
            print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

        else:
            print(f"Error in search for '{keyword}': {response.get('status')}")
            next_page_token = None  # Stop pagination

    print(f"Finished search for {keyword}.")
    time.sleep(1) # Short break between main keyword searches

  # --- Final Output and Caching ---
  print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

  # Save the final set of IDs to the cache file
  with open(cache_file, 'w') as f:
      json.dump(list(collected_place_ids), f)
  print(f"Final Place IDs saved to {cache_file}")

  # End of Stage 1

  # --- Configuration ---
  api_key = API_KEY
  place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
  input_id_file = cache_file
  cache_details_file = f'/content/drive/MyDrive/tourism_data/{target_region}_stage2_restaurant_details_cache.json'
  csv_output_prefix = f'/content/drive/MyDrive/tourism_data/{target_region}_restaurant_place_data'
  batch_size = 200 # Save a new CSV file every 300 rows

  # --- Setup ---
  # 1. Load the list of Place IDs to process
  if not os.path.exists(input_id_file):
      print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
      exit()

  with open(input_id_file, 'r') as f:
      all_place_ids = json.load(f)

  # 2. Load the cache of already processed Place Details
  details_cache = {}
  if os.path.exists(cache_details_file):
      print(f"Loading existing Place Details from cache: {cache_details_file}")
      try:
          with open(cache_details_file, 'r') as f:
              details_cache = json.load(f)
          print(f"Loaded {len(details_cache)} place details from cache.")
      except json.JSONDecodeError:
          print("Error reading details cache. Starting with an empty cache.")

  # Remove IDs already in the cache from the list to process
  ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
  processed_count = len(details_cache)
  total_ids_to_process = len(ids_to_process)

  print(f"Total IDs to process (not in cache): {total_ids_to_process}")
  print(f"Total IDs already cached: {processed_count}")

  # 3. Initialize data list for the current batch
  current_data_batch = []
  csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

  # --- Processing Loop (Place Details) ---
  for i, place_id in enumerate(ids_to_process):

      # THROTTLING: Delay between every Place Details API call
      time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

      current_index = processed_count + i + 1

      print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

      params = {
          'place_id': place_id,
          'key': api_key,
          'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
      }

      try:
          response = requests.get(place_details_url, params=params).json()
      except requests.exceptions.RequestException as e:
          print(f"   Network error for {place_id}: {e}. Skipping.")
          continue # Skip to the next ID on network failure

      status = response.get('status')

      if status == 'OK':

          result = response['result']

          # --- Extract Reviews ---
          reviews = result.get('reviews', [])

          # OPTION 1: Extract only the text of the first review and its rating
          first_review_text = reviews[0]['text'] if reviews else None
          first_review_rating = reviews[0]['rating'] if reviews else None

          # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
          # This keeps all the data, including author name, time, and text for all 5 reviews.
          reviews_json = json.dumps(reviews)


          # --- Extract Operating Hours (Weekday Text) ---
          opening_hours = result.get('opening_hours', {}).get('weekday_text')
          opening_hours_text = "\n".join(opening_hours) if opening_hours else None


          row = {
              'place_id': place_id,
              'name': result.get('name'),
              'address': result.get('formatted_address'),
              'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
              'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
              'types': result.get('types'),
              'googleMapsUri': result.get('url'),
              'priceRange': result.get('price_level'),
              'rating': result.get('rating'),
              'userRatingCount': result.get('user_ratings_total'),
              'operating_hours': opening_hours_text,
              'reviews_data': reviews_json
          }

          current_data_batch.append(row)
          details_cache[place_id] = row

          # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

          # PERIODIC SAVE: Check if the batch size is reached
          if len(current_data_batch) >= batch_size:

              # Save the current batch to CSV
              df = pd.DataFrame(current_data_batch)
              df['extract_date'] = datetime.now().date()
              df['region'] = target_region
              csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
              df.to_csv(csv_filename, index=False, encoding='utf-8')
              print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

              # Reset for the next batch
              current_data_batch = []
              csv_file_index += 1

              # Save the updated cache file
              with open(cache_details_file, 'w') as f:
                  json.dump(details_cache, f, indent=4)
              print(f"Updated cache saved to {cache_details_file}.")


      elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
          # ERROR HANDLING: Implement simple backoff for critical errors
          print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
          time.sleep(60)
          continue

      else:
          print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
          details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

  # --- Final Cleanup and Save ---

  # Save any remaining data in the batch
  if current_data_batch:
      df = pd.DataFrame(current_data_batch)
      df['extract_date'] = datetime.now().date()
      df['region'] = target_region
      csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
      df.to_csv(csv_filename, index=False, encoding='utf-8')
      print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

  # Final cache save
  with open(cache_details_file, 'w') as f:
      json.dump(details_cache, f, indent=4)
  print(f"Final cache saved to {cache_details_file}.")

  print("\nProcessing complete.")

Starting Stage 1 Search. Current ID count: 0
Starting search for: restaurant in Semporna, Malaysia
  - Page 1 complete. Total IDs: 5
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 5
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 7
Finished search for restaurant.
Starting search for: cafe in Semporna, Malaysia
  - Page 1 complete. Total IDs: 8
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 8
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 8
Finished search for cafe.
Starting search for: bar in Semporna, Malaysia
  - Page 1 complete. Total IDs: 8
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 10
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 10
Finished search for bar.
Starting search for: food in Semporna, Malaysia
  - Page 1 complete. Total IDs: 10
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 10
  Waiting 3s for next_page_token...
  - Page 

## Hotel

In [None]:
regions =[
    "Semporna, Malaysia", "Sepilok, Malaysia", "Kinabatangan, Malaysia", "Kuching, Sarawak, Malaysia", "Labuan, Malaysia"
]
api_key = API_KEY

for target_region in regions:
  search_keywords =    [
    "hotel", "resort", "airbnb", "homestay", "guesthouse",
    "motel", "hostel", "boutique hotel"
  ]
  base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
  cache_file = f'/content/drive/MyDrive/tourism_data/{target_region}_hotel_stage1_place_ids.json'

  # --- Caching and Setup ---
  collected_place_ids = set()

  if os.path.exists(cache_file):
      print(f"Loading existing Place IDs from cache: {cache_file}")
      try:
          with open(cache_file, 'r') as f:
              collected_place_ids = set(json.load(f))
          print(f"Loaded {len(collected_place_ids)} IDs from cache. Skipping search.")
          # If cache exists, we skip the search for simplicity, otherwise, remove this 'exit' to re-run the search and append new IDs.
      except json.JSONDecodeError:
          print("Error reading cache file. Starting fresh search.")

  print(f"Starting Stage 1 Search. Current ID count: {len(collected_place_ids)}")
  # ----------------------------------------------------

  # --- Iteration Loop  ---
  for keyword in search_keywords:
    current_query = f"{keyword} in {target_region}"
    params = {
        'query': current_query,
        'key': api_key,
        'region': 'my'
    }

    print(f"Starting search for: {current_query}")
    next_page_token = True # Initialize for the while loop

    # Pagination Loop (up to 3 pages)
    page_count = 0
    while next_page_token and page_count < 3:

        # THROTTLING: Delay for pagetoken or between quick calls
        if isinstance(next_page_token, str):
            # Crucial delay for the next_page_token to become valid
            print("  Waiting 3s for next_page_token...")
            time.sleep(3)
            params = {
                'pagetoken': next_page_token,
                'key': api_key
            }

        # THROTTLING: Delay between immediate API calls (initial search or page 1/2)
        else: # For the very first call
            time.sleep(1)

        # Make the API call
        response = requests.get(base_url, params=params).json()

        # Extract results
        if response.get('status') == 'OK':
            for place in response['results']:
                # Filter: only keep popular places with user_ratings_total greater than 300
                if place.get('user_ratings_total', 0) >= 300:
                    collected_place_ids.add(place['place_id'])

            # Check for next page
            next_page_token = response.get('next_page_token', None)
            page_count += 1
            print(f"  - Page {page_count} complete. Total IDs: {len(collected_place_ids)}")

        else:
            print(f"Error in search for '{keyword}': {response.get('status')}")
            next_page_token = None  # Stop pagination

    print(f"Finished search for {keyword}.")
    time.sleep(1) # Short break between main keyword searches

  # --- Final Output and Caching ---
  print(f"\nStage 1 {target_region} complete. Total unique Place IDs collected: {len(collected_place_ids)}")

  # Save the final set of IDs to the cache file
  with open(cache_file, 'w') as f:
      json.dump(list(collected_place_ids), f)
  print(f"Final Place IDs saved to {cache_file}")

# End of Stage 1

# --- Configuration ---
api_key = API_KEY
place_details_url = "https://maps.googleapis.com/maps/api/place/details/json"
input_id_file = cache_file
cache_details_file = f'/content/drive/MyDrive/tourism_data/{target_region}_stage2_hotel_details_cache.json'
csv_output_prefix = f'/content/drive/MyDrive/tourism_data/{target_region}_hotel_place_data'
batch_size = 200 # Save a new CSV file every 300 rows

# --- Setup ---
# 1. Load the list of Place IDs to process
if not os.path.exists(input_id_file):
    print(f"Error: Input file '{input_id_file}' not found. Run Stage 1 first.")
    exit()

with open(input_id_file, 'r') as f:
    all_place_ids = json.load(f)

# 2. Load the cache of already processed Place Details
details_cache = {}
if os.path.exists(cache_details_file):
    print(f"Loading existing Place Details from cache: {cache_details_file}")
    try:
        with open(cache_details_file, 'r') as f:
            details_cache = json.load(f)
        print(f"Loaded {len(details_cache)} place details from cache.")
    except json.JSONDecodeError:
        print("Error reading details cache. Starting with an empty cache.")

# Remove IDs already in the cache from the list to process
ids_to_process = [pid for pid in all_place_ids if pid not in details_cache]
processed_count = len(details_cache)
total_ids_to_process = len(ids_to_process)

print(f"Total IDs to process (not in cache): {total_ids_to_process}")
print(f"Total IDs already cached: {processed_count}")

# 3. Initialize data list for the current batch
current_data_batch = []
csv_file_index = (processed_count // batch_size) + 1 # Start next CSV file index

# --- Processing Loop (Place Details) ---
for i, place_id in enumerate(ids_to_process):

    # THROTTLING: Delay between every Place Details API call
    time.sleep(1.2) # A safe delay to stay well under 60 QPM limit

    current_index = processed_count + i + 1

    print(f"[{current_index}/{len(all_place_ids)}] Fetching details for ID: {place_id}")

    params = {
        'place_id': place_id,
        'key': api_key,
        'fields': 'name,formatted_address,geometry,types,url,price_level,rating,user_ratings_total,reviews,opening_hours'
    }

    try:
        response = requests.get(place_details_url, params=params).json()
    except requests.exceptions.RequestException as e:
        print(f"   Network error for {place_id}: {e}. Skipping.")
        continue # Skip to the next ID on network failure

    status = response.get('status')

    if status == 'OK':

        result = response['result']

        # --- Extract Reviews ---
        reviews = result.get('reviews', [])

        # OPTION 1: Extract only the text of the first review and its rating
        first_review_text = reviews[0]['text'] if reviews else None
        first_review_rating = reviews[0]['rating'] if reviews else None

        # OPTION 2 (RECOMMENDED for raw data): Save the whole list of reviews as a JSON string
        # This keeps all the data, including author name, time, and text for all 5 reviews.
        reviews_json = json.dumps(reviews)


        # --- Extract Operating Hours (Weekday Text) ---
        opening_hours = result.get('opening_hours', {}).get('weekday_text')
        opening_hours_text = "\n".join(opening_hours) if opening_hours else None


        row = {
            'place_id': place_id,
            'name': result.get('name'),
            'address': result.get('formatted_address'),
            'latitude': result.get('geometry', {}).get('location', {}).get('lat'),
            'longitude': result.get('geometry', {}).get('location', {}).get('lng'),
            'types': result.get('types'),
            'googleMapsUri': result.get('url'),
            'priceRange': result.get('price_level'),
            'rating': result.get('rating'),
            'userRatingCount': result.get('user_ratings_total'),
            'operating_hours': opening_hours_text,
            'reviews_data': reviews_json
        }

        current_data_batch.append(row)
        details_cache[place_id] = row

        # NOTE: Removed the duplicate 'current_data_batch.append(row)' and 'details_cache[place_id] = row' lines

        # PERIODIC SAVE: Check if the batch size is reached
        if len(current_data_batch) >= batch_size:

            # Save the current batch to CSV
            df = pd.DataFrame(current_data_batch)
            df['extract_date'] = datetime.now().date()
            df['region'] = target_region
            csv_filename = f'{csv_output_prefix}_{csv_file_index}.csv'
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"*** SAVED {len(df)} rows to {csv_filename} ***")

            # Reset for the next batch
            current_data_batch = []
            csv_file_index += 1

            # Save the updated cache file
            with open(cache_details_file, 'w') as f:
                json.dump(details_cache, f, indent=4)
            print(f"Updated cache saved to {cache_details_file}.")


    elif status == 'OVER_QUERY_LIMIT' or status == 'UNKNOWN_ERROR':
        # ERROR HANDLING: Implement simple backoff for critical errors
        print(f"Critical Error: {status}. Waiting 60 seconds and restarting loop.")
        time.sleep(60)
        continue

    else:
        print(f"   - No result or error ({status}) for ID: {place_id}. Skipping.")
        details_cache[place_id] = {'status': status} # Cache the failure to avoid re-call

# --- Final Cleanup and Save ---

# Save any remaining data in the batch
if current_data_batch:
    df = pd.DataFrame(current_data_batch)
    df['extract_date'] = datetime.now().date()
    df['region'] = target_region
    csv_filename = f'{csv_output_prefix}_final_{csv_file_index}.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n*** FINAL SAVE: {len(df)} remaining rows to {csv_filename} ***")

# Final cache save
with open(cache_details_file, 'w') as f:
    json.dump(details_cache, f, indent=4)
print(f"Final cache saved to {cache_details_file}.")

print("\nProcessing complete.")

Starting Stage 1 Search. Current ID count: 0
Starting search for: hotel in Semporna, Malaysia
  - Page 1 complete. Total IDs: 2
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 4
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 5
Finished search for hotel.
Starting search for: resort in Semporna, Malaysia
  - Page 1 complete. Total IDs: 6
  Waiting 3s for next_page_token...
  - Page 2 complete. Total IDs: 6
  Waiting 3s for next_page_token...
  - Page 3 complete. Total IDs: 6
Finished search for resort.
Starting search for: airbnb in Semporna, Malaysia
  - Page 1 complete. Total IDs: 6
Finished search for airbnb.
Starting search for: homestay in Semporna, Malaysia
Error in search for 'homestay': ZERO_RESULTS
Finished search for homestay.
Starting search for: guesthouse in Semporna, Malaysia
  - Page 1 complete. Total IDs: 6
Finished search for guesthouse.
Starting search for: motel in Semporna, Malaysia
  - Page 1 complete. Total IDs: 6
  Waiting 3