<a href="https://colab.research.google.com/github/Lu8na/IT362/blob/main/Logbook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import necessary libraries
import json
import os
import random
import time
from datetime import datetime

import requests

# SETTINGS - Configuration variables for the script

# Google Places API key. The GOOGLE_PLACES_API_KEY environment variable takes
# precedence so the key can be supplied without editing this file; the literal
# below is only kept as a fallback so existing runs keep working.
# NOTE(review): a key committed to source control should be treated as leaked -
# rotate it and remove the fallback once the environment variable is set up.
API_KEY = os.environ.get(
    "GOOGLE_PLACES_API_KEY",
    "AIzaSyBFu74b6afMqfo5t8koIMwgenEIDBfVVtc",
)

# API endpoints for Google Places
NEARBY_URL  = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
DETAILS_URL = "https://maps.googleapis.com/maps/api/place/details/json"

# Collection targets and parameters
TARGET_PLACES = 700  # Stop collecting once this many unique place IDs are held
RADIUS = 4000  # Search radius in meters around each city coordinate
PLACE_TYPES = ["beauty_salon", "spa"]  # Values sent as the "type" search parameter

# Search centres as (latitude, longitude) pairs, one per UK city
UK_CITIES = [
    (51.5074, -0.1278),   # London
    (53.4808, -2.2426),   # Manchester
    (52.4862, -1.8904),   # Birmingham
    (55.8642, -4.2518),   # Glasgow
    (53.8008, -1.5491),   # Leeds
    (51.4545, -2.5879),   # Bristol
    (50.8225, -0.1372),   # Brighton
    (52.9548, -1.1581),   # Nottingham
    (51.4816, -3.1791),   # Cardiff
    (54.9783, -1.6178),   # Newcastle
]

# OUTPUT FILES - File paths for storing data
RAW_SEARCH_FILE   = "raw_search_responses.jsonl"  # One JSON line per first-page search response
UNSTRUCTURED_FILE = "unstructured_reviews.jsonl"  # One JSON line per extracted review

# Helper function to make API calls with retry logic
def get_json(url, params, max_attempts=5):
    """GET ``url`` with ``params`` and return the decoded JSON body.

    A request is retried with exponential backoff plus jitter when:

    * the body reports ``"status": "OVER_QUERY_LIMIT"``,
    * the HTTP call itself fails (``requests.RequestException``), or
    * the body is not valid JSON (``ValueError``).

    Args:
        url: Endpoint to call.
        params: Query-string parameters passed to ``requests.get``.
        max_attempts: Maximum number of requests to make; expected to be >= 1.

    Returns:
        The decoded JSON body. If every attempt was rate limited, the last
        ``OVER_QUERY_LIMIT`` body is returned so the caller can inspect its
        status. An empty dict is returned if ``max_attempts`` is less than 1.

    Raises:
        requests.RequestException: The final attempt failed at the HTTP level.
        ValueError: The final attempt returned a body that is not valid JSON.
    """
    data = {}
    for attempt in range(max_attempts):
        is_last_attempt = attempt == max_attempts - 1
        try:
            r = requests.get(url, params=params, timeout=30)
            data = r.json()
        except (requests.RequestException, ValueError):
            # Transient network or decoding failure: retry, but surface the
            # error once the attempts are exhausted.
            if is_last_attempt:
                raise
        else:
            if data.get("status") != "OVER_QUERY_LIMIT":
                return data
            if is_last_attempt:
                break  # No point sleeping when there is no further attempt
        time.sleep(2**attempt + random.random())  # Exponential backoff with jitter
    return data


# MAIN EXECUTION STARTS HERE

seen = set()  # Track seen place IDs to avoid duplicates
place_ids = []  # Store collected place IDs, in discovery order

# PHASE 1: COLLECT PLACE IDs FROM NEARBY SEARCH
# For every (city, place type) pair, run a Nearby Search, follow up to three
# result pages, and stop everything once TARGET_PLACES unique IDs are held.
with open(RAW_SEARCH_FILE, "w", encoding="utf-8") as raw_f:
    # Iterate through all UK cities
    for lat, lng in UK_CITIES:
        # Iterate through each place type (beauty_salon, spa)
        for t in PLACE_TYPES:

            # Make API call to find places nearby
            data = get_json(NEARBY_URL, {
                "location": f"{lat},{lng}",
                "radius": RADIUS,
                "type": t,
                "key": API_KEY
            })

            # Save raw first-page response to file for debugging/backup
            raw_f.write(json.dumps({
                "type": t,
                "lat": lat,
                "lng": lng,
                "response": data,
                "collected_at": datetime.now().isoformat()
            }, ensure_ascii=False) + "\n")

            # Skip this search if the API returned an error, but say why:
            # a silent skip leaves "Places collected: 0" with no explanation
            # (e.g. REQUEST_DENIED for an invalid or restricted key).
            status = data.get("status")
            if status not in ("OK", "ZERO_RESULTS"):
                message = data.get("error_message", "no error message")
                print(f"Nearby search failed for type={t} at {lat},{lng}: {status} - {message}")
                continue

            # Process paginated results (up to 3 pages)
            page = 1
            while True:
                # Extract place IDs from results
                for p in data.get("results", []):
                    pid = p.get("place_id")
                    if pid and pid not in seen:  # Avoid duplicates
                        seen.add(pid)
                        place_ids.append(pid)
                        if len(place_ids) >= TARGET_PLACES:
                            break

                # Stop if we reached target number of places
                if len(place_ids) >= TARGET_PLACES:
                    break

                # Check for next page of results
                token = data.get("next_page_token")
                if not token or page >= 3:  # Limit to 3 pages
                    break

                # A next_page_token only becomes valid a short while after it
                # is issued; until then the API answers INVALID_REQUEST. Wait,
                # then retry a couple of times before giving up on this page.
                page_params = {"pagetoken": token, "key": API_KEY}
                time.sleep(2)
                data = get_json(NEARBY_URL, page_params)
                token_retries = 0
                while data.get("status") == "INVALID_REQUEST" and token_retries < 2:
                    time.sleep(2)
                    data = get_json(NEARBY_URL, page_params)
                    token_retries += 1

                page_status = data.get("status")
                if page_status not in ("OK", "ZERO_RESULTS"):
                    message = data.get("error_message", "no error message")
                    print(f"Nearby search page {page + 1} failed for type={t} at {lat},{lng}: {page_status} - {message}")
                    break
                page += 1

            # Break loops if target reached
            if len(place_ids) >= TARGET_PLACES:
                break
        if len(place_ids) >= TARGET_PLACES:
            break

print("Places collected:", len(place_ids))

# PHASE 2: FETCH REVIEWS FOR EACH PLACE
# One Place Details request per collected ID; every review that has text is
# appended to UNSTRUCTURED_FILE as a single JSON line.
reviews_count = 0

with open(UNSTRUCTURED_FILE, "w", encoding="utf-8") as out_f:
    for pid in place_ids:
        # Ask only for the "reviews" field of the place
        details_params = {"place_id": pid, "fields": "reviews", "key": API_KEY}
        details = get_json(DETAILS_URL, details_params)

        # A non-OK answer contributes no reviews
        if details.get("status") == "OK":
            place_reviews = details.get("result", {}).get("reviews", []) or []
        else:
            place_reviews = []

        for review in place_reviews:
            text = review.get("text")
            if text:  # Reviews without text are skipped
                record = {
                    "place_id": pid,
                    "review_text": text,
                    "rating": review.get("rating"),
                    "time": review.get("time"),
                    "relative_time_description": review.get("relative_time_description"),
                    "language": review.get("language"),
                }
                out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
                reviews_count += 1

        # Pause between Details requests, whatever the outcome of this one
        time.sleep(0.2)

print("Total reviews collected:", reviews_count)

Places collected: 0
Total reviews collected: 0


In [3]:
# Import necessary libraries for JSON handling and data manipulation
import json  # For parsing JSON data from the file
import pandas as pd  # For creating and manipulating DataFrames

# Columns of the output CSV, in order. Passing them explicitly to the
# DataFrame guarantees a header row even when the JSONL file holds no reviews
# (an empty list of rows would otherwise produce a column-less CSV).
REVIEW_COLUMNS = [
    "place_id",  # Identifier of the place the review belongs to
    "review_text",  # The review content
    "rating",  # Rating value stored with the review
    "time",  # "time" value stored with the review
    "relative_time_description",  # Human-readable age of the review
    "language",  # Language value stored with the review
]

# One dictionary per review; each becomes a DataFrame row
rows = []

# Open the JSONL file (JSON Lines format - each line is a separate JSON object)
with open("unstructured_reviews.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:  # Tolerate blank lines instead of crashing in json.loads
            continue

        obj = json.loads(line)

        # Keep only the known columns; missing keys become None
        rows.append({column: obj.get(column) for column in REVIEW_COLUMNS})

# Convert the list of dictionaries into a pandas DataFrame
df = pd.DataFrame(rows, columns=REVIEW_COLUMNS)

# Export the DataFrame to a CSV file for easy viewing and analysis
df.to_csv(
    "unstructured_reviews_readable.csv",  # Output filename
    index=False,  # Don't include the automatic row index in the CSV
    encoding="utf-8-sig"  # UTF-8 with BOM for better Excel compatibility
)