In [19]:
import os
import time

import requests
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *

In [20]:
# Build (or reuse) the Spark session that runs every job in this notebook.
def _build_spark_session():
    """Return a SparkSession configured with this notebook's settings."""
    session_settings = {
        "spark.sql.repl.eagerEval.enabled": True,
        "spark.sql.parquet.cacheMetadata": "true",
        "spark.sql.session.timeZone": "Etc/UTC",
        'spark.driver.memory': '4g',
        'spark.executor.memory': '2g',
    }
    builder = SparkSession.builder.appName("MAST30034 Project 2")
    for setting_name, setting_value in session_settings.items():
        builder = builder.config(setting_name, setting_value)
    return builder.getOrCreate()


spark = _build_spark_session()

In [21]:
from pyspark.sql.types import StructType, StructField, StringType

# Schema for the scraped place records: four nullable string columns.
# Field names are lowercase so that they agree with the schema used when the
# DataFrame of results is actually built further down the notebook.
schema = StructType([
    StructField("name", StringType(), True),
    StructField("address", StringType(), True),
    StructField("postcode", StringType(), True),
    StructField("rating", StringType(), True),
])


In [22]:
# Google Places API key.
# The key is a secret, so it is read from the environment instead of being
# stored in the notebook (where it would end up in version control and in
# shared copies of the file).
API_KEY = os.environ.get('GOOGLE_PLACES_API_KEY')
if not API_KEY:
    raise RuntimeError(
        "Set the GOOGLE_PLACES_API_KEY environment variable before running this notebook."
    )

# Base URL for the Google Places text-search endpoint
url = 'https://maps.googleapis.com/maps/api/place/textsearch/json'

# Load the postcode table from Parquet into a Spark DataFrame
postcodes_sdf = spark.read.parquet("../data/postcodes/postcodes.parquet")

In [23]:
# Preview the loaded postcode table (the bare expression is rendered as the cell output).
postcodes_sdf

postcode,locality,long,lat
3000,MELBOURNE,144.9825846,-37.81443733
3001,MELBOURNE,144.9825846,-37.81443733
3002,EAST MELBOURNE,144.9825846,-37.81443733
3003,WEST MELBOURNE,144.949592,-37.810871
3004,MELBOURNE,144.9825846,-37.81443733
3004,ST KILDA ROAD CEN...,144.970161,-37.844246
3004,ST KILDA ROAD MEL...,144.976,-37.8368
3005,WORLD TRADE CENTRE,144.950858,-37.824608
3006,SOUTH WHARF,144.9520744,-37.82528675
3006,SOUTHBANK,144.965926,-37.823258


In [24]:
# Keep a single row per postcode value.
postcodes_sdf = (
    postcodes_sdf
    .drop_duplicates(['postcode'])
)

In [25]:
# Schema of the scraped place records; every column is a nullable string.
schema = StructType([
    StructField("name", StringType(), True),
    StructField("address", StringType(), True),
    StructField("postcode", StringType(), True),
    StructField("rating", StringType(), True)
])

# Search term; it is also used as the name of the output Parquet file.
variable = 'Groceries'

REQUEST_TIMEOUT_SECONDS = 10   # give up on a request that hangs instead of blocking forever
REQUEST_DELAY_SECONDS = 1      # pause between requests; adjust based on API usage limits
# The Places API answers HTTP 200 even when it rejects a request and reports
# the real outcome in the JSON "status" field; only these values are successes.
OK_STATUSES = ('OK', 'ZERO_RESULTS')


def _place_to_row(place, postcode):
    """Convert one place dict from the API response into a (name, address, postcode, rating) tuple.

    The rating is converted to text explicitly because the target column is a
    string column and the API supplies it as a JSON number; 'N/A' is used when
    the place has no rating key.
    """
    rating = place.get('rating', 'N/A')
    if rating is not None:
        rating = str(rating)
    return (place.get('name'), place.get('formatted_address'), postcode, rating)


def _fetch_places(search_term, postcode):
    """Run one text search for `search_term` in `postcode`.

    Returns the list of place dicts from the first page of results, or None
    when the request failed (network error, non-200 response, unreadable body,
    or an API-level status other than OK / ZERO_RESULTS). Failures are printed
    and never raised, so one bad postcode cannot abort the whole run.
    """
    params = {
        'query': f'{search_term} in {postcode}, Victoria, Australia',
        'key': API_KEY,
        # NOTE(review): the search term is passed as the Places "type" filter;
        # confirm it is one of the API's supported place types.
        'type': search_term,
        'region': 'AU'
    }

    try:
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as e:
        print(f"{search_term}: Error fetching data for postcode {postcode}: {e}")
        return None

    if response.status_code != 200:
        print(f"{search_term}: Error fetching data for postcode {postcode}: {response.status_code}, {response.text}")
        return None

    try:
        payload = response.json()
    except ValueError as e:
        print(f"{search_term}: Error fetching data for postcode {postcode}: invalid JSON ({e})")
        return None

    api_status = payload.get('status')
    if api_status not in OK_STATUSES:
        print(
            f"{search_term}: Error fetching data for postcode {postcode}: "
            f"{api_status}, {payload.get('error_message', '')}"
        )
        return None

    # Only the first page is used; any next_page_token in the payload is not followed.
    return payload.get('results', [])


# Collect every row in plain Python first and build the DataFrame once at the
# end: calling union() inside the loop produces an ever-deeper query plan.
row_list = []

# Only the postcode column is needed on the driver.
for row in postcodes_sdf.select('postcode').collect():
    postcode = row['postcode']
    print(f'Searching for {variable} in {postcode}')

    results = _fetch_places(variable, postcode)
    if results:
        row_list.extend(_place_to_row(place, postcode) for place in results)

    # Pause after every request (successful or not) to stay under API rate limits.
    time.sleep(REQUEST_DELAY_SECONDS)

variable_metadata = spark.createDataFrame(row_list, schema)

# Write the final DataFrame to a Parquet file
try:
    variable_metadata.write.mode("overwrite").parquet(f'../data/landing/{variable}.parquet')
    print(f"Data successfully written for {variable}")
except Exception as e:
    # Report the failure, then re-raise so the cell visibly fails instead of
    # leaving stale or missing output behind a "successful" run.
    print(f"An error occurred: {e}")
    raise


Searching for Groceries in 3000
Searching for Groceries in 3001
Searching for Groceries in 3002
Searching for Groceries in 3003
Searching for Groceries in 3004
Searching for Groceries in 3005
Searching for Groceries in 3006
Searching for Groceries in 3008
Searching for Groceries in 3010
Searching for Groceries in 3011
Searching for Groceries in 3012
Searching for Groceries in 3013
Searching for Groceries in 3015
Searching for Groceries in 3016
Searching for Groceries in 3018
Searching for Groceries in 3019
Searching for Groceries in 3020
Searching for Groceries in 3021
Searching for Groceries in 3022
Searching for Groceries in 3023
Searching for Groceries in 3024
Searching for Groceries in 3025
Searching for Groceries in 3026
Searching for Groceries in 3027
Searching for Groceries in 3028
Searching for Groceries in 3029
Searching for Groceries in 3030
Searching for Groceries in 3031
Searching for Groceries in 3032
Searching for Groceries in 3033
Searching for Groceries in 3034
Searchin

[Stage 46:>                                                         (0 + 8) / 8]

Data successfully written for Groceries


                                                                                