In [42]:
import csv
import os
import time

import pandas as pd
import requests

In [43]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that the rest of the notebook relies on.
spark = (
    SparkSession.builder
    .appName("Liveability Index Variables")
    .getOrCreate()
)

# Read the postcode CSV, treating the first line as the header and letting
# Spark infer the column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../data/postcodes/postcodes.csv")
)

                                                                                

In [44]:
# Google Places API key, read from the environment so the secret is never
# stored in (or committed with) the notebook. Export GOOGLE_PLACES_API_KEY
# before starting the kernel.
# NOTE(review): any key that was previously committed in this cell should be
# treated as leaked and revoked.
API_KEY = os.environ.get('GOOGLE_PLACES_API_KEY')
if not API_KEY:
    print('WARNING: GOOGLE_PLACES_API_KEY is not set; Places API requests will not be authorised.')

# Base URL for Google Places API (Text Search endpoint, JSON output)
url = 'https://maps.googleapis.com/maps/api/place/textsearch/json'

# Explanatory variables: each label is used both as the free-text search term
# and as the name of the parquet output written for it.
variables = ['Hospitals & Clinics', 'Schools', 'Groceries']

# Read the postcode table from its parquet copy into a Spark DataFrame
postcodes_sdf = spark.read.format('parquet').load('../data/postcodes/postcodes.parquet')

In [45]:
# Preview the first 20 rows; long strings are truncated in the printed table
postcodes_sdf.show(n=20, truncate=True)

+--------+--------------------+-----------+------------+
|postcode|            locality|       long|         lat|
+--------+--------------------+-----------+------------+
|    3000|           MELBOURNE|144.9825846|-37.81443733|
|    3001|           MELBOURNE|144.9825846|-37.81443733|
|    3002|      EAST MELBOURNE|144.9825846|-37.81443733|
|    3003|      WEST MELBOURNE| 144.949592|  -37.810871|
|    3004|           MELBOURNE|144.9825846|-37.81443733|
|    3004|ST KILDA ROAD CEN...| 144.970161|  -37.844246|
|    3004|ST KILDA ROAD MEL...|    144.976|    -37.8368|
|    3005|  WORLD TRADE CENTRE| 144.950858|  -37.824608|
|    3006|         SOUTH WHARF|144.9520744|-37.82528675|
|    3006|           SOUTHBANK| 144.965926|  -37.823258|
|    3008|           DOCKLANDS| 144.948039|  -37.814719|
|    3010|UNIVERSITY OF MEL...| 144.961351|  -37.796152|
|    3011|           FOOTSCRAY| 144.907953|  -37.807101|
|    3011|              SEDDON| 144.907953|  -37.807101|
|    3011|         SEDDON WEST|

In [46]:
# Columns that are not needed once only the distinct postcodes matter
columns = ['locality', 'state', 'long', 'lat']

# Drop those columns, collapse repeated rows, and sort ascending by postcode
postcodes_sdf = (
    postcodes_sdf
    .drop(*columns)
    .distinct()
    .orderBy('postcode')
)
postcodes_sdf.show()

+--------+
|postcode|
+--------+
|    3000|
|    3001|
|    3002|
|    3003|
|    3004|
|    3005|
|    3006|
|    3008|
|    3010|
|    3011|
|    3012|
|    3013|
|    3015|
|    3016|
|    3018|
|    3019|
|    3020|
|    3021|
|    3022|
|    3023|
+--------+
only showing top 20 rows



In [47]:
from pyspark.sql.types import StructType, StructField, StringType

# Schema for the place records gathered below: four nullable string columns
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("Name", "Address", "Postcode", "Rating")
])


In [48]:
## Small subset (postcodes below 3004) kept for testing purposes
postcodes_sdf2 = postcodes_sdf.where(postcodes_sdf['postcode'] < 3004)

In [51]:
REQUEST_TIMEOUT_S = 10  # give up on a stalled connection instead of hanging the cell
REQUEST_DELAY_S = 1     # pause between requests to avoid hitting API rate limits


def _fetch_places(variable, postcode):
    """Run one Places Text Search and return its results as schema-ready tuples.

    Args:
        variable: Search label, e.g. 'Schools'; used as free text in the query.
        postcode: Postcode to search in (any type; it is stringified).

    Returns:
        A list of (name, address, postcode, rating) tuples, one per place in
        the response. Postcode and rating are converted to `str` because every
        column of `schema` is a StringType. An empty list is returned, after
        printing the reason, whenever the request or the API call fails.
    """
    # No 'type' parameter is sent: the API expects one of its own place-type
    # identifiers there, not a display label, and the free-text query already
    # carries the category being searched for.
    params = {
        'query': f'{variable} in {postcode}, Victoria, Australia',
        'key': API_KEY,
        'region': 'AU',
    }

    try:
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT_S)
    except requests.RequestException as e:
        print(f"{variable}: Error fetching data for postcode {postcode}: {e}")
        return []

    if response.status_code != 200:
        print(f"{variable}: Error fetching data for postcode {postcode}: {response.status_code}, {response.text}")
        return []

    try:
        payload = response.json()
    except ValueError as e:
        print(f"{variable}: Invalid JSON for postcode {postcode}: {e}")
        return []

    # The API reports its own failures (denied key, quota exceeded, ...) in a
    # 'status' field even when the HTTP status is 200.
    api_status = payload.get('status', 'OK')
    if api_status not in ('OK', 'ZERO_RESULTS'):
        print(f"{variable}: API error for postcode {postcode}: {api_status}, {payload.get('error_message', '')}")
        return []

    return [
        (
            place.get('name'),
            place.get('formatted_address'),
            str(postcode),
            str(place.get('rating', 'N/A')),
        )
        for place in payload.get('results', [])
    ]


# Pull the postcode list to the driver once; it is the same for every variable.
postcode_rows = postcodes_sdf.collect()

# For each variable, query every postcode and write the combined results
for variable in variables:
    # Accumulate plain tuples and build the Spark DataFrame once at the end,
    # rather than creating and unioning a one-row DataFrame per place.
    records = []

    for postcode_row in postcode_rows:
        postcode = postcode_row['postcode']

        print(f'searching for {variable} in {postcode}')
        records.extend(_fetch_places(variable, postcode))

        # Delay after every request, successful or not, to respect rate limits
        time.sleep(REQUEST_DELAY_S)

    try:
        variable_metadata = spark.createDataFrame(records, schema)
        variable_metadata.write.mode("overwrite").parquet(f'../data/landing/{variable}.parquet')
        print(f"Data successfully written for {variable}")
    except Exception as e:
        # Stop at the first failed write instead of continuing to spend API calls
        print(f"An error occurred: {e}")
        break





searching for Hospitals & Clinics in 3000


KeyboardInterrupt: 

In [16]:
# Load the CSV file
df = pd.read_csv('../data/landing/supermarkets_by_postcode.csv')

# Keep only the rows whose Address text contains the value of the Postcode
# column (a substring test). A missing Address (NaN) counts as "no match"
# instead of raising, and the mask is an explicit boolean Series so an empty
# input frame is handled as well.
postcode_in_address = pd.Series(
    [
        isinstance(address, str) and str(postcode) in address
        for postcode, address in zip(df['Postcode'], df['Address'])
    ],
    index=df.index,
    dtype=bool,
)
df_filtered = df.loc[postcode_in_address]

# Drop duplicates from the filtered DataFrame
df_filtered_unique = df_filtered.drop_duplicates()

# Reset the index of the filtered DataFrame without duplicates
df_filtered_unique_reset = df_filtered_unique.reset_index(drop=True)

df_filtered_unique_reset.to_csv('../data/raw/supermarket.csv', index=False)

# Kept as the last expression of the cell so the notebook renders the preview
df_filtered_unique_reset.tail(20)