In [2]:
import requests
import csv
import time
import pandas as pd


In [3]:
from pyspark.sql import SparkSession
#Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("Liveability")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.driver.memory', '4g')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)
df = spark.read.parquet("../data/postcodes/postcodes.parquet", header=True, inferSchema=True)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/03 22:05:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [4]:
# Your Google Places API key
API_KEY = 'AIzaSyDKBch72s8hyaVK4GsnrOhA5AnWT4IIYXI'

# Base URL for Google Places API
url = 'https://maps.googleapis.com/maps/api/place/textsearch/json'

# Load the postcode data (Assuming the file is correctly loaded into a DataFrame)
postcodes_sdf = spark.read.parquet('../data/postcodes/postcodes.parquet')

In [5]:
from urllib.request import urlretrieve
import os

# from the current directory , we create separate files for our variables
output_relative_dir = '../data/raw_variables/'
variables = ['Hospitals & Clinics', 'Schools', 'Groceries']

# check if it exists as it makedir will raise an error if it does exist
if not os.path.exists(output_relative_dir):
    os.makedirs(output_relative_dir)
    

In [6]:
columns = ['locality', 'state', 'long', 'lat']
postcodes_sdf = postcodes_sdf.drop(*columns)
postcodes_sdf = postcodes_sdf.dropDuplicates()
postcodes_sdf = postcodes_sdf.orderBy('postcode')


In [7]:
from pyspark.sql.types import StructType, StructField, StringType

# Define schema for the Spark DataFrame
schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Address", StringType(), True),
    StructField("Postcode", StringType(), True),
    StructField("Rating", StringType(), True),
])


In [8]:
## for testing purposes
postcodes_sdf2 = postcodes_sdf.filter(postcodes_sdf['postcode'] < 3004)

In [9]:
def get_chunks(postcodes_sdf) -> dict:
    """function that splits up postcodes into chunks of 50 so that if we are kicked halfway during scraping we don't lose too much progress
    """
    chunk_dict = {}
    i = 3000
    j = 3050
    
    while i < 3997:
        
        temp = postcodes_sdf.filter((postcodes_sdf['postcode'] >= i) & (postcodes_sdf['postcode'] < j))

        chunk_dict[f'chunk_{i}'] = temp
        j += 50
        i += 50

    return chunk_dict

chunk_dict = get_chunks(postcodes_sdf)

In [10]:
#### Scraping task 1: schools
# Iterate through all variables and initialize a temporary dataframe
from pyspark.sql.types import StructType, StructField, StringType

def variables_scrape(chunk, file_suffix):
    variables = ['Hospitals & Clinics', 'Schools', 'Groceries']
    schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Address", StringType(), True),
    StructField("Postcode", StringType(), True),
    StructField("Rating", StringType(), True),])
    
    variable_metadata = spark.createDataFrame([], schema)

    postcodes_sdf.filter(postcodes_sdf['postcode'] <= 3000 + 250)
    
    for variable in variables:
        # Loop through each row in the dataframe
        for row in chunk.collect():
            postcode = row['postcode']
                
            print(f'searching for {variable} in {postcode}')
            # Define the search query using postcode
            params = {
                'query': f'{variable} in {postcode}, Victoria, Australia',
                'key': API_KEY,
                'type': {variable},
                'region': 'AU'
            }

            response = requests.get(url, params=params)
                
            # Check if the response was successful
            if response.status_code == 200:
                print(response.json())
                results = response.json().get('results', [])
                print(results)
                    
                # Write each place's details to the CSV file
                for place in results:
                    print(place)
                    address = place.get('formatted_address')
                    status = place.get('business_status')
                    
                    if (f'{postcode}' in address) & (status == 'OPERATIONAL'):
                        print('match found')
                        name = place.get('name')
                        rating = place.get('rating', 'N/A')
                        row = [(name, address, postcode, rating)]
                        row_df = spark.createDataFrame(row, schema)
                        variable_metadata = variable_metadata.union(row_df)
                    
                # Introduce a short delay to avoid hitting rate limits of the API
                time.sleep(1)  # 1-second delay between requests
            else:
                print(f"{variable}: Error fetching data for postcode {postcode}: {response.status_code}, {response.text}")
            print(f'searching for {variable} in {postcode}')

        try: 
            variable_metadata.write.mode("overwrite").parquet(f'../data/raw_variables/{variable}/{variable}_{file_suffix}.parquet')
            print(f"Data successfully written for {variable}")
        except Exception as e:
            print(f"An error occured: {e}")

In [11]:
def run_chunk(starting_chunk: int) -> None:
    """Function that scrapes domain.com.au in chunks of 25 postcodes 7 times (split amongst group members)
    
    Parameters:
    starting_chunk - starting chunk number that we want

    Return:
    None 
    """
    i = starting_chunk
    
    while i < starting_chunk + 200: 
        variables_scrape(chunk_dict[f"chunk_{i}"], i) #i.split("_")[1])
        i += 50

In [None]:
### Davyn 
starting_chunk = 3150
run_chunk(starting_chunk)

In [None]:
### Arpan
starting_chunk = 3000 + 200
run_chunk(starting_chunk)

In [None]:
### Rachel
starting_chunk = 3000 + 400
run_chunk(starting_chunk)

In [None]:
### Nathan
starting_chunk = 3000 + 600
run_chunk(starting_chunk)

In [None]:
### Pris
starting_chunk = 3000 + 800
run_chunk(starting_chunk)

In [15]:
testing_sdf = spark.read.parquet('../data/raw_variables/Schools/Schools_3450.parquet')
testing_sdf

Name,Address,Postcode,Rating
Blackwood Special...,Special School Ou...,3458,4.5
BreastScreen Mary...,Maryborough Distr...,3465,0.0
Trentham District...,Trentham District...,3458,5.0
"FLO Program, Etty...","35 Etty St, Castl...",3450,0.0
Olinda Primary Sc...,Olinda State Scho...,3494,3.9
Dhelkaya Health -...,Castlemaine Healt...,3450,0.0
Terra Australis D...,Lot 2 Railway Cre...,3460,4.6
Dorevitch Pathology,Maryborough & Dis...,3465,4.7
Roseberry House E...,"123 Inkerman St, ...",3465,5.0
Milla Dance Studi...,57/31 Lyttleton S...,3450,5.0


In [None]:
sdf = spark.read.parquet('../data/raw_variables/Groceries')

# Create new parquet of raw data
sdf \
    .coalesce(1) \
    .write \
    .mode('overwrite') \
    .parquet('../data/scrapped/groceries_data.parquet')





In [None]:
sdf = spark.read.parquet('../data/raw_variables/Hospitals & Clinics')

# Create new parquet of raw data
sdf \
    .coalesce(1) \
    .write \
    .mode('overwrite') \
    .parquet('../data/scrapped/Hospitals_&_Clinics_data.parquet')

In [None]:
sdf = spark.read.parquet('../data/raw_variables/Groceries')
# Create new parquet of raw data
sdf \
    .coalesce(1) \
    .write \
    .mode('overwrite') \
    .parquet('../data/scrapped/groceries_data.parquet')

In [None]:
#### DO NOT RUN FROM HERE ONWARDS #########
## DO NOT RUN
# Iterate through all variables and initialize a temporary dataframe
for variable in variables :
    variable_metadata = spark.createDataFrame([], schema)

    # Loop through each row in the dataframe
    for row in postcodes_sdf.collect():
        postcode = row['postcode']
        
        print(f'searching for {variable} in {postcode}')
        # Define the search query using postcode
        params = {
            'query': f'{variable} in {postcode}, Victoria, Australia',
            'key': API_KEY,
            'type': {variable},
            'region': 'AU'
        }

        response = requests.get(url, params=params)
        
        # Check if the response was successful
        if response.status_code == 200:
            results = response.json().get('results', [])
            
            # Write each place's details to the CSV file
            for place in results:
                name = place.get('name')
                address = place.get('formatted_address')
                rating = place.get('rating', 'N/A')
                row = [(name, address, postcode, rating)]
                row_df = spark.createDataFrame(row, schema)
                variable_metadata = variable_metadata.union(row_df)
            
            # Introduce a short delay to avoid hitting rate limits of the API
            time.sleep(1)  # 1-second delay between requests
        else:
            print(f"{variable}: Error fetching data for postcode {postcode}: {response.status_code}, {response.text}")
        print(f'searching for {variable} in {postcode}')

    try:
        variable_metadata.write.mode("overwrite").parquet(f'../data/landing/{variable}.parquet')
        print(f"Data successfully written for {variable}")
    except Exception as e:
       print(f"An error occured: {e}")

In [16]:
# Load the CSV file
df = pd.read_csv('../data/landing/supermarkets_by_postcode.csv')

# Filter the rows where the Postcode in the Address matches exactly the Postcode column
df_filtered = df[df.apply(lambda row: str(row['Postcode']) in row['Address'], axis=1)]

# Drop duplicates from the filtered DataFrame
df_filtered_unique = df_filtered.drop_duplicates()

# Reset the index of the filtered DataFrame without duplicates
df_filtered_unique_reset = df_filtered_unique.reset_index(drop=True)

df_filtered_unique_reset.tail(20)

df_filtered_unique_reset.to_csv('../data/raw/supermarket.csv', index=False)