In [1]:
import requests
import csv
import time
import pandas as pd


In [2]:
from pyspark.sql import SparkSession
#Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("Liveability")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.driver.memory', '4g')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)
df = spark.read.parquet("../data/postcodes/postcodes.parquet", header=True, inferSchema=True)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/05 22:48:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [14]:
# Your Google Places API key
API_KEY = 'AIzaSyDKBch72s8hyaVK4GsnrOhA5AnWT4IIYXI'

# Base URL for Google Places API
url = 'https://maps.googleapis.com/maps/api/place/textsearch/json'

# Load the postcode data (Assuming the file is correctly loaded into a DataFrame)
postcodes_sdf = spark.read.parquet('../data/postcodes/postcodes.parquet')

In [5]:
from urllib.request import urlretrieve
import os

# from the current directory , we create separate files for our variables
output_relative_dir = '../data/raw_variables/'
variables = ['Hospitals & Clinics', 'Schools', 'Groceries']

# check if it exists as it makedir will raise an error if it does exist
if not os.path.exists(output_relative_dir):
    os.makedirs(output_relative_dir)
    

In [6]:
columns = ['locality', 'state', 'long', 'lat']
postcodes_sdf = postcodes_sdf.drop(*columns)
postcodes_sdf = postcodes_sdf.dropDuplicates()
postcodes_sdf = postcodes_sdf.orderBy('postcode')


In [7]:
from pyspark.sql.types import StructType, StructField, StringType

# Define schema for the Spark DataFrame
schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Address", StringType(), True),
    StructField("Postcode", StringType(), True),
    StructField("Rating", StringType(), True),
])


In [8]:
## for testing purposes
postcodes_sdf2 = postcodes_sdf.filter(postcodes_sdf['postcode'] < 3004)

In [9]:
def get_chunks(postcodes_sdf) -> dict:
    """function that splits up postcodes into chunks of 50 so that if we are kicked halfway during scraping we don't lose too much progress
    """
    chunk_dict = {}
    i = 3000
    j = 3050
    
    while i < 3997:
        
        temp = postcodes_sdf.filter((postcodes_sdf['postcode'] >= i) & (postcodes_sdf['postcode'] < j))

        chunk_dict[f'chunk_{i}'] = temp
        j += 50
        i += 50

    return chunk_dict

chunk_dict = get_chunks(postcodes_sdf)

In [10]:
#### Scraping task 1: schools
# Iterate through all variables and initialize a temporary dataframe
from pyspark.sql.types import StructType, StructField, StringType

def variables_scrape(chunk, file_suffix):
    variables = ['Hospitals & Clinics', 'Schools', 'Groceries']
    schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Address", StringType(), True),
    StructField("Postcode", StringType(), True),
    StructField("Rating", StringType(), True),])
    
    variable_metadata = spark.createDataFrame([], schema)

    postcodes_sdf.filter(postcodes_sdf['postcode'] <= 3000 + 250)
    
    for variable in variables:
        # Loop through each row in the dataframe
        for row in chunk.collect():
            postcode = row['postcode']
                
            print(f'searching for {variable} in {postcode}')
            # Define the search query using postcode
            params = {
                'query': f'{variable} in {postcode}, Victoria, Australia',
                'key': API_KEY,
                'type': {variable},
                'region': 'AU'
            }

            response = requests.get(url, params=params)
                
            # Check if the response was successful
            if response.status_code == 200:
                print(response.json())
                results = response.json().get('results', [])
                print(results)
                    
                # Write each place's details to the CSV file
                for place in results:
                    print(place)
                    address = place.get('formatted_address')
                    status = place.get('business_status')
                    
                    if (f'{postcode}' in address) & (status == 'OPERATIONAL'):
                        print('match found')
                        name = place.get('name')
                        rating = place.get('rating', 'N/A')
                        row = [(name, address, postcode, rating)]
                        row_df = spark.createDataFrame(row, schema)
                        variable_metadata = variable_metadata.union(row_df)
                    
                # Introduce a short delay to avoid hitting rate limits of the API
                time.sleep(1)  # 1-second delay between requests
            else:
                print(f"{variable}: Error fetching data for postcode {postcode}: {response.status_code}, {response.text}")
            print(f'searching for {variable} in {postcode}')

        try: 
            variable_metadata.write.mode("overwrite").parquet(f'../data/raw_variables/{variable}/{variable}_{file_suffix}.parquet')
            print(f"Data successfully written for {variable}")
        except Exception as e:
            print(f"An error occured: {e}")

In [11]:
def run_chunk(starting_chunk: int) -> None:
    """Function that scrapes domain.com.au in chunks of 25 postcodes 7 times (split amongst group members)
    
    Parameters:
    starting_chunk - starting chunk number that we want

    Return:
    None 
    """
    i = starting_chunk
    
    while i < starting_chunk + 200: 
        variables_scrape(chunk_dict[f"chunk_{i}"], i) #i.split("_")[1])
        i += 50

In [None]:
### Davyn 
starting_chunk = 3150
run_chunk(starting_chunk)

In [None]:
### Arpan
starting_chunk = 3000 + 200
run_chunk(starting_chunk)

In [None]:
### Rachel
starting_chunk = 3000 + 400
run_chunk(starting_chunk)

In [None]:
### Nathan
starting_chunk = 3000 + 600
run_chunk(starting_chunk)

In [None]:
### Pris
starting_chunk = 3000 + 800
run_chunk(starting_chunk)

In [15]:
testing_sdf = spark.read.parquet('../data/raw_variables/Schools/Schools_3450.parquet')
testing_sdf

Name,Address,Postcode,Rating
Blackwood Special...,Special School Ou...,3458,4.5
BreastScreen Mary...,Maryborough Distr...,3465,0.0
Trentham District...,Trentham District...,3458,5.0
"FLO Program, Etty...","35 Etty St, Castl...",3450,0.0
Olinda Primary Sc...,Olinda State Scho...,3494,3.9
Dhelkaya Health -...,Castlemaine Healt...,3450,0.0
Terra Australis D...,Lot 2 Railway Cre...,3460,4.6
Dorevitch Pathology,Maryborough & Dis...,3465,4.7
Roseberry House E...,"123 Inkerman St, ...",3465,5.0
Milla Dance Studi...,57/31 Lyttleton S...,3450,5.0


In [17]:
sdf = spark.read.parquet('../data/raw_variables/Groceries/*')
# Create new parquet of raw data
sdf \
    .coalesce(1) \
    .write \
    .mode('overwrite') \
    .parquet('../data/scraped/groceries_data.parquet')

24/10/03 22:29:08 WARN TaskSetManager: Stage 80 contains a task of very large size (2247 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [18]:
sdf = spark.read.parquet('../data/raw_variables/Hospitals & Clinics/*')
# Create new parquet of raw data
sdf \
    .coalesce(1) \
    .write \
    .mode('overwrite') \
    .parquet('../data/scraped/Hospitals_&_Clinics_data.parquet')

                                                                                

In [19]:
sdf = spark.read.parquet('../data/raw_variables/Schools/*')
# Create new parquet of raw data
sdf \
    .coalesce(1) \
    .write \
    .mode('overwrite') \
    .parquet('../data/scraped/Schools_data.parquet')

24/10/03 22:30:06 WARN TaskSetManager: Stage 84 contains a task of very large size (1457 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [23]:
sdf = spark.read.parquet('../data/scraped/Schools_data.parquet')
len(sdf.columns), sdf.count()

(4, 5606)

In [24]:
sdf = spark.read.parquet('../data/scraped/Groceries_data.parquet')
len(sdf.columns), sdf.count()

(4, 8546)

In [25]:
sdf = spark.read.parquet('../data/scraped/Hospitals_&_Clinics_data.parquet')
len(sdf.columns), sdf.count()

(4, 2059)

In [4]:
from pyspark.sql import functions as F

In [5]:
school_sdf = spark.read.parquet('../data/scraped/Schools_data.parquet')
# Group by Postcode and count the number of schools
schools_per_postcode = school_sdf.groupBy('Postcode').agg(F.count('Name').alias('School_Count'))
schools_per_postcode

Postcode,School_Count
3414,5
3959,1
3015,10
3441,2
3517,5
3858,5
3281,2
3200,3
3121,28
3249,4


In [6]:
groceries_sdf = spark.read.parquet('../data/scraped/groceries_data.parquet')
# Group by Postcode and count the number of groceriess
groceries_per_postcode = groceries_sdf.groupBy('Postcode').agg(F.count('Name').alias('groceries_Count'))
groceries_per_postcode

Postcode,groceries_Count
3414,6
3959,1
3517,10
3015,15
3441,2
3858,10
3281,2
3200,5
3121,44
3266,9


In [7]:
hc_sdf = spark.read.parquet('../data/scraped/Hospitals_&_Clinics_data.parquet')
# Group by Postcode and count the number of hcs
hc_per_postcode = hc_sdf.groupBy('Postcode').agg(F.count('Name').alias('hc_Count'))
hc_per_postcode

Postcode,hc_Count
3414,1
3517,3
3015,2
3858,1
3121,15
3266,2
3057,2
3167,1
3898,1
3875,15


In [10]:
# Perform a join on Postcode column to combine all three DataFrames
combined_df = schools_per_postcode \
    .join(groceries_per_postcode, on='Postcode', how='outer') \
    .join(hc_per_postcode, on='Postcode', how='outer')

combined_df

Postcode,School_Count,groceries_Count,hc_Count
3000,24,42,8.0
3002,14,17,10.0
3003,7,14,2.0
3004,7,12,2.0
3006,11,23,6.0
3008,3,3,1.0
3010,1,1,
3011,31,49,14.0
3012,9,14,5.0
3013,14,21,3.0


In [9]:
qra_sdf = spark.read.parquet('../data/landing/qra.parquet')
qra_sdf

postcode,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
3206,0.1037037037037036,0.1073825503355705,0.0060606060606061,0.0,0.06024096385542177,0.0,0.0886363636363636,0.1482254697286011,0.0727272727272727,0.016949152542372836,0.06666666666666665,0.03125,-0.01515151515151...,-0.01538461538461533,0.0,0.01953125,0.0574712643678161,0.0144927536231884,0.0214285714285713,-0.00349650349650...,-0.0771929824561403,0.0114068441064638,0.1466165413533835,0.1147540983606556
3206,0.1037037037037036,0.1073825503355705,0.0060606060606061,0.0,0.06024096385542177,0.0,0.0886363636363636,0.1482254697286011,0.0727272727272727,0.016949152542372836,0.06666666666666665,0.03125,-0.01515151515151...,-0.01538461538461533,0.0,0.01953125,0.0574712643678161,0.0144927536231884,0.0214285714285713,-0.00349650349650...,-0.0771929824561403,0.0114068441064638,0.1466165413533835,0.1147540983606556
3182,0.1037037037037036,0.1073825503355705,0.0060606060606061,0.0,0.06024096385542177,0.0,0.0886363636363636,0.1482254697286011,0.0727272727272727,0.016949152542372836,0.06666666666666665,0.03125,-0.01515151515151...,-0.01538461538461533,0.0,0.01953125,0.0574712643678161,0.0144927536231884,0.0214285714285713,-0.00349650349650...,-0.0771929824561403,0.0114068441064638,0.1466165413533835,0.1147540983606556
3143,0.056910569105691,0.0384615384615385,0.04444444444444451,0.02127659574468077,0.04166666666666674,0.0266666666666666,0.1363636363636364,0.0925714285714285,0.1140167364016735,0.06572769953051649,0.047577092511013275,0.042893187552565104,-0.02822580645161...,0.020746887966804906,0.0081300813008129,0.0161290322580645,0.0976190476190477,0.0903832248734635,-0.0450928381962865,0.002083333333333437,-0.0679140679140679,-0.0200743494423791,0.1783004552352047,0.1075338055376691
3054,0.0578512396694215,0.0437499999999999,0.04041916167664672,0.03309352517985609,0.0,0.0557103064066852,0.1609498680738785,0.1113636363636363,0.0664621676891614,0.08149568552253106,0.06028368794326244,0.04264214046822734,-0.00561347233360...,0.029032258064516148,0.0109717868338556,0.0542635658914729,0.0073529411764705,0.0204379562043794,0.0515021459227467,-0.02857142857142858,-0.1106442577030812,0.0,0.188976377952756,0.0596026490066226
3053,0.0942028985507246,-0.0463576158940397,0.0625,0.02614379084967311,0.2738853503184713,0.04,0.0490384615384615,0.0678276810265812,0.0652360515021459,0.055600322320709106,-0.01145038167938...,0.027027027027026973,-0.00751879699248...,0.030303030303030276,0.0220588235294116,0.0323741007194244,0.0452961672473868,0.0266666666666666,0.0357142857142858,-0.03448275862068961,-0.2012987012987013,0.0650406504065039,0.2946564885496183,0.0613207547169811
3052,0.0942028985507246,-0.0463576158940397,0.0625,0.02614379084967311,0.2738853503184713,0.04,0.0490384615384615,0.0678276810265812,0.0652360515021459,0.055600322320709106,-0.01145038167938...,0.027027027027026973,-0.00751879699248...,0.030303030303030276,0.0220588235294116,0.0323741007194244,0.0452961672473868,0.0266666666666666,0.0357142857142858,-0.03448275862068961,-0.2012987012987013,0.0650406504065039,0.2946564885496183,0.0613207547169811
3000,0.0349999999999999,0.0193236714975846,-0.01421800947867...,9.615384615384581E-4,0.032660902977905915,0.0334883720930232,0.0846084608460846,0.1161825726141079,0.0631970260223049,0.02447552447552437,0.03754266211604085,0.015131578947368496,-0.00518470511989...,-0.01628664495114...,0.009933774834437,0.022950819672131,0.0480769230769231,0.036697247706422,0.0548672566371681,-0.09116331096196872,-0.2246153846153846,0.1309523809523809,0.3677192982456141,0.1287839917906619
3004,0.0349999999999999,0.0193236714975846,-0.01421800947867...,9.615384615384581E-4,0.032660902977905915,0.0334883720930232,0.0846084608460846,0.1161825726141079,0.0631970260223049,0.02447552447552437,0.03754266211604085,0.015131578947368496,-0.00518470511989...,-0.01628664495114...,0.009933774834437,0.022950819672131,0.0480769230769231,0.036697247706422,0.0548672566371681,-0.09116331096196872,-0.2246153846153846,0.1309523809523809,0.3677192982456141,0.1287839917906619
3066,0.0545454545454544,0.0431034482758621,0.11239669421487597,0.06240713224368499,0.116083916083916,0.2343358395989974,0.0812182741116751,0.1370892018779341,0.0635838150289016,0.022515527950310643,0.08200455580865595,0.010526315789473717,0.01736111111111116,0.017064846416382284,-0.0020134228187919,0.0221923335574982,0.0361842105263157,0.0222222222222221,0.0372670807453416,-0.01916167664670...,-0.1269841269841269,0.0664335664335664,0.1868852459016392,0.0917127071823205


In [12]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, FloatType

In [13]:
# Define the UDF to get latitude and longitude from Google API
def get_geolocation(postcode):
    url = f"https://maps.googleapis.com/maps/api/geocode/json?address={postcode},Victoria,Australia&key={API_KEY}"
    response = requests.get(url)
    if response.status_code == 200:
        result = response.json()
        if result['results']:
            location = result['results'][0]['geometry']['location']
            return location['lat'], location['lng']
    return None, None

# Split the function into two UDFs: one for latitude, one for longitude
def get_latitude(postcode):
    lat, lng = get_geolocation(postcode)
    return lat

def get_longitude(postcode):
    lat, lng = get_geolocation(postcode)
    return lng


In [15]:
# Register UDFs with PySpark
get_latitude_udf = udf(get_latitude, FloatType())
get_longitude_udf = udf(get_longitude, FloatType())

# Assuming you have a DataFrame `combined_update_sdf` with a 'Postcode' column
# For example:
combined_update_sdf = combined_df

# Add latitude and longitude columns to your DataFrame
combined_update_sdf = combined_update_sdf.withColumn('Latitude', get_latitude_udf(combined_update_sdf['Postcode']))
combined_update_sdf = combined_update_sdf.withColumn('Longitude', get_longitude_udf(combined_update_sdf['Postcode']))

# Show the updated DataFrame with geolocation data
combined_update_sdf

                                                                                

Postcode,School_Count,groceries_Count,hc_Count,Latitude,Longitude
3000,24,42,8.0,-37.815205,144.96394
3002,14,17,10.0,-37.816143,144.98045
3003,7,14,2.0,-37.81145,144.9254
3004,7,12,2.0,-37.83016,144.98045
3006,11,23,6.0,-37.824547,144.96394
3008,3,3,1.0,-37.817066,144.94191
3010,1,1,,-37.798447,144.9621
3011,31,49,14.0,-37.79602,144.90063
3012,9,14,5.0,-37.814625,144.84563
3013,14,21,3.0,-37.819813,144.88138


In [16]:
combined_update_sdf \
    .coalesce(1) \
    .write \
    .mode('overwrite') \
    .parquet('../data/scraped/combined_data_with_geolocation.parquet')

                                                                                

In [17]:
# Define the function to get suburb/locality using Google API
def get_suburb_name(postcode):
    url = f"https://maps.googleapis.com/maps/api/geocode/json?address={postcode},Victoria,Australia&key={API_KEY}"
    response = requests.get(url)
    if response.status_code == 200:
        result = response.json()
        if result['results']:
            for component in result['results'][0]['address_components']:
                if 'locality' in component['types']:  # Extract the locality (suburb)
                    return component['long_name']
    return None

In [19]:
# Register the UDF with PySpark
get_suburb_name_udf = udf(get_suburb_name, StringType())

# Assuming you have a DataFrame 'combined_df' with 'Postcode' column
# Apply the UDF to add suburb names to your DataFrame
combined_sdf_with_names = combined_update_sdf.withColumn('Postcode_Name', get_suburb_name_udf(combined_df['Postcode']))

# Show the DataFrame with suburb names
combined_sdf_with_names

                                                                                

Postcode,School_Count,groceries_Count,hc_Count,Latitude,Longitude,Postcode_Name
3000,24,42,8.0,-37.815205,144.96394,Melbourne
3002,14,17,10.0,-37.816143,144.98045,East Melbourne
3003,7,14,2.0,-37.81145,144.9254,West Melbourne
3004,7,12,2.0,-37.83016,144.98045,Melbourne
3006,11,23,6.0,-37.824547,144.96394,South Wharf
3008,3,3,1.0,-37.817066,144.94191,Docklands
3010,1,1,,-37.798447,144.9621,Parkville
3011,31,49,14.0,-37.79602,144.90063,Footscray
3012,9,14,5.0,-37.814625,144.84563,Tottenham
3013,14,21,3.0,-37.819813,144.88138,Yarraville


In [20]:
combined_sdf_with_names \
                .coalesce(1) \
                .write \
                .mode('overwrite') \
                .parquet('../data/scraped/combined_data_with_geolocation.parquet')

                                                                                

In [4]:
combined_sdf = spark.read.parquet('../data/scraped/combined_data_with_geolocation.parquet')
combined_sdf

Postcode,School_Count,groceries_Count,hc_Count,Latitude,Longitude,Postcode_Name
3000,24,42,8.0,-37.815205,144.96394,Melbourne
3002,14,17,10.0,-37.816143,144.98045,East Melbourne
3003,7,14,2.0,-37.81145,144.9254,West Melbourne
3004,7,12,2.0,-37.83016,144.98045,Melbourne
3006,11,23,6.0,-37.824547,144.96394,South Wharf
3008,3,3,1.0,-37.817066,144.94191,Docklands
3010,1,1,,-37.798447,144.9621,Parkville
3011,31,49,14.0,-37.79602,144.90063,Footscray
3012,9,14,5.0,-37.814625,144.84563,Tottenham
3013,14,21,3.0,-37.819813,144.88138,Yarraville


In [3]:
merged_sdf = spark.read.parquet('../data/curated/merged_df.parquet')
merged_sdf

                                                                                

postcode,cost,suburb,furnished,property_type,beds,baths,parking,region,lgaregion,total male population - 2021,total female population - 2021,total population - 2021,australian citizens,median rent,median family weekly income,median age,total region male population - 2022,total region female population - 2022
3163,550.0,carnegie,0,Apartment / Unit ...,1.0,1.0,1,Major Cities of A...,Glen Eira,16007,16810,32810,24999,391,2598,36,2538968.0,2598457.0
3163,550.0,carnegie,0,Apartment / Unit ...,1.0,1.0,1,Major Cities of A...,Glen Eira,16007,16810,32810,24999,391,2598,36,2538968.0,2598457.0
3163,550.0,carnegie,0,Apartment / Unit ...,1.0,1.0,1,Major Cities of A...,Glen Eira,16007,16810,32810,24999,391,2598,36,2538968.0,2598457.0
3163,550.0,carnegie,0,Apartment / Unit ...,1.0,1.0,1,Major Cities of A...,Glen Eira,16007,16810,32810,24999,391,2598,36,2538968.0,2598457.0
3182,575.0,st-kilda,0,Apartment / Unit ...,1.0,1.0,0,Major Cities of A...,Port Phillip,11278,11156,22438,16227,381,2791,37,2538968.0,2598457.0
3182,575.0,st-kilda,0,Apartment / Unit ...,1.0,1.0,0,Major Cities of A...,Port Phillip,11278,11156,22438,16227,381,2791,37,2538968.0,2598457.0
3182,575.0,st-kilda,0,Apartment / Unit ...,1.0,1.0,0,Major Cities of A...,Port Phillip,11278,11156,22438,16227,381,2791,37,2538968.0,2598457.0
3004,625.0,melbourne,0,Apartment / Unit ...,1.0,1.0,0,Major Cities of A...,Melbourne,5655,5827,11482,7103,451,2727,37,2538968.0,2598457.0
3004,625.0,melbourne,0,Apartment / Unit ...,1.0,1.0,0,Major Cities of A...,Melbourne,5655,5827,11482,7103,451,2727,37,2538968.0,2598457.0
3004,625.0,melbourne,0,Apartment / Unit ...,1.0,1.0,0,Major Cities of A...,Melbourne,5655,5827,11482,7103,451,2727,37,2538968.0,2598457.0


In [5]:
liveability_sdf = merged_sdf \
            .join(combined_sdf, on='Postcode', how='outer')

In [6]:
liveability_sdf 

                                                                                

postcode,cost,suburb,furnished,property_type,beds,baths,parking,region,lgaregion,total male population - 2021,total female population - 2021,total population - 2021,australian citizens,median rent,median family weekly income,median age,total region male population - 2022,total region female population - 2022,School_Count,groceries_Count,hc_Count,Latitude,Longitude,Postcode_Name
3000,945.0,melbourne,0,Apartment / Unit ...,3.0,1.0,0,Major Cities of A...,Melbourne,21548,21539,43084,14713,370,1857,28,2538968.0,2598457.0,24,42,8,-37.815205,144.96394,Melbourne
3000,460.0,melbourne,0,Apartment / Unit ...,1.0,1.0,0,Major Cities of A...,Melbourne,21548,21539,43084,14713,370,1857,28,2538968.0,2598457.0,24,42,8,-37.815205,144.96394,Melbourne
3000,1250.0,melbourne,0,Apartment / Unit ...,2.0,2.0,0,Major Cities of A...,Melbourne,21548,21539,43084,14713,370,1857,28,2538968.0,2598457.0,24,42,8,-37.815205,144.96394,Melbourne
3000,740.0,melbourne,1,Apartment / Unit ...,2.0,2.0,0,Major Cities of A...,Melbourne,21548,21539,43084,14713,370,1857,28,2538968.0,2598457.0,24,42,8,-37.815205,144.96394,Melbourne
3000,810.0,melbourne,0,Apartment / Unit ...,2.0,2.0,0,Major Cities of A...,Melbourne,21548,21539,43084,14713,370,1857,28,2538968.0,2598457.0,24,42,8,-37.815205,144.96394,Melbourne
3000,550.0,melbourne,0,Apartment / Unit ...,1.0,1.0,1,Major Cities of A...,Melbourne,21548,21539,43084,14713,370,1857,28,2538968.0,2598457.0,24,42,8,-37.815205,144.96394,Melbourne
3000,600.0,melbourne,0,Apartment / Unit ...,1.0,1.0,0,Major Cities of A...,Melbourne,21548,21539,43084,14713,370,1857,28,2538968.0,2598457.0,24,42,8,-37.815205,144.96394,Melbourne
3000,620.0,melbourne,0,Apartment / Unit ...,2.0,1.0,0,Major Cities of A...,Melbourne,21548,21539,43084,14713,370,1857,28,2538968.0,2598457.0,24,42,8,-37.815205,144.96394,Melbourne
3000,950.0,melbourne,0,Apartment / Unit ...,2.0,2.0,1,Major Cities of A...,Melbourne,21548,21539,43084,14713,370,1857,28,2538968.0,2598457.0,24,42,8,-37.815205,144.96394,Melbourne
3000,1200.0,melbourne,1,Apartment / Unit ...,2.0,2.0,0,Major Cities of A...,Melbourne,21548,21539,43084,14713,370,1857,28,2538968.0,2598457.0,24,42,8,-37.815205,144.96394,Melbourne


In [7]:
liveability_sdf \
        .coalesce(1) \
        .write \
        .mode('overwrite') \
        .parquet('../data/scraped/liveability_data.parquet')

                                                                                

In [21]:
spark.stop()