In [18]:
import os
from pyspark.sql import SparkSession, functions as F

import sys
# Make the project's shared `scripts` directory importable. Its location is
# resolved relative to the notebook's current working directory.
current_dir = os.getcwd()
scripts_path = os.path.join(current_dir, '../../scripts')
# Only append when the entry is absent, so re-running this cell does not keep
# adding duplicate entries to sys.path.
scripts_abs_path = os.path.abspath(scripts_path)
if scripts_abs_path not in sys.path:
    sys.path.append(scripts_abs_path)
import preprocess_function as process

In [7]:
# Build (or reuse) the Spark session used by the rest of this notebook.
_spark_options = {
    # Render DataFrames as tables when they are the last expression of a cell.
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.executor.memory": "4g",
    "spark.driver.memory": "4g",
}

_builder = SparkSession.builder.appName("MAST30034 Project 1")
for _option, _value in _spark_options.items():
    _builder = _builder.config(_option, _value)

spark = _builder.getOrCreate()

In [8]:
# Locations of the data layers, relative to the notebook's working directory.
base_dir = '../../data/'
landing_dir = os.path.join(base_dir, 'landing')
raw_dir = os.path.join(base_dir, 'raw')

# Dataset-specific folder name used under each data layer.
subfolder = 'Supermarkets'

# Create the output folders. exist_ok=True makes the calls idempotent and
# avoids the check-then-create race of testing os.path.exists() first.
os.makedirs(base_dir, exist_ok=True)
os.makedirs(os.path.join(raw_dir, subfolder), exist_ok=True)

In [9]:
# Load the landing-layer supermarket records (with postcodes) into a Spark DataFrame.
input_file_path = f"{landing_dir}/{subfolder}/supermarkets_with_postcode.parquet"
sdf = spark.read.parquet(input_file_path)


In [10]:
# Preview the first five rows of the loaded data.
sdf.limit(5)

Supermarket Name,Address,Suburb,Postcode
Aldi,8 Franklin Street...,Melbourne,3000
Coles,2 Elizabeth Stree...,Melbourne,3915
Coles,Melbourne Central...,Melbourne,3000
Friendly Grocer,"Shop 1, 360 Colli...",Melbourne,3429
IGA,470 Collins Stree...,Melbourne,3000


In [11]:
# Print the column names, types and nullability of the loaded DataFrame.
sdf.printSchema()

root
 |-- Supermarket Name: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Suburb: string (nullable = true)
 |-- Postcode: string (nullable = true)



In [12]:
# Report the row and column counts of the dataset as loaded, before any fixes.
process.print_dataset_shape("Shape of original dataset: ", sdf)

Shape of original dataset:  - Shape: (173 rows, 4 columns)


In [14]:
# Check the loaded dataset for missing values; the helper prints a summary of
# the columns that contain nulls.
# NOTE(review): the return value appears to be a dict (a later call on the
# filled data displays `{}`) — confirm against preprocess_function.
missing = process.check_missing_values(sdf)

Missing values:
 - Postcode: 1 missing values


In [17]:
# Collect the rows whose postcode is null into a pandas DataFrame for inspection.
postcode_is_missing = sdf['Postcode'].isNull()
missing_postcode_data = sdf.where(postcode_is_missing).toPandas()
missing_postcode_data


Unnamed: 0,Supermarket Name,Address,Suburb,Postcode
0,IGA,"Alira Village, 36 Adakite Drive, Berwick",Berwick,


In [19]:
# The postcode for "36 Adakite Dr, Berwick" was looked up manually; use it to
# fill in the one record whose postcode is missing.
is_target_address = sdf['Address'] == 'Alira Village, 36 Adakite Drive, Berwick'
needs_postcode = is_target_address & sdf['Postcode'].isNull()
filled_postcode = F.when(needs_postcode, '3806').otherwise(sdf['Postcode'])
sdf_filled = sdf.withColumn('Postcode', filled_postcode)

# Show the first rows of the updated data.
sdf_filled.show()


+--------------------+--------------------+-----------+--------+
|    Supermarket Name|             Address|     Suburb|Postcode|
+--------------------+--------------------+-----------+--------+
|                Aldi|8 Franklin Street...|  Melbourne|    3000|
|               Coles|2 Elizabeth Stree...|  Melbourne|    3915|
|               Coles|Melbourne Central...|  Melbourne|    3000|
|     Friendly Grocer|Shop 1, 360 Colli...|  Melbourne|    3429|
|                 IGA|470 Collins Stree...|  Melbourne|    3000|
|                 IGA|19 Commercial Roa...|  Melbourne|    3168|
|                 IGA|333 Exhibition St...|  Melbourne|    3000|
|                 IGA|84 Flinders Stree...|  Melbourne|    3000|
|                 IGA|35-41 Lonsdale St...|  Melbourne|    3175|
|                 IGA|85 Queen Street, ...|  Melbourne|    3931|
|          Woolworths|600 Bourke Street...|  Melbourne|    3000|
|          Woolworths|60 Elizabeth Stre...|  Melbourne|    3121|
|          Woolworths|388

In [20]:
# Re-run the missing-value check on the filled data to confirm nothing is left to fix.
process.check_missing_values(sdf_filled)

{}

In [21]:
# Persist the filled supermarket data to the raw layer, replacing any earlier output.
output_file_path = f"{raw_dir}/{subfolder}/supermarkets_info.parquet"
parquet_writer = sdf_filled.write.mode("overwrite")
parquet_writer.parquet(output_file_path)

# Report the saved dataset's shape and preview its first five rows.
process.print_dataset_shape("Finished processing and saving data: ", sdf_filled)
sdf_filled.limit(5)

Finished processing and saving data:  - Shape: (173 rows, 4 columns)


Supermarket Name,Address,Suburb,Postcode
Aldi,8 Franklin Street...,Melbourne,3000
Coles,2 Elizabeth Stree...,Melbourne,3915
Coles,Melbourne Central...,Melbourne,3000
Friendly Grocer,"Shop 1, 360 Colli...",Melbourne,3429
IGA,470 Collins Stree...,Melbourne,3000
