In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse) the Spark session shared by every cell below.
spark = SparkSession.builder.appName('PostalCodeToCoordinates').getOrCreate()

# Property records; the join below uses their PROPERTY_POSTAL_CODE column.
vancouver_df = spark.read.csv('Vancouver_Property_Value_Data.csv', header=True, inferSchema=True)

# Postal-code lookup table; POSTAL_CODE, LATITUDE and LONGITUDE are read from it.
postal_codes_df = spark.read.csv('CanadianPostalCodes202312.csv', header=True, inferSchema=True)

# Attach coordinates to each property by postal code.
# A left join keeps every property row; properties without a matching postal
# code get NULL LATITUDE/LONGITUDE.
# NOTE(review): the match is an exact equality on the raw values - confirm both
# files format postal codes identically (case, embedded space).
# NOTE(review): if the lookup file repeats a POSTAL_CODE, the matching property
# rows are repeated as well - confirm the lookup is unique per postal code.
postal_code_matches = vancouver_df.PROPERTY_POSTAL_CODE == postal_codes_df.POSTAL_CODE
joined_df = (
    vancouver_df
    .join(postal_codes_df, on=postal_code_matches, how='left')
    # Projecting here already leaves the lookup's POSTAL_CODE column out of the
    # result, so no separate drop() is needed afterwards.
    .select(vancouver_df["*"], postal_codes_df.LATITUDE, postal_codes_df.LONGITUDE)
)

joined_df.show(truncate=False)





+--------------------+----------+--------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------

                                                                                

In [34]:
from pyspark.sql.functions import col, isnan, trim
# Keep only the rows whose LATITUDE and LONGITUDE are usable numbers.
#
# Each coordinate is trimmed and cast to double once. With Spark's default
# (non-ANSI) cast, blank strings and non-numeric text become NULL, so a single
# "not NULL and not NaN" test per column rejects missing, empty, non-numeric
# and NaN values alike. The earlier version only tested isnan() on the cast
# value; isnan(NULL) is false, so non-numeric text slipped through the filter.
latitude_value = trim(col("LATITUDE")).cast("double")
longitude_value = trim(col("LONGITUDE")).cast("double")

cleaned_df = joined_df.filter(
    latitude_value.isNotNull() & ~isnan(latitude_value)
    & longitude_value.isNotNull() & ~isnan(longitude_value)
)

In [37]:
# Report the row count before cleaning, then after it, so the number of rows
# removed by the coordinate filter can be read off directly.
for frame in (joined_df, cleaned_df):
    print(frame.count())

# Preview the cleaned rows (default show() row limit and truncation).
cleaned_df.show()


                                                                                

15611


                                                                                

15557
+--------------------+----------+--------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------

In [40]:
# Collapse to a single partition so the export directory holds one part file,
# then write it as CSV with a header row, replacing any previous export.
(
    cleaned_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .csv("add-cords")
)

# Release the Spark session now that the export has finished.
spark.stop()

                                                                                