In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, split
from pyspark.sql.types import DoubleType
from geopy.distance import great_circle

# Obtain the Spark session for this notebook (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName("DistanceCalculation")
    .getOrCreate()
)

# Read the cleaned modelling dataset with a header row and inferred column types,
# and mark it as cached because later cells keep deriving new frames from it.
df = (
    spark.read
    .csv('Cleaned/ML-Model-Cleaned.csv', header=True, inferSchema=True)
    .cache()
)

24/04/07 12:44:01 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
24/04/07 12:44:01 WARN CacheManager: Asked to cache already cached data.


In [24]:
# Remove the CLOSEST_GRAFFITI_METERS column from the working frame (drop() returns a
# new DataFrame, hence the reassignment), then preview the first rows.
df = df.drop('CLOSEST_GRAFFITI_METERS')
df.show()


+--------------------+----------+--------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------

In [25]:
from functools import reduce
from operator import or_

from pyspark.sql.functions import col, when, lit

# Count how many rows contain a 0 in at least one feature column. Feature columns are
# all columns whose name does not contain CURRENT_LAND_VALUE or CURRENT_IMPROVEMENT_VALUE
# (the two values to be predicted), so the targets themselves are not inspected here.
feature_columns = [
    name for name in df.columns
    if 'CURRENT_LAND_VALUE' not in name and 'CURRENT_IMPROVEMENT_VALUE' not in name
]

# Build one "any feature equals 0" predicate instead of chaining a withColumn per column:
# the count is the same, but the query plan stays flat and df itself is left untouched.
# A comparison against null never evaluates to true, so nulls are not counted as zeros;
# lit(False) seeds the fold so an empty feature list matches no rows.
has_zero_feature = reduce(or_, [col(name) == 0 for name in feature_columns], lit(False))

zero_rows_count = df.filter(has_zero_feature).count()
total_rows_count = df.count()
print(f"Number of rows with zeros in feature columns: {zero_rows_count} out of {total_rows_count}")

Number of rows with zeros in feature columns: 756 out of 15496


In [27]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract
from pyspark.sql.types import StringType, FloatType

# Patterns for CLOSEST_SCHOOL_METERS values shaped like "{<decimal>, <text>}":
# the first captures the decimal number that directly follows "{", the second
# captures everything between the comma and the closing "}".
distance_pattern = r'\{(\d+\.\d+),'
school_type_pattern = r',\s*([^}]+)\}'

# Split the packed column into a numeric distance and a school-type string.
df = df.withColumn('SCHOOL_DISTANCE', regexp_extract('CLOSEST_SCHOOL_METERS', distance_pattern, 1).cast(FloatType()))
df = df.withColumn('SCHOOL_TYPE', regexp_extract('CLOSEST_SCHOOL_METERS', school_type_pattern, 1).cast(StringType()))

# DataFrames are immutable: drop() returns a new frame, so the result has to be
# assigned back, otherwise the packed source column silently stays in df.
df = df.drop('CLOSEST_SCHOOL_METERS')
df.show()


+--------------------+----------+--------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------

In [29]:
# Split CLOSEST_PARK_METERS, whose values are matched as "{<decimal>, <decimal>}",
# into the distance (number after "{") and the park area (number before "}").
df = df.withColumn('PARK_DISTANCE', regexp_extract('CLOSEST_PARK_METERS', r'\{(\d+\.\d+),', 1).cast(FloatType()))
df = df.withColumn('PARK_AREA_HECTARES', regexp_extract('CLOSEST_PARK_METERS', r',\s*(\d+\.\d+)\}', 1).cast(FloatType()))

# drop() returns a new DataFrame; assign it back so the packed source column is
# actually removed from df.
df = df.drop('CLOSEST_PARK_METERS')
df.show()

+--------------------+----------+--------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------

In [31]:
# IntegerType is used below; import the cast types here so this cell does not fail
# with a NameError on a fresh kernel.
from pyspark.sql.types import FloatType, IntegerType, StringType

# Patterns for CULTURAL_SPACES_DETAILS values matched as "{<4-digit year>, <text>, <decimal>}".
pattern_year = r'\{(\d{4}),'
pattern_type = r',\s*([^,]+),'
pattern_distance = r',\s*(\d+\.\d+)\}'

# Use regexp_extract to create new columns for year, type, and distance
df = df.withColumn('CULTURAL_SPACE_YEAR', regexp_extract('CULTURAL_SPACES_DETAILS', pattern_year, 1).cast(IntegerType()))
df = df.withColumn('CULTURAL_SPACE_TYPE', regexp_extract('CULTURAL_SPACES_DETAILS', pattern_type, 1).cast(StringType()))
df = df.withColumn('CULTURAL_SPACE_DISTANCE', regexp_extract('CULTURAL_SPACES_DETAILS', pattern_distance, 1).cast(FloatType()))

# drop() returns a new DataFrame; assign it back so the packed source column is
# actually removed from df.
df = df.drop('CULTURAL_SPACES_DETAILS')
df.show()


+--------------------+----------+--------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------

In [54]:
# Load the Greater Vancouver immigration table.
df_migration1 = spark.read.csv("Cleaned/greater_vancouver_immigration.csv", header=True, inferSchema=True)

# In one pass: keep the 'Greater Vancouver' region, derive an integer Year from the
# part of "Census Year" before the first "-", keep 2006 onward, and retain only the
# year plus the two migration counts.
df_migration1_filtered = (
    df_migration1
    .filter(col('Region') == 'Greater Vancouver')
    .withColumn("Year", split(col("Census Year"), "-")[0].cast('integer'))
    .filter(col('Year') >= 2006)
    .select('Year', 'Immigrants', 'Emigrants')
)

df_migration1_filtered.show()

+----+----------+---------+
|Year|Immigrants|Emigrants|
+----+----------+---------+
|2006|     32245|     9838|
|2007|     36617|     9449|
|2008|     35977|     8334|
|2009|     36850|     8906|
|2010|     32857|     8783|
|2011|     29630|     9542|
|2012|     29822|     9560|
|2013|     30466|     9775|
|2014|     25159|     9752|
|2015|     34750|     9574|
|2016|     28023|     8595|
|2017|     33295|     7995|
|2018|     34562|     6842|
|2019|     36449|     4932|
|2020|     27998|     5061|
|2021|     65975|     7052|
+----+----------+---------+



In [58]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import col, lit

# 2022-2024 are absent from the immigration table, so they are extrapolated with
# one linear regression per target column, each using Year as its only feature.
vectorAssembler = VectorAssembler(inputCols=["Year"], outputCol="features")
df_vector = vectorAssembler.transform(df_migration1_filtered)

# Two independent estimators that share the same feature vector.
lr_immigrants = LinearRegression(featuresCol="features", labelCol="Immigrants")
lr_emigrants = LinearRegression(featuresCol="features", labelCol="Emigrants")
model_immigrants = lr_immigrants.fit(df_vector)
model_emigrants = lr_emigrants.fit(df_vector)

# Years to extrapolate, assembled into the same "features" layout as the training data.
years_to_predict = spark.createDataFrame([(year,) for year in (2022, 2023, 2024)], ["Year"])
df_predict_vector = vectorAssembler.transform(years_to_predict)

# Predictions are cast to integer so they line up with the historical counts.
predictions_immigrants = (
    model_immigrants.transform(df_predict_vector)
    .select("Year", col("prediction").cast("integer").alias("Immigrants"))
)
predictions_emigrants = (
    model_emigrants.transform(df_predict_vector)
    .select("Year", col("prediction").cast("integer").alias("Emigrants"))
)

# One row per predicted year carrying both predicted columns.
predictions_combined = predictions_immigrants.join(predictions_emigrants, on="Year")


24/04/07 13:57:11 WARN Instrumentation: [d0f22374] regParam is zero, which might cause numerical instability and overfitting.
24/04/07 13:57:12 WARN Instrumentation: [f4bf41ab] regParam is zero, which might cause numerical instability and overfitting.


In [60]:
# Append the predicted years to the historical immigration table.
# Any historical column other than Year / Immigrants / Emigrants is added to the
# predictions as a null literal so both frames expose the same set of columns.
additional_columns = [
    col_name
    for col_name in df_migration1_filtered.columns
    if col_name not in ("Year", "Immigrants", "Emigrants")
]
for col_name in additional_columns:
    predictions_combined = predictions_combined.withColumn(col_name, lit(None))

# union() matches columns by position, so put the predictions in the historical order first.
historical_columns = df_migration1_filtered.columns
predictions_combined = predictions_combined.select(historical_columns)
df_immigration_final = df_migration1_filtered.union(predictions_combined)

df_immigration_final.orderBy("Year").show()

+----+----------+---------+
|Year|Immigrants|Emigrants|
+----+----------+---------+
|2006|     32245|     9838|
|2007|     36617|     9449|
|2008|     35977|     8334|
|2009|     36850|     8906|
|2010|     32857|     8783|
|2011|     29630|     9542|
|2012|     29822|     9560|
|2013|     30466|     9775|
|2014|     25159|     9752|
|2015|     34750|     9574|
|2016|     28023|     8595|
|2017|     33295|     7995|
|2018|     34562|     6842|
|2019|     36449|     4932|
|2020|     27998|     5061|
|2021|     65975|     7052|
|2022|     39204|     6311|
|2023|     39768|     6068|
|2024|     40331|     5825|
+----+----------+---------+



In [37]:
df_migration2 = spark.read.csv("Cleaned/yearly_provincial_migration_into_bc.csv", header=True, inferSchema=True)

# Keep rows from 2006 onward (only a lower bound is applied here) and retain the
# year together with the inbound and outbound migration totals.
df_migration2_filtered = (
    df_migration2
    .where(col('Year') >= 2006)
    .select('Year', 'Total_migration_into_BC', 'Total_migration_out_of_BC')
)

df_migration2_filtered.show()

+----+-----------------------+-------------------------+
|Year|Total_migration_into_BC|Total_migration_out_of_BC|
+----+-----------------------+-------------------------+
|2006|                  58830|                    46031|
|2007|                  57032|                    40256|
|2008|                  53663|                    42814|
|2009|                  48109|                    38437|
|2010|                  50050|                    43838|
|2011|                  49017|                    48306|
|2012|                  48291|                    52613|
|2013|                  50172|                    47658|
|2014|                  61032|                    45173|
|2015|                  68375|                    45548|
|2016|                  65364|                    41778|
|2017|                  59738|                    44445|
|2018|                  59231|                    46508|
|2019|                  60338|                    46075|
|2020|                  62889| 

In [51]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import lit

# 2022-2024 are extrapolated with one linear regression per migration direction,
# each using Year as its only feature.
vectorAssembler = VectorAssembler(inputCols=["Year"], outputCol="features")
df_vector = vectorAssembler.transform(df_migration2_filtered)

# Training rows are those up to and including 2021; the years to predict are
# assembled into the same "features" layout.
df_train = df_vector.where(df_vector["Year"] <= 2021)
years_to_predict = [2022, 2023, 2024]
df_test = spark.createDataFrame([(year,) for year in years_to_predict], ["Year"])
df_test_vector = vectorAssembler.transform(df_test)

# Inbound migration: fit, predict, and name the prediction after the column it fills.
lr = LinearRegression(featuresCol="features", labelCol="Total_migration_into_BC")
model_into = lr.fit(df_train)
predictions_into = (
    model_into.transform(df_test_vector)
    .withColumnRenamed("prediction", "Total_migration_into_BC")
)

# Outbound migration: same steps with the other label column.
lr = LinearRegression(featuresCol="features", labelCol="Total_migration_out_of_BC")
model_out = lr.fit(df_train)
predictions_out = (
    model_out.transform(df_test_vector)
    .withColumnRenamed("prediction", "Total_migration_out_of_BC")
)

24/04/07 13:39:36 WARN Instrumentation: [a68756c9] regParam is zero, which might cause numerical instability and overfitting.
24/04/07 13:39:37 WARN Instrumentation: [469511cd] regParam is zero, which might cause numerical instability and overfitting.


In [53]:
# Merge back to dataset
predictions_combined = predictions_into.join(predictions_out, "Year")
predictions_combined = predictions_combined.select(df_migration2_filtered.columns)

df_migration_final = df_migration2_filtered.union(predictions_combined)
df_migration_final = df_migration_final.withColumn("Total_migration_into_BC", col("Total_migration_into_BC").cast("integer"))
df_migration_final = df_migration_final.withColumn("Total_migration_out_of_BC", col("Total_migration_out_of_BC").cast("integer"))
df_migration_final.orderBy("Year").show()

+----+-----------------------+-------------------------+
|Year|Total_migration_into_BC|Total_migration_out_of_BC|
+----+-----------------------+-------------------------+
|2006|                  58830|                    46031|
|2007|                  57032|                    40256|
|2008|                  53663|                    42814|
|2009|                  48109|                    38437|
|2010|                  50050|                    43838|
|2011|                  49017|                    48306|
|2012|                  48291|                    52613|
|2013|                  50172|                    47658|
|2014|                  61032|                    45173|
|2015|                  68375|                    45548|
|2016|                  65364|                    41778|
|2017|                  59738|                    44445|
|2018|                  59231|                    46508|
|2019|                  60338|                    46075|
|2020|                  62889| 

In [63]:
from pyspark.sql.functions import first, lit, col


def _pivot_years_wide(yearly_df, value_column, prefix):
    """Spread a long (Year, value) table into wide ``<prefix>_<year>`` columns.

    Args:
        yearly_df: DataFrame containing a "Year" column and the column named by
            ``value_column``.
        value_column: Name of the column whose per-year value becomes a column.
        prefix: Text placed in front of each year in the resulting column names.

    Returns:
        A DataFrame with a constant "dummy" key column (kept so the caller can join
        on it) plus one ``<prefix>_<year>`` column per distinct Year in ``yearly_df``.
    """
    # Grouping on a constant puts every row in a single group, so the pivot collapses
    # the whole table into one wide row.
    wide = yearly_df.groupBy(lit(1).alias("dummy")).pivot("Year").agg(first(value_column))
    # Rename every year column the pivot actually produced rather than a hard-coded
    # range, so no year can be left unprefixed and collide with another table's column.
    for column_name in wide.columns:
        if column_name != "dummy":
            wide = wide.withColumnRenamed(column_name, f"{prefix}_{column_name}")
    return wide


# Pivot data so Years are in the column spot
pivot_immigration = _pivot_years_wide(df_immigration_final, "Immigrants", "IMMIGRATION")
pivot_emigration = _pivot_years_wide(df_immigration_final, "Emigrants", "EMIGRATION")
pivot_migration_in = _pivot_years_wide(df_migration_final, "Total_migration_into_BC", "MIGRATION_IN")
pivot_migration_out = _pivot_years_wide(df_migration_final, "Total_migration_out_of_BC", "MIGRATION_OUT")

# Stitch the four wide rows together on the shared constant key, then discard the key.
combined_df = pivot_immigration.join(pivot_emigration, "dummy", "outer") \
                               .join(pivot_migration_in, "dummy", "outer") \
                               .join(pivot_migration_out, "dummy", "outer") \
                               .drop("dummy")


combined_df.show(truncate=False)

                                                                                

+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+-----------

In [64]:
# combined_df is built from pivots grouped on a constant key, so the cross join
# appends its yearly migration columns to every row of df.
# NOTE(review): re-running this cell appends the same columns again, because df is
# reassigned in place.
df = df.crossJoin(combined_df)
df.show()

                                                                                

+--------------------+----------+--------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------

In [84]:
# Read the yearly construction table and keep only the year and the Vancouver column.
df_construction = (
    spark.read
    .csv('Cleaned/YearlyConstructionNeighbours.csv', header=True, inferSchema=True)
    .select('Year', "Vancouver")
)

df_construction.show()

+----+---------+
|Year|Vancouver|
+----+---------+
|2010|    15217|
|2011|    17867|
|2012|    19027|
|2013|    18696|
|2014|    19212|
|2015|    20863|
|2016|    27914|
|2017|    26204|
|2018|    23404|
|2019|    28141|
|2020|    22371|
|2021|    26013|
|2022|    25983|
|2023|    33244|
+----+---------+



In [85]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import col

# Assemble Year into the single-feature vector the regression expects.
vectorAssembler = VectorAssembler(inputCols=["Year"], outputCol="features")
df_vector = vectorAssembler.transform(df_construction)

# Fit a linear trend of the Vancouver column against Year.
lr = LinearRegression(featuresCol="features", labelCol="Vancouver")
model = lr.fit(df_vector)

# Years to fill in with the fitted trend: 2006-2009 and 2024.
years_to_predict = [(year,) for year in (2006, 2007, 2008, 2009, 2024)]
df_predict = spark.createDataFrame(years_to_predict, ["Year"])
df_predict_vector = vectorAssembler.transform(df_predict)

# Score those years, then rename the prediction so it shares the source column name.
predictions = (
    model.transform(df_predict_vector)
    .select("Year", col("prediction").alias("Predicted_Vancouver"))
)
predictions_formatted = predictions.withColumnRenamed("Predicted_Vancouver", "Vancouver")

# Stack the source rows and the predicted rows (union matches columns by position),
# cast the combined column to integer, and order by year for display.
df_combined = (
    df_construction
    .union(predictions_formatted)
    .withColumn("Vancouver", col("Vancouver").cast("integer"))
)
df_combined_sorted = df_combined.orderBy("Year")

df_combined_sorted.show()


24/04/07 15:35:30 WARN Instrumentation: [b99240b6] regParam is zero, which might cause numerical instability and overfitting.


+----+---------+
|Year|Vancouver|
+----+---------+
|2006|    12474|
|2007|    13491|
|2008|    14508|
|2009|    15525|
|2010|    15217|
|2011|    17867|
|2012|    19027|
|2013|    18696|
|2014|    19212|
|2015|    20863|
|2016|    27914|
|2017|    26204|
|2018|    23404|
|2019|    28141|
|2020|    22371|
|2021|    26013|
|2022|    25983|
|2023|    33244|
|2024|    30782|
+----+---------+



In [86]:
# Collapse the yearly table into one wide row: group on a constant key, pivot Year
# into columns, then discard the key that was only needed for grouping.
df_combined_sorted = (
    df_combined_sorted
    .groupBy(lit("1").alias("dummy"))
    .pivot("Year")
    .agg(first("Vancouver"))
    .drop("dummy")
)

# Prefix each year column so it stays identifiable once joined onto the main frame.
for year in range(2006, 2025):
    df_combined_sorted = df_combined_sorted.withColumnRenamed(str(year), f"NEW_CONSTRUCTION_{year}")

df_combined_sorted.show()

+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+
|NEW_CONSTRUCTION_2006|NEW_CONSTRUCTION_2007|NEW_CONSTRUCTION_2008|NEW_CONSTRUCTION_2009|NEW_CONSTRUCTION_2010|NEW_CONSTRUCTION_2011|NEW_CONSTRUCTION_2012|NEW_CONSTRUCTION_2013|NEW_CONSTRUCTION_2014|NEW_CONSTRUCTION_2015|NEW_CONSTRUCTION_2016|NEW_CONSTRUCTION_2017|NEW_CONSTRUCTION_2018|NEW_CONSTRUCTION_2019|NEW_CONSTRUCTION_2020|NEW_CONSTRUCTION_2021|NEW_CONSTRUCTION_2022|NEW_CONSTRUCTION_2023|NEW_CONSTRUCTION_2024|
+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+-----

In [87]:
# df_combined_sorted was pivoted on a constant key into a wide row of
# NEW_CONSTRUCTION_<year> columns; the cross join appends them to every row of df.
# NOTE(review): re-running this cell appends the same columns again, because df is
# reassigned in place.
df = df.crossJoin(df_combined_sorted)
df.show()

                                                                                

+--------------------+----------+--------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------------------------

In [90]:
# Write the enriched frame as CSV with a header row, overwriting any previous output
# at this path. coalesce(1) reduces the frame to a single partition before writing.
# NOTE(review): Spark's CSV writer treats the path as an output directory of part
# files — confirm downstream readers expect that.
df.coalesce(1).write.csv("Cleaned/ML-Data.csv", mode="overwrite", header=True)

                                                                                

In [None]:
# Stop the Spark session used by this notebook.
spark.stop()