In [102]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import functions as F

In [103]:
# Initialize Spark session
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

In [104]:
# Read the Parquet file into a Spark DataFrame
sdf = spark.read.parquet("../data/curated/prop_data_cleaned2.parquet")
sdf.printSchema()

root
 |-- name: string (nullable = true)
 |-- property_url: string (nullable = true)
 |-- beds: integer (nullable = true)
 |-- baths: integer (nullable = true)
 |-- parkings: integer (nullable = true)
 |-- cost: integer (nullable = true)
 |-- postal_code: integer (nullable = true)
 |-- built_in_wardrobes: integer (nullable = true)
 |-- dishwasher: integer (nullable = true)
 |-- air_conditioning: integer (nullable = true)
 |-- floorboards: integer (nullable = true)
 |-- secure_parking: integer (nullable = true)
 |-- intercom: integer (nullable = true)
 |-- heating: integer (nullable = true)
 |-- ensuite: integer (nullable = true)
 |-- balcony_deck: integer (nullable = true)
 |-- gym: integer (nullable = true)
 |-- furnished: integer (nullable = true)
 |-- close_to_shops: integer (nullable = true)
 |-- close_to_transport: integer (nullable = true)
 |-- fully_fenced: integer (nullable = true)
 |-- close_to_schools: integer (nullable = true)
 |-- ducted_heating: integer (nullable = true)
 

In [105]:
def shape(sdf: DataFrame) -> None:
    """
    Returns the shape of a Spark DataFrame as a tuple (number of rows, number of columns).
    """
    num_rows = sdf.count()
    num_columns = len(sdf.columns)
    print(f"Shape of the DataFrame: {num_rows} rows, {num_columns} columns.")

shape(sdf)

Shape of the DataFrame: 9568 rows, 40 columns.


In [106]:
sdf.columns

['name',
 'property_url',
 'beds',
 'baths',
 'parkings',
 'cost',
 'postal_code',
 'built_in_wardrobes',
 'dishwasher',
 'air_conditioning',
 'floorboards',
 'secure_parking',
 'intercom',
 'heating',
 'ensuite',
 'balcony_deck',
 'gym',
 'furnished',
 'close_to_shops',
 'close_to_transport',
 'fully_fenced',
 'close_to_schools',
 'ducted_heating',
 'split_system_heating',
 'swimming_pool',
 'remote_garage',
 'balcony',
 'study',
 'garden_courtyard',
 'pets_allowed',
 'internal_laundry',
 'alarm_system',
 'prop_type_index',
 'parks',
 'schools',
 'supermarkets',
 'hospitals',
 'shopping_districts',
 'CBD',
 'train_stations']

In [107]:
# Select relevant features for the model
feature_columns = [
 'beds',
 'baths',
 'parkings',
 'postal_code',
 'built_in_wardrobes',
 'dishwasher',
 'air_conditioning',
 'floorboards',
 'secure_parking',
 'intercom',
 'heating',
 'ensuite',
 'balcony_deck',
 'gym',
 'furnished',
 'close_to_shops',
 'close_to_transport',
 'fully_fenced',
 'close_to_schools',
 'ducted_heating',
 'split_system_heating',
 'swimming_pool',
 'remote_garage',
 'balcony',
 'study',
 'garden_courtyard',
 'pets_allowed',
 'internal_laundry',
 'alarm_system',
 'prop_type_index',]
#  'parks',
#  'schools',
#  'supermarkets',
#  'hospitals',
#  'shopping_districts',
#  'CBD',
#  'train_stations']

In [90]:
# feature_columns = [
#     'beds', 'baths', 'parkings', 'prop_type_index',
#     # 'parks', 'schools', 'supermarkets', 'hospitals', 'shopping_districts', 'CBD', 'train_stations'
# ]

In [91]:
# Create a VectorAssembler to combine feature columns into a single vector
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Transform the features into a single vector column
data = sdf.select('cost', *feature_columns)
data = assembler.transform(data)

In [92]:
# Create a StandardScaler instance
scaler = StandardScaler(inputCol='features', outputCol='scaled_features', withMean=True, withStd=True)

# Fit the scaler to the data
scaler_model = scaler.fit(data)

# Transform the data using the fitted scaler
data = scaler_model.transform(data)

In [93]:
# Split the data into training and test sets
train_data, test_data = data.randomSplit([0.7, 0.3], seed=1003)

In [94]:
# Initialize the Linear Regression model
lr = LinearRegression(featuresCol='scaled_features', labelCol='cost')

# Fit the model to the training data
lr_model = lr.fit(train_data)

24/10/01 22:41:37 WARN Instrumentation: [7bf58649] regParam is zero, which might cause numerical instability and overfitting.


In [95]:
# Print model summary
print("Intercept: " + str(lr_model.intercept))
print("RMSE: " + str(lr_model.summary.rootMeanSquaredError))
print("r2: " + str(lr_model.summary.r2))

# Get the coefficients and feature names
coefficients = lr_model.coefficients.toArray()
feature_names = feature_columns

# Create a DataFrame from the coefficients and feature names
coefficients_df = pd.DataFrame({
    'feature': feature_names,
    'coefficient': coefficients
})

# Sort the DataFrame by the 'coefficient' column in decreasing order
coefficients_df = coefficients_df.sort_values(by='coefficient', ascending=False)

# Evaluate the model on the test data
test_results = lr_model.evaluate(test_data)

# Print evaluation metrics
print("Test RMSE: ", test_results.rootMeanSquaredError)
print("Test r2: ", test_results.r2)

coefficients_df

Intercept: 596.6640519454334
RMSE: 139.13544082137483
r2: 0.33490626225029996
Test RMSE:  142.19273615855732
Test r2:  0.3232753858150933


Unnamed: 0,feature,coefficient
0,beds,62.304892
1,baths,51.250768
21,swimming_pool,15.29303
14,furnished,14.718526
7,floorboards,13.752217
5,dishwasher,13.35812
12,balcony_deck,11.820727
9,intercom,10.842195
25,garden_courtyard,10.430162
23,balcony,9.947914


In [96]:
# Initialize the Lasso Regression model
lasso = LinearRegression(featuresCol='features', labelCol='cost', elasticNetParam=0.5, regParam=0.1)

# Fit the model to the data
lasso_model = lasso.fit(train_data)

In [97]:
# Print model summary
print("Intercept: " + str(lasso_model.intercept))
print("RMSE: " + str(lasso_model.summary.rootMeanSquaredError))
print("r2: " + str(lasso_model.summary.r2))

# Get the coefficients and feature names
lasso_coefficients = lasso_model.coefficients.toArray()

# Create a DataFrame from the coefficients and feature names
lasso_coefficients_df = pd.DataFrame({
    'feature': feature_names,
    'coefficient': lasso_coefficients
})

# Sort the DataFrame by the 'coefficient' column in decreasing order
lasso_coefficients_df = lasso_coefficients_df.sort_values(by='coefficient', ascending=False)

# Evaluate the model on the test data
lasso_test_results = lasso_model.evaluate(test_data)

# Print evaluation metrics
print("Lasso Test RMSE: ", lasso_test_results.rootMeanSquaredError)
print("Lasso Test r2: ", lasso_test_results.r2)

lasso_coefficients_df

Intercept: 472.5881776632955
RMSE: 139.136049700682
r2: 0.334900441120932
Lasso Test RMSE:  142.18738060457179
Lasso Test r2:  0.3233263612312901


Unnamed: 0,feature,coefficient
1,baths,86.01792
21,swimming_pool,66.799148
14,furnished,58.133238
0,beds,57.326892
25,garden_courtyard,48.135945
23,balcony,44.16408
12,balcony_deck,42.702986
7,floorboards,39.158605
24,study,35.929214
13,gym,32.475599


In [98]:
def backward_elimination(data: DataFrame, features: list, label: str, stop_threshold=0.1):
    features_to_keep = features.copy()
    
    while len(features_to_keep) > 0:
        print(f"Training model with {len(features_to_keep)} features.")
        
        # Assemble the feature vector for the current set of features
        assembler = VectorAssembler(inputCols=features_to_keep, outputCol="features")
        
        # Drop the existing 'features' column if it exists
        if 'features' in data.columns:
            data = data.drop('features')

        # Transform the data to create the new 'features' column
        data_assembled = assembler.transform(data).select("features", label)
        
        # Create a StandardScaler instance
        scaler = StandardScaler(inputCol='features', outputCol='scaled_features', withMean=True, withStd=True)
        
        # Fit the scaler to the data and transform it
        scaler_model = scaler.fit(data_assembled)
        data_scaled = scaler_model.transform(data_assembled)
        
        # Train the model on the scaled features
        lr = LinearRegression(featuresCol="scaled_features", labelCol=label)
        lr_model = lr.fit(data_scaled)
        
        # Get the coefficients and associated features
        coefficients = lr_model.coefficients
        coef_feature_pairs = list(zip(coefficients, features_to_keep))
        
        # Find the least significant feature (smallest coefficient magnitude)
        least_significant_feature = min(coef_feature_pairs, key=lambda x: abs(x[0]))[1]
        
        # Check the magnitude of the smallest coefficient using absolute value
        if abs(min(coef_feature_pairs, key=lambda x: abs(x[0]))[0]) < stop_threshold:
            print(f"Removing least significant feature: {least_significant_feature}")
            features_to_keep.remove(least_significant_feature)
        else:
            break
    
    print(f"Final set of features: {features_to_keep}")
    return features_to_keep

In [16]:
# Perform backward elimination
final_features = backward_elimination(data, feature_columns, 'cost', stop_threshold=8.5)

Training model with 37 features.


24/10/01 22:20:26 WARN Instrumentation: [25b33cdf] regParam is zero, which might cause numerical instability and overfitting.


Removing least significant feature: hospitals
Training model with 36 features.


24/10/01 22:20:28 WARN Instrumentation: [6d407b2a] regParam is zero, which might cause numerical instability and overfitting.


Removing least significant feature: secure_parking
Training model with 35 features.


24/10/01 22:20:29 WARN Instrumentation: [d4464daa] regParam is zero, which might cause numerical instability and overfitting.


Removing least significant feature: alarm_system
Training model with 34 features.


24/10/01 22:20:31 WARN Instrumentation: [bb3285f2] regParam is zero, which might cause numerical instability and overfitting.


Removing least significant feature: supermarkets
Training model with 33 features.


24/10/01 22:20:33 WARN Instrumentation: [2ce9bbf2] regParam is zero, which might cause numerical instability and overfitting.


Removing least significant feature: shopping_districts
Training model with 32 features.


24/10/01 22:20:35 WARN Instrumentation: [115c2d64] regParam is zero, which might cause numerical instability and overfitting.


Removing least significant feature: schools
Training model with 31 features.


24/10/01 22:20:37 WARN Instrumentation: [a15595e0] regParam is zero, which might cause numerical instability and overfitting.


Removing least significant feature: CBD
Training model with 30 features.


24/10/01 22:20:38 WARN Instrumentation: [7c58863d] regParam is zero, which might cause numerical instability and overfitting.


Removing least significant feature: parks
Training model with 29 features.


24/10/01 22:20:40 WARN Instrumentation: [d8022bcd] regParam is zero, which might cause numerical instability and overfitting.


Removing least significant feature: parkings
Training model with 28 features.


24/10/01 22:20:41 WARN Instrumentation: [1d9762b0] regParam is zero, which might cause numerical instability and overfitting.


Removing least significant feature: train_stations
Training model with 27 features.


24/10/01 22:20:43 WARN Instrumentation: [50aecd9f] regParam is zero, which might cause numerical instability and overfitting.


Removing least significant feature: close_to_schools
Training model with 26 features.


24/10/01 22:20:44 WARN Instrumentation: [51dab10e] regParam is zero, which might cause numerical instability and overfitting.


Removing least significant feature: heating
Training model with 25 features.


24/10/01 22:20:46 WARN Instrumentation: [3c534121] regParam is zero, which might cause numerical instability and overfitting.


Removing least significant feature: gym
Training model with 24 features.


24/10/01 22:20:47 WARN Instrumentation: [72c346a6] regParam is zero, which might cause numerical instability and overfitting.


Removing least significant feature: pets_allowed
Training model with 23 features.


24/10/01 22:20:48 WARN Instrumentation: [8d442995] regParam is zero, which might cause numerical instability and overfitting.


Removing least significant feature: split_system_heating
Training model with 22 features.


24/10/01 22:20:50 WARN Instrumentation: [3f1dea29] regParam is zero, which might cause numerical instability and overfitting.


Removing least significant feature: close_to_shops
Training model with 21 features.


24/10/01 22:20:51 WARN Instrumentation: [47f1b5c4] regParam is zero, which might cause numerical instability and overfitting.


Removing least significant feature: close_to_transport
Training model with 20 features.


24/10/01 22:20:53 WARN Instrumentation: [9b150542] regParam is zero, which might cause numerical instability and overfitting.


Removing least significant feature: internal_laundry
Training model with 19 features.


24/10/01 22:20:54 WARN Instrumentation: [e4b14ef7] regParam is zero, which might cause numerical instability and overfitting.


Final set of features: ['beds', 'baths', 'postal_code', 'built_in_wardrobes', 'dishwasher', 'air_conditioning', 'floorboards', 'intercom', 'ensuite', 'balcony_deck', 'furnished', 'fully_fenced', 'ducted_heating', 'swimming_pool', 'remote_garage', 'balcony', 'study', 'garden_courtyard', 'prop_type_index']


In [17]:
# Create a VectorAssembler to combine feature columns into a single vector
assembler = VectorAssembler(inputCols=final_features, outputCol='features')

# Transform the features into a single vector column
data = sdf.select('cost', *feature_columns)
data = assembler.transform(data)

In [18]:
# Create a StandardScaler instance
scaler = StandardScaler(inputCol='features', outputCol='scaled_features', withMean=True, withStd=True)

# Fit the scaler to the data
scaler_model = scaler.fit(data)

# Transform the data using the fitted scaler
data = scaler_model.transform(data)

In [19]:
# Split the data into training and test sets
train_data, test_data = data.randomSplit([0.7, 0.3], seed=1003)

In [20]:
# Initialize the Linear Regression model
lr = LinearRegression(featuresCol='scaled_features', labelCol='cost')

# Fit the model to the training data
lr_model = lr.fit(train_data)

24/10/01 22:20:56 WARN Instrumentation: [6a799155] regParam is zero, which might cause numerical instability and overfitting.


In [21]:
# Print model summary
print("Intercept: " + str(lr_model.intercept))
print("RMSE: " + str(lr_model.summary.rootMeanSquaredError))
print("r2: " + str(lr_model.summary.r2))

# Get the coefficients and feature names
coefficients = lr_model.coefficients.toArray()
feature_names = final_features

# Create a DataFrame from the coefficients and feature names
coefficients_df = pd.DataFrame({
    'feature': feature_names,
    'coefficient': coefficients
})

# Sort the DataFrame by the 'coefficient' column in decreasing order
coefficients_df = coefficients_df.sort_values(by='coefficient', ascending=False)

# Evaluate the model on the test data
test_results = lr_model.evaluate(test_data)

# Print evaluation metrics
print("Test RMSE: ", test_results.rootMeanSquaredError)
print("Test r2: ", test_results.r2)

coefficients_df

Intercept: 630.6703919110146
RMSE: 198.2768635869822
r2: 0.33802264821596684
Test RMSE:  197.58438775770833
Test r2:  0.32943973636307733


Unnamed: 0,feature,coefficient
1,baths,86.141049
0,beds,64.821416
13,swimming_pool,25.593529
10,furnished,22.296458
16,study,18.268164
9,balcony_deck,17.010795
4,dishwasher,15.613591
15,balcony,15.363227
6,floorboards,14.407658
12,ducted_heating,13.298636


## TREE MODELS

In [108]:
# Create a VectorAssembler to combine feature columns into a single vector
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Select the target variable (cost) and prepare the final DataFrame
data = sdf.select('cost', *feature_columns)

# Transform the features into a single vector column
data = assembler.transform(data)

# Create a MinMaxScaler instance
scaler = StandardScaler(inputCol='features', outputCol='scaled_features')

# Fit the scaler to the data
scaler_model = scaler.fit(data)

# Transform the data using the fitted scaler
data = scaler_model.transform(data)

# Split the data into training and test sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1003)

In [115]:
# Initialize the Gradient Boosting model
gbt = GBTRegressor(featuresCol='scaled_features', labelCol='cost', maxIter=40)

# Fit the model to the training data
gbt_model = gbt.fit(train_data)

In [116]:
# Print model summary
print("Gradient Boosting Model Summary")
print("Number of Trees: ", gbt_model.getNumTrees)

# Evaluate the model on the test data
gbt_predictions = gbt_model.transform(test_data)

# Create evaluators for RMSE, MAE, and R-squared
rmse_evaluator = RegressionEvaluator(labelCol="cost", predictionCol="prediction", metricName="rmse")
mae_evaluator = RegressionEvaluator(labelCol="cost", predictionCol="prediction", metricName="mae")
r2_evaluator = RegressionEvaluator(labelCol="cost", predictionCol="prediction", metricName="r2")

# Evaluate the model on the test data
gbt_rmse = rmse_evaluator.evaluate(gbt_predictions)
gbt_mae = mae_evaluator.evaluate(gbt_predictions)
gbt_r2 = r2_evaluator.evaluate(gbt_predictions)

# Print evaluation metrics
print(f"Gradient Boosting Test RMSE: {gbt_rmse}, MAE: {gbt_mae}, R2: {gbt_r2}")

Gradient Boosting Model Summary
Number of Trees:  40
Gradient Boosting Test RMSE: 85.40797831651557, MAE: 59.22166132919866, R2: 0.5674306680533399


In [32]:
# Get feature importances
gbt_feature_importances = gbt_model.featureImportances

# Create a DataFrame from the feature importances
feature_importances_df = pd.DataFrame({
    'feature': feature_columns,
    'importance': gbt_feature_importances.toArray()
})

# Sort the DataFrame by the 'importance' column in decreasing order
feature_importances_df = feature_importances_df.sort_values(by='importance', ascending=False)
feature_importances_df

Unnamed: 0,feature,importance
3,postal_code,0.26222
1,baths,0.134581
0,beds,0.130232
2,parkings,0.067904
35,CBD,0.033036
32,supermarkets,0.029472
36,train_stations,0.029454
29,prop_type_index,0.028758
30,parks,0.026043
33,hospitals,0.025946
