In [None]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession, functions as F

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    # Eager evaluation: a DataFrame left as the last expression of a cell is rendered directly
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Session time zone is pinned to UTC
    .config("spark.sql.session.timeZone", "Etc/UTC")
    # Memory budgets for the driver and for each executor
    .config('spark.driver.memory', '4g')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)

import warnings
# NOTE: this silences every Python warning for the rest of the kernel session,
# including deprecation warnings from pandas / sklearn.
warnings.filterwarnings("ignore") # suppress warnings

In [121]:
# Read the curated inputs. The listings and growth tables stay as Spark
# DataFrames; the distances table is collected straight into pandas.
sdf = spark.read.parquet('../data/curated/merged_df.parquet')
growth = spark.read.parquet('../data/curated/growth_rate_20years.parquet')
distances = spark.read.parquet('../data/curated/distances.parquet').toPandas()

In [122]:
# Add two boolean flag columns derived from a case-insensitive substring
# search of `property_type`. The when/otherwise form means a NULL
# property_type yields False rather than NULL.
sdf = sdf.withColumn(
    # True when property_type contains "house" (any substring match, so e.g. "townhouse" also matches)
    'House',
    F.when(F.lower(F.col('property_type')).contains('house'), True).otherwise(False)
).withColumn(
    # True when property_type contains "flat" or "apartment"
    'flat',
    F.when(F.lower(F.col('property_type')).contains('flat') \
        | F.lower(F.col('property_type')).contains('apartment'), True ).otherwise(False)
)

In [123]:
# Remove exact duplicate listings, then exclude car spaces.
# The comparison is an exact, case-sensitive match on 'Carspace'; rows whose
# property_type is NULL are also removed, because a NULL predicate filters the row out.
# NOTE(review): confirm the casing of property_type values in the data.
sdf1 = sdf.dropDuplicates().filter(F.col('property_type') != 'Carspace')

+--------+------+---------------+---------+--------------------+----+-----+-------+--------------------+--------------------+----------------------------+------------------------------+-----------------------+-------------------+-----------------------------------+-------------------------------------+-----+-----+
|postcode|  cost|         suburb|furnished|       property_type|beds|baths|parking|              region|           lgaregion|total male population - 2021|total female population - 2021|total population - 2021|australian citizens|total region male population - 2022|total region female population - 2022|House| flat|
+--------+------+---------------+---------+--------------------+----+-----+-------+--------------------+--------------------+----------------------------+------------------------------+-----------------------+-------------------+-----------------------------------+-------------------------------------+-----+-----+
|    3977| 520.0|cranbourne-west|        0|         

In [124]:
from pyspark.sql import functions as F

# Keep only rows with a positive postcode.
growth = growth.filter(F.col('postcode') > 0)

# Per-category growth columns that may contain NULLs.
columns_to_fill = ['1 bed flats', '2 bed flats', '3 bed flats', '2 bed houses', '3 bed houses', '4 bed houses']

# Where a category-specific rate is NULL, fall back to the same row's
# 'All Properties' rate. A single projection is used instead of repeated
# withColumn calls; iterating growth.columns keeps the column order unchanged.
growth = growth.select([
    F.coalesce(growth[name], growth['All Properties']).alias(name)
    if name in columns_to_fill
    else growth[name]
    for name in growth.columns
])

growth.show()

+--------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|postcode|       1 bed flats|       2 bed flats|       3 bed flats|      2 bed houses|      3 bed houses|      4 bed houses|    All Properties|
+--------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|    3206|             1.125|1.0833333333333333|1.0093457943925233|1.1085714285714285|1.6823529411764706| 1.311912225705329|1.1666666666666667|
|    3143|1.3888888888888888|             1.375|1.5714285714285714|1.2794117647058822|1.3863636363636365|1.5334608030592733|1.3333333333333333|
|    3054| 1.247191011235955|1.1568627450980393| 1.102102102102102|1.4204946996466432|               1.5|1.6666666666666667|1.2818791946308725|
|    3052| 1.368421052631579|1.1333333333333333|1.3937677053824362|1.1666666666666667|            1.0575| 1.558139534883721|1.0357142857

In [126]:
from pyspark.sql import functions as F

# Reshape growth from wide to long: one row per (postcode, bed_column)
# carrying that category's growth_value.
growth_unpivoted = growth.select(
    "postcode",
    F.expr("stack(6, '1 bed flats', `1 bed flats`, '2 bed flats', `2 bed flats`, '3 bed flats', `3 bed flats`, '2 bed houses', `2 bed houses`, '3 bed houses', `3 bed houses`, '4 bed houses', `4 bed houses`) as (bed_column, growth_value)")
)

# Label each listing with the growth category it belongs to.
# The boolean `flat` / `House` flags built earlier are used instead of a raw
# `contains("flat")` on property_type: the flags are case-insensitive and
# `flat` also covers "apartment", so apartments are no longer left unlabelled.
# Flats are tested first, matching the original precedence.
is_flat = F.col("flat")
is_house = F.col("House")
beds = F.col("beds")

sdf1 = sdf1.withColumn(
    "bed_column",
    F.when(is_flat & (beds == 1), "1 bed flats")
    .when(is_flat & (beds == 2), "2 bed flats")
    .when(is_flat & (beds == 3), "3 bed flats")
    .when(is_house & (beds == 2), "2 bed houses")
    .when(is_house & (beds == 3), "3 bed houses")
    .when(is_house & (beds == 4), "4 bed houses")
)

# Inner join on both keys. Listings with no category (bed_column is NULL,
# e.g. a 1-bed house or a 5-bed house) have no match and are dropped here.
# TODO: decide whether those should fall back to the 'All Properties' rate.
sdf_growth = sdf1.join(growth_unpivoted, on=['postcode', 'bed_column'], how='inner')

# Show the result
sdf_growth.show()

+--------+------------+------+-----------------+---------+-------------+----+-----+-------+--------------------+--------------------+----------------------------+------------------------------+-----------------------+-------------------+-----------------------------------+-------------------------------------+-----+-----+------------------+
|postcode|  bed_column|  cost|           suburb|furnished|property_type|beds|baths|parking|              region|           lgaregion|total male population - 2021|total female population - 2021|total population - 2021|australian citizens|total region male population - 2022|total region female population - 2022|House| flat|      growth_value|
+--------+------------+------+-----------------+---------+-------------+----+-----+-------+--------------------+--------------------+----------------------------+------------------------------+-----------------------+-------------------+-----------------------------------+-------------------------------------+---

In [None]:
# Collect the Spark listings table into pandas for the pandas-based analysis below.
# NOTE(review): this converts `sdf`, not `sdf1`, so the Carspace filter applied
# to `sdf1` earlier is not reflected in `pdf` — confirm this is intended.
pdf = sdf.toPandas()
# Drop exact duplicate rows; `duplicate_rows` keeps the repeated occurrences for inspection.
duplicate_rows = pdf[pdf.duplicated()]
pdf = pdf.drop_duplicates()

# Disabled: natural-log transform of cost, population, rent and income columns.
# While this stays commented out, the columns of `pdf` are on their original scale.
# # Log transform the price
# from math import log

# # Calculating logarithmic values for the specified column
# pdf['cost'] = pdf['cost'].apply(lambda x: log(x))
# pdf['total male population - 2021'] = pdf['total male population - 2021'].apply(lambda x: log(x))
# pdf['total female population - 2021'] = pdf['total female population - 2021'].apply(lambda x: log(x))
# pdf['total population - 2021'] = pdf['total population - 2021'].apply(lambda x: log(x))
# pdf['australian citizens'] = pdf['australian citizens'].apply(lambda x: log(x))
# pdf['median rent'] = pdf['median rent'].apply(lambda x: log(x))
# pdf['median family weekly income'] = pdf['median family weekly income'].apply(lambda x: log(x))
# pdf['total region male population - 2022'] = pdf['total region male population - 2022'].apply(lambda x: log(x))
# pdf['total region female population - 2022'] = pdf['total region female population - 2022'].apply(lambda x: log(x))
pdf

# Planning notes for matching growth rates to listings:
# flat and apartment are different to house

# if Null - > all properties
# search for house
# else flat

Merging with quarterly rent analysis


In [5]:
import pandas as pd

# The quarterly-rent-analysis (QRA) merge is currently disabled.
# df_qra = spark.read.parquet('../data/curated/qra.parquet')
# df_qra = df_qra.toPandas()
# Cast the join key to string.
# NOTE(review): `pdf` is later merged with `distances` on 'postcode'; confirm
# the postcode key in `distances` is also a string, otherwise that merge will not line up.
pdf['postcode'] = pdf['postcode'].astype(str)
# df_merged = pd.merge(pdf, df_qra, on='postcode', how='inner')

In [6]:
# Drop the free-text region columns. errors='ignore' keeps the cell re-runnable:
# without it, a second execution raises KeyError because the columns are already gone.
pdf = pdf.drop(columns=['region', 'lgaregion'], errors='ignore')

In [None]:
# Number of rows in the Spark growth table.
growth.count()

In [None]:
# `distances` is a pandas DataFrame here, so this reports the non-null count per column.
distances.count()

In [9]:
# Drop identifying / coordinate columns so only the postcode key and the distance measures remain.
# errors='ignore' keeps the cell re-runnable once the columns have been removed.
distances = distances.drop(columns=['name', 'latitude', 'longitude'], errors='ignore')

In [10]:
# Average every remaining column per postcode (postcode becomes the index),
# then discard postcodes that still have a missing value.
distances = distances.groupby('postcode').mean().dropna()

In [None]:
# Disabled: loading of a pre-built regression table.
# sdf_reg = spark.read.parquet('../data/curated/qra_reg.parquet')
# sdf_reg.count()
# reg_features = sdf_reg.columns
# pdf_reg = sdf_reg.toPandas()

# pandas copy of the growth table (not used further in this cell).
growth_pdf = growth.toPandas()

import pandas as pd
# Inner-join listings with the per-postcode distance averages; postcodes
# missing from either side are dropped.
# NOTE(review): pdf['postcode'] was cast to str earlier — confirm the postcode
# key of `distances` has the same dtype.
df_merged = pd.merge(pdf, distances, on='postcode', how='inner')
df_merged = df_merged.drop_duplicates()
print(df_merged)
# Persist the merged table for the regression step.
df_merged.to_parquet('../data/curated/regression_data.parquet', engine='pyarrow', index=False)

PREPROCESSING NOTE: We still need to merge `growth` with `df_merged`.
Each listing should receive the growth rate that matches its property type and number of beds, e.g. a 1-bedroom flat gets a new growth column holding the value from the "1 bed flats" column.

In [12]:
# # Boosting
# from sklearn.ensemble import GradientBoostingRegressor
# >>> model = GradientBoostingRegressor(random_state=0).fit(x_train, y_train)
# >>> model.score(x_train, y_train)
# 0.8083859166342285
# >>> model.score(x_test, y_test)
# 0.7802104901623703

# >>> from sklearn.ensemble import RandomForestRegressor
# >>> model = RandomForestRegressor(random_state=0).fit(x_train, y_train)
# >>> model.score(x_train, y_train)
# 0.9727449572570027
# >>> model.score(x_test, y_test)

Exploratory Data Analysis

In [None]:
# Keep only the numeric modelling columns.
# Several of these names ('region', 'lgaregion', 'name', 'latitude',
# 'longitude') have already been dropped from `pdf` / `distances` in earlier
# cells, so a plain drop() raises KeyError on a fresh run. errors='ignore'
# removes whichever of them are still present.
non_feature_columns = ['suburb', 'name', 'property_type', 'region', 'lgaregion', 'latitude', 'longitude']
pdf_numeric = df_merged.drop(columns=non_feature_columns, errors='ignore')

In [None]:
# Discard every row that has at least one missing value, then report
# the remaining non-null count per column.
pdf_numeric = pdf_numeric.dropna(how='any')
pdf_numeric.count()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations between the numeric columns, with postcode left out.
correlation_pdf = pdf_numeric.drop(columns='postcode')
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(
    correlation_pdf.corr(),
    annot=True,
    fmt='.2f',
    cmap='BrBG',
    linewidths=2,
    ax=ax,
)

In [None]:
# Display the numeric table after missing rows were dropped.
pdf_numeric

In [80]:
# # stratified sampling --

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Per-postcode median of the 'median rent' column, cheapest first.
# NOTE(review): 'median rent' is not among the listing columns shown in the
# earlier cell output — confirm where this column is merged in.
medians = pdf_numeric.groupby(["postcode"])["median rent"].median()
median_pdf = pd.DataFrame(medians).sort_values(by=['median rent'])

# Ten cheapest postcodes. The index is cast to str so matplotlib treats the
# postcodes as categories and keeps the sorted order on the x axis.
lowest_10 = median_pdf.head(10)

fig, ax = plt.subplots()
ax.bar(lowest_10.index.astype(str), lowest_10['median rent'])

# The log transform is commented out earlier in the notebook and the data is
# grouped by postcode, so the labels no longer claim "log" or "Suburb".
ax.set_xlabel("Postcode")
ax.set_ylabel("Median rent (per week)")
ax.set_title("Lowest Median Rent Per Week by Postcode")
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Per-postcode median of the 'median rent' column, most expensive first.
medians = pdf_numeric.groupby(["postcode"])["median rent"].median()
median_pdf = pd.DataFrame(medians).sort_values(by=['median rent'], ascending=False)

# Ten most expensive postcodes; str index gives a categorical x axis in sorted order.
highest_10 = median_pdf.head(10)

fig, ax = plt.subplots()
ax.bar(highest_10.index.astype(str), highest_10['median rent'])

# Values are untransformed (the log transform is commented out earlier) and
# grouped by postcode, so the labels say so.
ax.set_xlabel("Postcode")
ax.set_ylabel("Median rent (per week)")
ax.set_title("Highest Median Rent Per Week by Postcode")
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Per-postcode MEAN of the 'median rent' column, most expensive first.
means = pdf_numeric.groupby(["postcode"])["median rent"].mean()
means_pdf = pd.DataFrame(means).sort_values(by=['median rent'], ascending=False)

# Ten postcodes with the highest mean; str index gives a categorical x axis.
highest_10 = means_pdf.head(10)

fig, ax = plt.subplots()
ax.bar(highest_10.index.astype(str), highest_10['median rent'])

# The statistic plotted is a mean of untransformed values grouped by postcode;
# the previous labels described it as a "median log rent" by suburb.
ax.set_xlabel("Postcode")
ax.set_ylabel("Mean of median rent (per week)")
ax.set_title("Highest Mean Rent Per Week by Postcode")
plt.show()

# Scatterplots - sample

In [None]:
# Display the per-postcode means, sorted from highest to lowest.
means_pdf

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the per-postcode mean for every postcode in `means_pdf`.
fig, ax = plt.subplots(figsize=(14, 5))
ax.bar(means_pdf.index.astype(str), means_pdf['median rent'])

# `means_pdf` holds means of untransformed values keyed by postcode, so the
# labels describe that instead of "Median log rent" by suburb.
ax.set_xlabel("Postcode")
ax.set_ylabel("Mean of median rent (per week)")
ax.set_title("Mean Rent Per Week by Postcode")
# Many bars share the axis; rotate the tick labels so they overlap less.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Display the numeric table used for the scatter plots below.
pdf_numeric

In [None]:
import matplotlib.pyplot as plt

# Median rent against distance to the closest school; axes are labelled so
# the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(pdf_numeric['distance_to_closest_school_km'], pdf_numeric['median rent'])
ax.set_xlabel("Distance to closest school (km)")
ax.set_ylabel("Median rent")
ax.set_title("Median rent vs distance to closest school")
plt.show()

In [None]:
# Median rent against distance to the CBD, with labelled axes.
fig, ax = plt.subplots()
ax.scatter(pdf_numeric['distance_to_cbd'], pdf_numeric['median rent'])
ax.set_xlabel("Distance to CBD")
ax.set_ylabel("Median rent")
ax.set_title("Median rent vs distance to CBD")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Median family weekly income against median rent (rent is on the x axis in
# this plot, unlike the other scatter plots), with labelled axes.
fig, ax = plt.subplots()
ax.scatter(pdf_numeric['median rent'], pdf_numeric['median family weekly income'])
ax.set_xlabel("Median rent")
ax.set_ylabel("Median family weekly income")
ax.set_title("Median family weekly income vs median rent")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Median rent against number of bedrooms, with labelled axes.
fig, ax = plt.subplots()
ax.scatter(pdf_numeric['beds'], pdf_numeric['median rent'])
ax.set_xlabel("Beds")
ax.set_ylabel("Median rent")
ax.set_title("Median rent vs number of beds")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Median rent against number of bathrooms, with labelled axes.
fig, ax = plt.subplots()
ax.scatter(pdf_numeric['baths'], pdf_numeric['median rent'])
ax.set_xlabel("Baths")
ax.set_ylabel("Median rent")
ax.set_title("Median rent vs number of baths")
plt.show()

In [92]:
# Round every numeric column to zero decimal places. This overwrites
# pdf_numeric, so fractional values (e.g. distances) lose their precision from here on.
pdf_numeric = pdf_numeric.round(decimals=0)

In [93]:
# Remove rows with missing values, then rows that became identical after rounding.
pdf_numeric = pdf_numeric.dropna().drop_duplicates()


In [None]:
# List the columns that remain available as model inputs.
pdf_numeric.columns

In [None]:
# One row per postcode: the median of every column, cast to int.
suburbs = pdf_numeric.groupby('postcode').median().astype(int)
suburbs

In [None]:
import numpy as np

# Round each postcode's median rent UP to the next multiple of 10 and keep
# the result as a plain list of Python ints.
rent_values = suburbs['median rent'].to_numpy()
rounded_median_rent = (np.ceil(rent_values / 10) * 10).astype(int).tolist()
rounded_median_rent

In [None]:
# Attach the rounded rent as a new column (it becomes the modelling label below).
suburbs = suburbs.assign(rounded_median_rent=rounded_median_rent)
suburbs

Regression


In [None]:
# How many postcodes share each rounded rent value.
unique_vals = suburbs['rounded_median_rent'].value_counts()

# The stratified split below needs more than one sample for every label
# value, so postcodes whose rounded rent occurs exactly once are removed.
singleton_rents = unique_vals[unique_vals == 1].index
suburbs = suburbs[~suburbs['rounded_median_rent'].isin(singleton_rents)]

suburbs
   

In [None]:
import numpy as np

# Spark does not carry a pandas index across, and `suburbs` keeps the
# postcode in its index (it comes from a groupby on postcode). Resetting the
# index turns the postcode into a regular column so it survives the conversion.
sdf_suburbs = spark.createDataFrame(suburbs.reset_index())
sdf_suburbs

In [100]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Label: postcode-level median rent rounded up to the nearest 10.
y = suburbs['rounded_median_rent']

# Predictors. 'median rent' is removed together with the label: the label is
# that same column rounded up, so leaving it in leaks the answer to the model
# and inflates every score computed afterwards.
x = suburbs.drop(columns=['rounded_median_rent', 'median rent'])

# 60/40 split, stratified on the label so each rent band appears on both
# sides; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=10, stratify=y
)

# Standardise features. The scaler is fitted on the training rows only and
# then applied to the test rows. Both outputs are numpy arrays (column names
# are lost; use x.columns when names are needed).
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [101]:
# #K fold cross validation

# from sklearn.model_selection import ShuffleSplit
# from sklearn.model_selection import cross_val_score

# #Randomize the sample so each of the fold has equal distribution of data samples
# cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0) # do hyperparameter tuning

# cross_val_score(LinearRegression(), x, y, cv=cv)

In [None]:
# Ordinary least squares on the standardised training features.
from sklearn import datasets, linear_model, metrics

reg = linear_model.LinearRegression().fit(X_train, y_train)

# R^2 on the held-out rows (displayed as the cell output).
reg.score(X_test , y_test)

In [None]:
# Predictions for the held-out rows; later evaluation cells reuse y_pred.
y_pred = reg.predict(X_test)

# One fitted coefficient per (standardised) feature.
print('Coefficients: ', reg.coef_)

# R^2 on the test split: 1 means perfect prediction.
test_r2 = reg.score(X_test, y_test)
print('Variance score: {}'.format(test_r2))

In [None]:
# Actual vs predicted rent for the test rows, with the y = x reference line
# the inline TODO asked for: points on the line are perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)

axis_low = min(min(y_test), min(y_pred))
axis_high = max(max(y_test), max(y_pred))
ax.plot([axis_low, axis_high], [axis_low, axis_high], linestyle='--', color='grey', label='y = x')

ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Linear Regression: Actual vs. Predicted Values")
ax.legend()
plt.show()

In [None]:
# Confusion Matrix
import numpy as np
from sklearn.metrics import confusion_matrix

# confusion_matrix only accepts discrete labels, but y_pred holds continuous
# regression outputs and raises ValueError when passed directly. The
# predictions are therefore binned with the same rule used to build the
# label: round UP to the nearest 10.
y_pred_binned = (np.ceil(np.asarray(y_pred) / 10) * 10).astype(int)

# Use every rent band seen in either vector so rows and columns line up.
rent_bands = np.union1d(np.asarray(y_test), y_pred_binned)
cm = confusion_matrix(y_test, y_pred_binned, labels=rent_bands)

plt.figure(figsize=(12,6))
plt.title("Confusion Matrix")
sns.heatmap(cm, annot=True, fmt='d', xticklabels=rent_bands, yticklabels=rent_bands)
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")

In [92]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error,r2_score
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# from sklearn.metrics import accuracy_score
 
# # Create a random forest regression model   # RANDOM FOREST NO GOOD FOR STRATIFIED SAMPLING
# model = RandomForestRegressor(n_estimators=50)
 
# # Fit the model to the data
# model.fit(x, y)
 
# # Predict the response for a new data point
# y_pred = model.predict(X_test)
# #test_acc = accuracy_score(y_test, y_pred)

# #print("The Accuracy for Test Set is {}" .format(test_acc*100))


# mse = mean_squared_error(y_test, y_pred)
# mae = mean_absolute_error(y_test, y_pred)

# r2 = r2_score(y_test, y_pred)

# print("Mean Squared Error:", mse)
# print("Mean Absolute Error:", mae)

# print("R2 Score:", r2)

# # Make predictions

# print("Prediction:", y_pred)

In [93]:
# plt.scatter(y_test, y_pred) # sample?
# plt.xlabel("Actual Values")
# plt.ylabel("Predicted Values")
# plt.title("Random Forest Regression: Actual vs. Predicted Values")
# plt.show() # Much better performance  

In [None]:
# classification_report requires discrete class predictions and raises
# ValueError for the continuous y_pred produced by the regression above, so
# the standard regression error metrics are reported instead.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

In [None]:
# `reg` is a LinearRegression, which exposes coef_ rather than
# feature_importances_, and X_train is a numpy array after scaling, so it has
# no .columns. The coefficients are paired with the predictor names from `x`
# (the DataFrame the split was made from). Because the model was fitted on
# standardised features, the absolute coefficient size is comparable across features.
rf_feature_importances = pd.Series(
    reg.coef_, index=x.columns.values
    ).abs().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(7,5))
sns.barplot(x=rf_feature_importances, y=rf_feature_importances.index, ax=ax)
ax.set_xlabel('Feature Importance (|standardised coefficient|)');
ax.set_ylabel('Feature');

In [None]:
import numpy as np
from glmnet import ElasticNet


# LASSO Model: alpha=1 is passed, which is what this notebook uses for the lasso fit.
elastic_net_model = ElasticNet(alpha=1)

# Fit on the scaled TRAINING split only. Fitting on the full, unscaled (x, y)
# would let the model see the test rows and would not match the scale of
# X_test, which the next cells predict on.
elastic_net_model.fit(X_train, y_train)

In [None]:
# Lambda Value: first element of the fitted model's lambda_best_ attribute
print(f'Best lambda value for LASSO: {elastic_net_model.lambda_best_[0]}')

In [None]:
# Predict the held-out rows with the glmnet model. This overwrites the
# y_pred produced by the linear regression cells.
y_pred = elastic_net_model.predict(X_test)

# Import necessary libraries
# NOTE(review): this import rebinds the name `ElasticNet` from glmnet's class
# to scikit-learn's. It is not used in this cell; consider removing it to
# avoid confusion if the fitting cell is edited later.
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error,r2_score
import numpy as np
import matplotlib.pyplot as plt

# Calculate the Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
# Print the model coefficients
print("Elastic Net Coefficients:")
print(elastic_net_model.coef_)
# Print the model intercept
print("Elastic Net Intercept:")
print(elastic_net_model.intercept_)
# R^2 on the test split, shown as the cell output
r2_score(y_test,y_pred)

In [None]:
# Actual vs predicted rent for the held-out rows, using the current y_pred.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Elastic Net Regression: Actual vs. Predicted Values")
plt.show()

In [39]:
# Begin forecasting the rental properties for the next 3-years by suburb or any other suitable
# granularity (we will let groups decide on the granularity). Whilst working on this, groups are
# expected to also present some analysis on what features are useful in making a certain suburb
# more expensive and sought after.

In [40]:
# Groups are to continue working on their predictions and analysis. Additionally, prepare a
# summary notebook (3-5 minutes max) and walk your Tutor through the current findings and
# any additional insights so far. This task should assist in helping groups formulate their answers
# to the 3 big questions.

In [41]:
# Summarise and output the 3-year predictions for your chosen granularity (i.e suburb or cluster
# of properties) and provide some form of an answer for the 3 big questions.