# Predict the weekly rental price from 2023 to 2027 with Random Forest Regression
In this notebook, we train a random forest regressor on the training dataset covering 2013 to 2022, and then use the trained model to predict the weekly rental price of each district for each year from 2023 to 2027.

In [None]:
import os
import re
import glob
import numpy as np
import pandas as pd
from pyspark.sql.functions import *
from matplotlib import pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.types import FloatType
from pyspark.sql.types import IntegerType
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

In [None]:
# Name of the folder that will hold the random forest predictions
directory = "random_forest_pred"

# Parent directory path
parent_dir = "../data/curated/"

# Full path of the output folder
path = os.path.join(parent_dir, directory)

# Create the output folder together with any missing parent folders.
# `exist_ok=True` keeps this cell re-runnable: a plain `os.mkdir` raises
# FileExistsError on the second run and FileNotFoundError when the parent
# folder does not exist yet.
os.makedirs(path, exist_ok=True)

In [2]:
# Settings applied to the Spark session before it is started
spark_options = [
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
]

# Build (or reuse) the Spark session that reads the prediction CSV files
session_builder = SparkSession.builder.appName("MAST30034 Tutorial 1")
for option_name, option_value in spark_options:
    session_builder = session_builder.config(option_name, option_value)
spark = session_builder.getOrCreate()

# Folder holding the prediction dataset (2023-2027)
parent_dir_yr = "../data/curated/2023_2027_data"

# Folder holding the merged training CSV files
path = r'../data/curated/merged_dataset/'
csv_pattern = os.path.join(path, "*.csv")
all_files = glob.glob(csv_pattern)

# Collects one training data frame per CSV file
li = []

# To train and predict on the same features, first collect the column names
# that the prediction dataset (2023-2027) has after one-hot encoding.
# Only columns present in EVERY yearly file are kept, so the feature set does
# not depend on which file happens to be read last.
merged_df_yr_col = None

# Sort the listing: os.listdir returns entries in arbitrary order
for filename in sorted(os.listdir(parent_dir_yr)):
    print(parent_dir_yr + "/" + filename)
    merged_df_yr = spark.read.csv(parent_dir_yr + "/" + filename, header=True)

    # Extract year from the file name
    which_year = re.findall(r'\d+', filename)

    # Add year column to the dataset to fit the input into the model
    merged_df_yr = merged_df_yr.withColumn("year", lit(which_year[0]))

    merged_df_yr = merged_df_yr.toPandas()

    # Raw SA2 codes of the file read last (referenced again by a later cell)
    sa2 = merged_df_yr["sa2_2021"]

    # One-hot encode the nominal attributes
    merged_df_yr = pd.get_dummies(data=merged_df_yr, columns=['sa2_2021'], prefix='sa2')
    merged_df_yr = pd.get_dummies(data=merged_df_yr, columns=['residence_type'], prefix='resiType')

    merged_df_yr.rename(columns={'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'}, inplace=True)

    merged_df_yr.dropna(inplace=True)

    # Column names of this file after putting dummy variables on the SA2
    # codes and residence types; intersect them with the earlier files
    if merged_df_yr_col is None:
        merged_df_yr_col = merged_df_yr.columns
    else:
        merged_df_yr_col = merged_df_yr_col.intersection(merged_df_yr.columns)

# Fail early with a clear message instead of a NameError further down
if merged_df_yr_col is None:
    raise FileNotFoundError("No prediction files found in " + parent_dir_yr)
    
# Read every training CSV (2013 - 2022) in sorted file order
li.extend(
    pd.read_csv(csv_path, index_col=None, header=0)
    for csv_path in sorted(all_files)
)

# Stack the individual frames into one training dataset with a fresh index
merged_df = pd.concat(li, axis=0, ignore_index=True)

# SA2 codes and residence types are nominal, so they are one-hot encoded
# rather than being treated as numerical values by the model
for source_col, dummy_prefix in (('sa2_2021', 'sa2'), ('residence_type', 'resiType')):
    merged_df = pd.get_dummies(data=merged_df, columns=[source_col], prefix=dummy_prefix)

# Remove the columns that are not used for training
unused_cols = ['address', 'latitude', 'longitude', 'postcode', 'sa2_2016']
merged_df = merged_df.drop(unused_cols, axis=1)
print(merged_df.columns)

# Give each column its appropriate data type: columns whose name contains
# 'resiType', 'year' or 'sa2' become integers, every other column a float
integer_markers = ('resiType', 'year', 'sa2')
column_dtypes = {
    name: int if any(marker in name for marker in integer_markers) else float
    for name in merged_df.columns
}
merged_df = merged_df.astype(column_dtypes)

# Give the two economic indicator columns shorter, more readable names
short_names = {'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'}
merged_df = merged_df.rename(columns=short_names)

# Discard every row that still has a missing value
merged_df = merged_df.dropna()

# Separate the target (weekly rent) from the features of the training set
y = merged_df['weekly_rent']
merged_df = merged_df.drop(columns='weekly_rent')

# Only keep columns that are common to the training and prediction datasets.
# The training column order is preserved on purpose: building the list from
# a set can yield a different feature order on every run (string hashing is
# randomized per process), and the fitted forest can depend on feature order
# even with a fixed random_state.
prediction_cols = set(merged_df_yr_col)
common_cols = [name for name in merged_df.columns if name in prediction_cols]
merged_df = merged_df[common_cols]

X = merged_df
print(X)

# Now, run the random forest regressor on the merged training dataset
sel = RandomForestRegressor(n_estimators=100, random_state=42)
sel.fit(X, y)

Index(['year', 'sa2_2021', 'residence_type', 'nbed', 'nbath', 'ncar',
       'min_distance_to_cbd', 'min_distance_to_park', 'min_distance_to_prim',
       'min_distance_to_second', 'min_distance_to_train',
       'min_distance_to_hosp', 'min_distance_to_poli', 'min_distance_to_shop',
       'weekly_rent', 'gdp(USD Millioins)', 'saving_rate(% of GDP)',
       'income_per_person', 'population_density', 'crime_cases'],
      dtype='object')
        year     sa2_2021  residence_type  nbed  nbath  ncar  \
0       2013  204011057.0               1   2.0    1.0   0.0   
1       2013  205051101.0               1   2.0    1.0   0.0   
2       2013  204011057.0               1   2.0    1.0   0.0   
3       2013  202011022.0               1   4.0    2.0   0.0   
4       2013  208041195.0               0   1.0    1.0   0.0   
...      ...          ...             ...   ...    ...   ...   
172030  2022  205021086.0               1   3.0    1.0   1.0   
172031  2022  217041479.0               1   3.

: 

Because the nominal attribute `sa2_2021` (the SA2 code) is converted into dummy variables, the column names were not consistent between the training and prediction datasets. (A few suburbs are missing from the training dataset, while the prediction dataset contains all suburbs.) Random forest regression does not accept this, because it requires exactly the same features for training and prediction. Therefore, the SA2 codes that the model did not see in the training dataset have been dropped.

In [1]:
# Now, predict for the next 5 years

# SA2 dummy columns the trained model knows about (loop-invariant)
sa2_feature_cols = [name for name in common_cols if "sa2" in name]

# Sort the listing: os.listdir returns entries in arbitrary order
for filename in sorted(os.listdir(parent_dir_yr)):
    merged_df_yr = spark.read.csv(parent_dir_yr + "/" + filename, header=True)

    # Extract year from the file name
    which_year = re.findall(r'\d+', filename)

    # Add year column to the dataset to fit the input into the model
    merged_df_yr = merged_df_yr.withColumn("year", lit(which_year[0]))

    merged_df_yr = merged_df_yr.toPandas()

    # Keep the raw SA2 code of every row of THIS file before it is encoded,
    # so the codes written out later stay aligned (by index) with the
    # predictions even after rows have been dropped
    sa2_codes = merged_df_yr["sa2_2021"]

    # One-hot encode the nominal attributes, exactly as for the training set
    merged_df_yr = pd.get_dummies(data=merged_df_yr, columns=['sa2_2021'], prefix='sa2')
    merged_df_yr = pd.get_dummies(data=merged_df_yr, columns=['residence_type'], prefix='resiType')

    # Spark returns every column as a string: cast dummy and year columns to
    # integers and all remaining columns to floats
    for col in merged_df_yr.columns:
        is_integer_col = 'resiType' in col or 'year' in col or 'sa2' in col
        merged_df_yr[col] = merged_df_yr[col].astype(int if is_integer_col else float)

    merged_df_yr.rename(columns={'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'}, inplace=True)

    merged_df_yr.dropna(inplace=True)

    # Reorder the columns with only common columns between the training and
    # prediction dataset
    features = merged_df_yr[common_cols]

    # Drop the rows whose SA2 code the model never saw in training: all of
    # their SA2 dummy columns are zero, so there is nothing to predict from
    seen_in_training = (features[sa2_feature_cols] == 1).any(axis=1)
    features = features[seen_in_training]

    if features.empty:
        print("No rows with a known SA2 code in " + filename + "; skipping")
        continue

    print(features)

    # Predict the weekly rental price with the trained random forest
    prediction = sel.predict(features)

    # Reverse the dummy encoding: look up the original SA2 code of each
    # remaining row through the shared index
    sa2_values = sa2_codes.loc[features.index].astype(int)

    # Now, put the predictions into the csv files
    new_csv_name = "../data/curated/random_forest_pred/" + filename

    data = {'year': features['year'],
            'sa2_2021': sa2_values,
            'predicted_price': prediction}
    df = pd.DataFrame(data)
    print(df)
    df.to_csv(new_csv_name)

22/10/05 02:19:32 WARN Utils: Your hostname, Hyunjin-Win11 resolves to a loopback address: 127.0.1.1; using 192.168.245.16 instead (on interface eth0)
22/10/05 02:19:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


KeyboardInterrupt: 

The predictions with the random forest regressor are now complete. The key findings are illustrated in the `growthRateCalc.ipynb` notebook.