# Predict the weekly rental price from 2023 to 2027 with Random forest Regression </br>
In this notebook, we will train the random forest regressor model with the train dataset from 2013 to 2022, and predict the weekly rental price for each district for each year from 2023 - 2027 with this trained model.

In [1]:
import os
import re
import glob
import numpy as np
import pandas as pd
from pyspark.sql.functions import *
from matplotlib import pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.types import FloatType
from pyspark.sql.types import IntegerType
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import LabelEncoder


### Make a new directory folder to store the predictions made from our trained random forest model

In [2]:
# Directory
directory = "random_forest_pred"
  
# Parent Directory path
parent_dir = "../data/curated/"

# Path
path = os.path.join(parent_dir, directory)

# Create the directory
os.mkdir(path)

FileExistsError: [Errno 17] File exists: '../data/curated/random_forest_pred'

### Steps to take to make the training set to fit into the model for the prediction stage & Train the model

In [2]:
path = r'../data/curated/merged_dataset/'
all_files = glob.glob(os.path.join(path , "*.csv"))

li = []

for filename in sorted(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

merged_df = pd.concat(li, axis=0, ignore_index=True)

# Rename the columns to facilitate to call them for later
merged_df.rename({'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'}, axis=1, inplace=True)

LOG_FEATURES = ['saving_rate', 'min_distance_to_prim', 
'min_distance_to_poli', 'min_distance_to_park', 'min_distance_to_second', 'min_distance_to_hosp', 'min_distance_to_cbd', 
'min_distance_to_shop', 'population_density', 'income_per_person', 
'crime_cases', 'min_distance_to_train', 'gdp']

all_candidates = LOG_FEATURES.copy()
for log_feature in LOG_FEATURES:
    merged_df[log_feature] = np.log(merged_df[log_feature])

categorical_features = ['nbed', 'nbath', 'ncar', 'residence_type', 'sa2_2021']
le = LabelEncoder()

# Convert the categorical variables to numerical
for i in range(len(categorical_features)):
    new = le.fit_transform(merged_df[categorical_features[i]])
    merged_df[categorical_features[i]] = new

merged_df.drop(['address', 'latitude', 'longitude', 'postcode', 'sa2_2016'], axis=1, inplace=True)

merged_df.dropna(inplace=True)

# Log transformatoin on our target class for better performance 
y = np.log(merged_df['weekly_rent'])

# Assign predictor data to X for training purposes
X = merged_df.drop('weekly_rent', axis=1)

X = X[['residence_type', 'ncar', 'min_distance_to_hosp', 'min_distance_to_train', 'min_distance_to_second', 'min_distance_to_park', 'min_distance_to_prim', 'year', 'nbed', 'sa2_2021', 'nbath', 'min_distance_to_cbd']]

# Train the model with the training dataset
sel = RandomForestRegressor(n_estimators = 30, random_state=42)
sel.fit(X, y)

0         300.0
1         215.0
2         175.0
3         350.0
4         275.0
          ...  
172030    265.0
172031    500.0
172032    750.0
172033    409.0
172034    365.0
Name: weekly_rent, Length: 172018, dtype: float64


Due to dummy variable function due to a nominal attribute, sa2 code, the column names were not constant between the traninig and prediction dataset. (There are a few of suburbs missing from a training dataset, while the prediction dataset has all suburbs.) This is not accepted in random forest regression where they require to have the exact same features to train and predict. Therefore, the suburb sa2 codes that the trained model didn't see from the training dataset have been dropped. 

### Predict with random forest regressor 

In [4]:
# Now, Predict for the next 5 years
parent_dir_yr = "../data/curated/2023_2027_data"

for filename in os.listdir(parent_dir_yr):
    merged_df_yr = pd.read_csv(parent_dir_yr + "/" + filename, index_col=None, header=0)

    # Extract year from the file name 
    which_year = re.findall(r'\d+', filename)

    # Add year column to the dataset to fit the input into the model
    merged_df_yr.rename({'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'}, axis=1, inplace=True)

    
    LOG_FEATURES = ['saving_rate', 'min_distance_to_prim', 
    'min_distance_to_poli', 'min_distance_to_park', 'min_distance_to_second', 'min_distance_to_hosp', 'min_distance_to_cbd', 
    'min_distance_to_shop', 'population_density', 'income_per_person', 
    'crime_cases', 'min_distance_to_train', 'gdp']

    all_candidates = LOG_FEATURES.copy()
    for log_feature in LOG_FEATURES:
        merged_df_yr[log_feature] = np.log(merged_df_yr[log_feature])


    categorical_features = ['nbed', 'nbath', 'ncar', 'residence_type', 'sa2_2021']
    le = LabelEncoder()

    # Convert the categorical variables to numerical
    for i in range(len(categorical_features)):
        new = le.fit_transform(merged_df_yr[categorical_features[i]])
        merged_df_yr[categorical_features[i]] = new

    print(X.columns, merged_df_yr.columns)

    merged_df_yr.dropna(inplace=True)
    merged_df_yr = merged_df_yr[['residence_type', 'ncar', 'min_distance_to_hosp', 'min_distance_to_train', 'min_distance_to_second', 'min_distance_to_park', 'min_distance_to_prim', 'year', 'nbed', 'sa2_2021', 'nbath', 'min_distance_to_cbd']]
    print(merged_df_yr)

    # Predict the weekly rental price with random forest tree
    prediction = sel.predict(merged_df_yr)

    # Get the sa2 codes 
    merged_df_yr_revert =  le.inverse_transform(merged_df_yr['sa2_2021'])

    # Now, put the predictions into the csv files 
    new_csv_name = "../data/curated/random_forest_pred/" + filename

    data = {'year': merged_df_yr['year'],
            'sa2_2021': merged_df_yr_revert,
            'predicted_price': np.exp(prediction) }
    df = pd.DataFrame(data)
    print(df)
    df.to_csv(new_csv_name)

Index(['residence_type', 'ncar', 'min_distance_to_hosp',
       'min_distance_to_train', 'min_distance_to_second',
       'min_distance_to_park', 'min_distance_to_prim', 'year', 'nbed',
       'sa2_2021', 'nbath', 'min_distance_to_cbd'],
      dtype='object') Index(['year', 'sa2_2021', 'gdp', 'saving_rate', 'income_per_person',
       'population_density', 'crime_cases', 'residence_type', 'nbed', 'nbath',
       'ncar', 'min_distance_to_cbd', 'min_distance_to_park',
       'min_distance_to_prim', 'min_distance_to_second',
       'min_distance_to_train', 'min_distance_to_hosp', 'min_distance_to_poli',
       'min_distance_to_shop'],
      dtype='object')
       residence_type  ncar  min_distance_to_hosp  min_distance_to_train  \
0                   0     1              4.945698               4.500764   
1                   0     1              4.945698               4.500764   
2                   0     2              4.945698               4.500764   
3                   1     1       

The predictions are completed with the random forest regressor. The key findings will be illustrated in `growthRateCalc.ipynb` notebook. 