### Future Prediction using the feature-selected Random Forest

In [17]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

In [18]:
# Load every CSV in the curated merged-dataset folder (in sorted filename
# order) and stack them into one frame with a fresh 0..n-1 index.
path = r'../data/curated/merged_dataset/'
all_files = glob.glob(os.path.join(path , "*.csv"))

if not all_files:
    # Without this guard pd.concat([]) fails with an opaque
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No CSV files found in {path}")

li = [pd.read_csv(filename, index_col=None, header=0) for filename in sorted(all_files)]
merged_df = pd.concat(li, axis=0, ignore_index=True)

# Rename the columns so they are easier to refer to later
merged_df = merged_df.rename(columns={'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'})

# Candidate predictor groups
ECON_COLS = {'saving_rate', 'population_density', 'income_per_person', 'crime_cases', 'gdp'}

DIST_COLS = {'min_distance_to_prim', 'min_distance_to_poli', 'min_distance_to_park', 'min_distance_to_second',
             'min_distance_to_hosp', 'min_distance_to_cbd', 'min_distance_to_shop', 'min_distance_to_train'}

# Distance features kept for the model
SELECTED_FEATURES = {'min_distance_to_cbd',
                     'min_distance_to_hosp',
                     'min_distance_to_park',
                     'min_distance_to_poli',
                     'min_distance_to_prim',
                     'min_distance_to_second',
                     'min_distance_to_shop',
                     'min_distance_to_train'}

all_candidates = list(ECON_COLS.union(DIST_COLS))

# Categorical columns are label-encoded (replaced by integer codes).
categorical_features = ['nbed', 'nbath', 'ncar', 'residence_type', 'sa2_2021']

# `le` stays defined because the prediction cell further down calls
# `le.fit_transform`.
le = LabelEncoder()

# One fitted encoder per column. Refitting a single shared encoder for every
# column would discard all mappings except the last one, leaving no way to
# encode new data with the codes used for training.
label_encoders = {}
for column in categorical_features:
    encoder = LabelEncoder()
    merged_df[column] = encoder.fit_transform(merged_df[column])
    label_encoders[column] = encoder

# Remove the columns that are not passed on to the model
merged_df = merged_df.drop(columns=['address', 'latitude', 'longitude', 'postcode', 'sa2_2016'])

# Check the merged dataframe
merged_df

Unnamed: 0,year,sa2_2021,residence_type,nbed,nbath,ncar,min_distance_to_cbd,min_distance_to_park,min_distance_to_prim,min_distance_to_second,min_distance_to_train,min_distance_to_hosp,min_distance_to_poli,min_distance_to_shop,weekly_rent,gdp,saving_rate,income_per_person,population_density,crime_cases
0,2013,61,1,1,0,0,227.97163,23.16035,7.35747,16.96507,35.56825,21.35025,22.04660,9.35209,300.0,1536454,6.861393,39683.563449,2.172408,86.0
1,2013,102,1,1,0,0,223.66084,5.71742,6.50536,6.76794,7.54355,7.42972,6.28177,9.35209,215.0,1536454,6.861393,47222.702327,5.425503,36.0
2,2013,61,1,1,0,0,243.25680,5.11222,0.20027,36.72106,50.85341,36.63541,0.08478,9.35209,175.0,1536454,6.861393,39683.563449,2.172408,86.0
3,2013,23,1,5,2,0,140.35827,78.32509,10.66523,11.91899,11.26906,177.44731,84.47341,9.35209,350.0,1536454,6.861393,43556.283562,473.765281,1288.0
4,2013,209,0,0,0,0,13.86135,0.93250,1.32931,3.49174,2.20800,177.44731,84.47341,3.96501,275.0,1536454,6.861393,86103.411528,2834.210526,1923.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172030,2022,89,1,3,0,1,293.28053,0.56012,1.21809,114.77016,90.08591,140.56888,74.35608,13.64920,265.0,3305754,12.839000,54365.266130,402.000000,281.0
172031,2022,504,1,3,2,2,258.29111,3.49087,5.08707,3.60570,8.37185,2.60312,74.35608,13.64920,500.0,3305754,12.839000,60828.473189,689.000000,3049.0
172032,2022,190,1,1,2,1,9.47077,2.45011,1.33931,1.62322,3.63291,140.56888,74.35608,1.97636,750.0,3305754,12.839000,98756.492866,3656.000000,759.0
172033,2022,133,0,0,0,1,1.84933,0.65199,1.10438,1.27940,1.87840,140.56888,74.35608,13.64920,409.0,3305754,12.839000,71305.473808,5791.000000,1788.0


In [24]:
# Distance columns that are not among the selected features; they are removed
# from the predictors before fitting and predicting. With the two sets as
# defined in this notebook the difference is empty, so nothing is removed.
drop = DIST_COLS.difference(SELECTED_FEATURES)

In [25]:
# Discard every row that has at least one missing value
merged_df = merged_df.dropna()

# Predictors: every remaining column except the target
X = merged_df.drop(columns='weekly_rent')

# Target: the weekly rent on the log scale
# (predictions are mapped back with np.exp in the prediction cell)
y = np.log(merged_df['weekly_rent'])

print(y)
print(X)

0         5.703782
1         5.370638
2         5.164786
3         5.857933
4         5.616771
            ...   
172030    5.579730
172031    6.214608
172032    6.620073
172033    6.013715
172034    5.899897
Name: weekly_rent, Length: 172018, dtype: float64
        year  sa2_2021  residence_type  nbed  nbath  ncar  \
0       2013        61               1     1      0     0   
1       2013       102               1     1      0     0   
2       2013        61               1     1      0     0   
3       2013        23               1     5      2     0   
4       2013       209               0     0      0     0   
...      ...       ...             ...   ...    ...   ...   
172030  2022        89               1     3      0     1   
172031  2022       504               1     3      2     2   
172032  2022       190               1     1      2     1   
172033  2022       133               0     0      0     1   
172034  2022       149               0     0      0     1   

        

In [26]:
# Predictors restricted to the selected features (computed once, used for
# both fitting and the in-sample prediction below).
X_selected = X.drop(drop, axis=1)

# oob_score=True additionally scores each row using only the trees that did
# not see it during bootstrapping.
predictor_model = RandomForestRegressor(n_estimators = 30, random_state=42, oob_score=True).fit(X_selected, y)

# NOTE: this R2 is computed on the very rows the model was fitted on, so it is
# an in-sample figure and overstates how well the model generalises.
predictor_y_pred = predictor_model.predict(X_selected)
predictor_r2 = r2_score(y, predictor_y_pred)
print(f"Predictor R2 = {predictor_r2}")

# Out-of-bag R2: a less optimistic estimate obtained without a second fit.
print(f"Predictor out-of-bag R2 = {predictor_model.oob_score_}")

Predictor R2 = 0.9590024259325911


In [27]:
# Directory created by the original version of this cell; still created in
# case something else expects it to exist.
new_path = '../data/curated/2023_2027_rental_prediction/'
os.makedirs(new_path, exist_ok=True)

# The predictions are actually written here, so this directory has to exist
# too; otherwise to_csv fails on a fresh checkout.
output_dir = '../data/curated/random_forest_pred/'
os.makedirs(output_dir, exist_ok=True)

YEARS = [2023, 2024, 2025, 2026, 2027]

# Columns (and their order) that the model was fitted with
feature_columns = X.drop(drop, axis=1).columns

for year in YEARS:
    prediction_set = pd.read_csv(f'../data/curated/2023_2027_data/{year}_data.csv')
    prediction_set = prediction_set.rename(columns={'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'})
    prediction_set = prediction_set.drop(drop, axis=1)

    # Un-encoded copy that is written out together with the predictions
    final_prediction_set = prediction_set.copy(deep=True)

    # Convert the categorical variables to numerical
    # NOTE(review): the encoder is refit on each year's file, so the integer
    # codes match the ones seen during training only if this file contains
    # exactly the same set of values in every categorical column. TODO confirm,
    # or encode with the mappings fitted on the training data.
    for column in categorical_features:
        prediction_set[column] = le.fit_transform(prediction_set[column])

    # Keep only the model's features, in training order, and drop rows with a
    # missing feature. Selecting first means a missing value in a column the
    # model does not use no longer discards the row.
    prediction_set = prediction_set[feature_columns].dropna()

    price_predictions = predictor_model.predict(prediction_set) # in log
    price_predictions = np.exp(price_predictions) # in actual price

    # Align on the row index: rows removed by dropna get NaN instead of the
    # assignment failing on a length mismatch.
    final_prediction_set['predicted_price'] = pd.Series(price_predictions, index=prediction_set.index)
    final_prediction_set.to_csv(os.path.join(output_dir, f'{year}_data.csv'), index=False)