### This section uses a Random Forest Regressor to model the distances to facilities from each property's internal attributes and SA2, and predicts those distances for future years

In [2]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

### Merge all data collected from 2013 - 2022

In [7]:
path = r'../data/curated/merged_dataset/'
all_files = glob.glob(os.path.join(path , "*.csv"))

# Fail with a clear message instead of pandas' "No objects to concatenate"
if not all_files:
    raise FileNotFoundError(f"No CSV files found in {path}")

# Read every file in sorted filename order and stack them into one frame
li = [pd.read_csv(filename, index_col=None, header=0) for filename in sorted(all_files)]
merged_df = pd.concat(li, axis=0, ignore_index=True)

# Shorten the two macro-economic column names so they are easier to reference
# later (the dictionary keys must match the CSV headers exactly, typo included)
merged_df = merged_df.rename({'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'}, axis=1)

# Selected in Random Forest Forward Selection
DIST_COLS = ['min_distance_to_cbd',
 'min_distance_to_hosp',
 'min_distance_to_park',
 'min_distance_to_prim',
 'min_distance_to_second',
 'min_distance_to_train']

PREDICTORS = ['year', 'gdp', 'saving_rate', 'income_per_person' , 'population_density', 'crime_cases', 'residence_type', 'sa2_2021', 'nbed', 'nbath', 'ncar']

# Keep only the modelling columns and drop incomplete rows BEFORE encoding:
# once a column has been label encoded, a missing value has already been
# turned into an ordinary integer code (or has made the encoder fail on mixed
# types), so a later dropna can no longer remove that row.
# .copy() makes the result an independent frame, so the column assignments
# below do not write into a slice of the concatenated data.
merged_df = merged_df[DIST_COLS + PREDICTORS].dropna().copy()

categorical_features = ['residence_type', 'sa2_2021', 'nbed', 'nbath', 'ncar']
le = LabelEncoder()

# Convert the categorical variables to integer codes.
# `le` is refit for each column, so after the loop it only holds the mapping
# of the last column in categorical_features.
for col in categorical_features:
    merged_df[col] = le.fit_transform(merged_df[col])

# Check the merged dataframe
merged_df

Unnamed: 0,min_distance_to_cbd,min_distance_to_hosp,min_distance_to_park,min_distance_to_prim,min_distance_to_second,min_distance_to_train,year,gdp,saving_rate,income_per_person,population_density,crime_cases,residence_type,sa2_2021,nbed,nbath,ncar
0,227.97163,21.35025,23.16035,7.35747,16.96507,35.56825,2013,1536454,6.861393,39683.563449,2.172408,86.0,1,61,1,0,0
1,223.66084,7.42972,5.71742,6.50536,6.76794,7.54355,2013,1536454,6.861393,47222.702327,5.425503,36.0,1,102,1,0,0
2,243.25680,36.63541,5.11222,0.20027,36.72106,50.85341,2013,1536454,6.861393,39683.563449,2.172408,86.0,1,61,1,0,0
3,140.35827,177.44731,78.32509,10.66523,11.91899,11.26906,2013,1536454,6.861393,43556.283562,473.765281,1288.0,1,23,5,2,0
4,13.86135,177.44731,0.93250,1.32931,3.49174,2.20800,2013,1536454,6.861393,86103.411528,2834.210526,1923.0,0,209,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172030,293.28053,140.56888,0.56012,1.21809,114.77016,90.08591,2022,3305754,12.839000,54365.266130,402.000000,281.0,1,89,3,0,1
172031,258.29111,2.60312,3.49087,5.08707,3.60570,8.37185,2022,3305754,12.839000,60828.473189,689.000000,3049.0,1,504,3,2,2
172032,9.47077,140.56888,2.45011,1.33931,1.62322,3.63291,2022,3305754,12.839000,98756.492866,3656.000000,759.0,1,190,1,2,1
172033,1.84933,140.56888,0.65199,1.10438,1.27940,1.87840,2022,3305754,12.839000,71305.473808,5791.000000,1788.0,0,133,0,0,1


### Checking the distance prediction performance of Random Forest 

In [4]:
# The predictor matrix is the same for every target, so select it once
X = merged_df[PREDICTORS]

for dist_col in DIST_COLS:
    # 70/30 shuffled hold-out split with a fixed seed, one target at a time
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        merged_df[dist_col],
        test_size=0.3,
        random_state=42,
        shuffle=True,
    )

    forest = RandomForestRegressor(n_estimators=30, random_state=42)
    forest.fit(X_train, y_train)

    # R2 on the data the forest was fitted on, then on the held-out rows
    train_r2 = r2_score(y_train, forest.predict(X_train))
    test_r2 = r2_score(y_test, forest.predict(X_test))
    print(f'Predicting {dist_col} Training R2 = {train_r2}, Testing R2 = {test_r2}')

Predicting min_distance_to_cbd Training R2 = 0.9995795263197766, Testing R2 = 0.9983467990875554
Predicting min_distance_to_hosp Training R2 = 0.9994093814553241, Testing R2 = 0.996563065354215
Predicting min_distance_to_park Training R2 = 0.9942784722756867, Testing R2 = 0.979484715998081
Predicting min_distance_to_prim Training R2 = 0.9659990831900646, Testing R2 = 0.9112123581984973
Predicting min_distance_to_second Training R2 = 0.998271347398522, Testing R2 = 0.9918087436430683
Predicting min_distance_to_train Training R2 = 0.9988498558910694, Testing R2 = 0.9935697000135101


### Now predict distances by year, SA2 and Residence Types

In [10]:
YEARS = [2023, 2024, 2025, 2026, 2027]
COLUMN_RENAMES = {'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'}

# The model must see the SAME integer code for a category at training time and
# at prediction time. Refitting a LabelEncoder on each future file assigns codes
# from that file's own sorted set of values, which only matches the training
# codes when both sets are identical. merged_df already holds codes (the
# encoders that produced them were overwritten column by column), so the raw
# training values are reloaded here and one encoder per column is fitted on
# them and reused for every future year.
train_df = pd.concat(
    [pd.read_csv(filename, index_col=None, header=0) for filename in sorted(all_files)],
    axis=0,
    ignore_index=True,
).rename(COLUMN_RENAMES, axis=1)
train_df = train_df[DIST_COLS + PREDICTORS].dropna().copy()

encoders = {col: LabelEncoder().fit(train_df[col]) for col in categorical_features}
for col, encoder in encoders.items():
    train_df[col] = encoder.transform(train_df[col])

# Load every future year once. `future_labelled` keeps the original category
# labels for the exported file; `future_encoded` is the numeric model input.
future_labelled = {}
future_encoded = {}
for year in YEARS:
    future_set = pd.read_csv(f'../data/curated/2023_2027_data/{year}_data.csv')
    future_set = future_set.rename(COLUMN_RENAMES, axis=1).dropna()
    future_labelled[year] = future_set.copy(deep=True)

    # Convert the categorical variables with the encoders fitted on training data
    for col, encoder in encoders.items():
        try:
            future_set[col] = encoder.transform(future_set[col])
        except ValueError as err:
            # A category absent from the training data has no code; stop rather
            # than feed the model an arbitrary one.
            raise ValueError(
                f"{year}: column '{col}' contains categories that are absent from the training data"
            ) from err
    future_encoded[year] = future_set

# Training does not depend on the year, so each forest is fitted once and then
# applied to all years (6 fits instead of 30). Only one forest is alive at a time.
for dist_col in DIST_COLS:
    model = RandomForestRegressor(n_estimators = 30, random_state=42)
    model.fit(train_df[PREDICTORS], train_df[dist_col])

    # make predictions on this distance to facility for every future year
    for year in YEARS:
        dist_pred = model.predict(future_encoded[year][PREDICTORS])
        future_labelled[year][dist_col] = dist_pred

# Write one file per year containing the predicted distance columns
for year in YEARS:
    future_labelled[year].to_csv(f'../data/curated/2023_2027_data/{year}_data_distPredicted.csv', index=False)
    print(f"Completed year {year}")

Completed year 2023
Completed year 2024
Completed year 2025
Completed year 2026
Completed year 2027
