### Fit selected feature into a Linear Regression Model to make future rental price prediction

In [7]:
import pandas as pd
import glob
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder

path = r'../data/curated/merged_dataset/'
all_files = glob.glob(os.path.join(path , "*.csv"))

li = []

for filename in sorted(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

merged_df = pd.concat(li, axis=0, ignore_index=True)

# Rename the columns to facilitate to call them for later
merged_df.rename({'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'}, axis=1, inplace=True)

ECON_COLS = set({'saving_rate', 'population_density', 'income_per_person', 'crime_cases', 'gdp'})

DIST_COLS = set({'min_distance_to_prim', 'min_distance_to_poli', 'min_distance_to_park', 'min_distance_to_second', 'min_distance_to_hosp', 'min_distance_to_cbd', 
'min_distance_to_shop', 'min_distance_to_train'})

INTERNAL_COLS = set({'nbed', 'nbath', 'ncar'})

SELECTED_FEATURES = set({'min_distance_to_cbd', 'min_distance_to_hosp', 
'min_distance_to_park', 'min_distance_to_poli', 'min_distance_to_prim',
'min_distance_to_shop', 'min_distance_to_train', 'nbath', 'nbed', 'ncar'})

all_candidates = list(ECON_COLS.union(DIST_COLS).union(INTERNAL_COLS)).copy()
#for log_feature in LOG_FEATURES:
#    merged_df[log_feature] = np.log(merged_df[log_feature])

#merged_df['weekly_rent'] = np.log(merged_df['weekly_rent'])

categorical_features = ['nbed', 'nbath', 'ncar', 'residence_type', 'sa2_2021']
le = LabelEncoder()

# Convert the categorical variables to numerical
for i in range(len(categorical_features)):
    new = le.fit_transform(merged_df[categorical_features[i]])
    merged_df[categorical_features[i]] = new

merged_df.drop(['address', 'latitude', 'longitude', 'postcode', 'sa2_2016'], axis=1, inplace=True)

# Check the merged dataframe
merged_df


Unnamed: 0,year,sa2_2021,residence_type,nbed,nbath,ncar,min_distance_to_cbd,min_distance_to_park,min_distance_to_prim,min_distance_to_second,min_distance_to_train,min_distance_to_hosp,min_distance_to_poli,min_distance_to_shop,weekly_rent,gdp,saving_rate,income_per_person,population_density,crime_cases
0,2013,61,1,1,0,0,227.97163,23.16035,7.35747,16.96507,35.56825,21.35025,22.04660,9.35209,300.0,1536454,6.861393,39683.563449,2.172408,86.0
1,2013,102,1,1,0,0,223.66084,5.71742,6.50536,6.76794,7.54355,7.42972,6.28177,9.35209,215.0,1536454,6.861393,47222.702327,5.425503,36.0
2,2013,61,1,1,0,0,243.25680,5.11222,0.20027,36.72106,50.85341,36.63541,0.08478,9.35209,175.0,1536454,6.861393,39683.563449,2.172408,86.0
3,2013,23,1,5,2,0,140.35827,78.32509,10.66523,11.91899,11.26906,177.44731,84.47341,9.35209,350.0,1536454,6.861393,43556.283562,473.765281,1288.0
4,2013,209,0,0,0,0,13.86135,0.93250,1.32931,3.49174,2.20800,177.44731,84.47341,3.96501,275.0,1536454,6.861393,86103.411528,2834.210526,1923.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172030,2022,89,1,3,0,1,293.28053,0.56012,1.21809,114.77016,90.08591,140.56888,74.35608,13.64920,265.0,3305754,12.839000,54365.266130,402.000000,281.0
172031,2022,504,1,3,2,2,258.29111,3.49087,5.08707,3.60570,8.37185,2.60312,74.35608,13.64920,500.0,3305754,12.839000,60828.473189,689.000000,3049.0
172032,2022,190,1,1,2,1,9.47077,2.45011,1.33931,1.62322,3.63291,140.56888,74.35608,1.97636,750.0,3305754,12.839000,98756.492866,3656.000000,759.0
172033,2022,133,0,0,0,1,1.84933,0.65199,1.10438,1.27940,1.87840,140.56888,74.35608,13.64920,409.0,3305754,12.839000,71305.473808,5791.000000,1788.0


In [8]:
drop = list(DIST_COLS.union(INTERNAL_COLS)-SELECTED_FEATURES)

In [9]:
# Drop Nans
merged_df.dropna(inplace=True)

# log transformation to improve accuracy
y_train = np.log(merged_df['weekly_rent'])
X_train = merged_df.drop(['weekly_rent'], axis=1)
X_train = X_train[sorted(X_train.drop(drop, axis=1).columns)]
X_train

Unnamed: 0,crime_cases,gdp,income_per_person,min_distance_to_cbd,min_distance_to_hosp,min_distance_to_park,min_distance_to_poli,min_distance_to_prim,min_distance_to_shop,min_distance_to_train,nbath,nbed,ncar,population_density,residence_type,sa2_2021,saving_rate,year
0,86.0,1536454,39683.563449,227.97163,21.35025,23.16035,22.04660,7.35747,9.35209,35.56825,0,1,0,2.172408,1,61,6.861393,2013
1,36.0,1536454,47222.702327,223.66084,7.42972,5.71742,6.28177,6.50536,9.35209,7.54355,0,1,0,5.425503,1,102,6.861393,2013
2,86.0,1536454,39683.563449,243.25680,36.63541,5.11222,0.08478,0.20027,9.35209,50.85341,0,1,0,2.172408,1,61,6.861393,2013
3,1288.0,1536454,43556.283562,140.35827,177.44731,78.32509,84.47341,10.66523,9.35209,11.26906,2,5,0,473.765281,1,23,6.861393,2013
4,1923.0,1536454,86103.411528,13.86135,177.44731,0.93250,84.47341,1.32931,3.96501,2.20800,0,0,0,2834.210526,0,209,6.861393,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172030,281.0,3305754,54365.266130,293.28053,140.56888,0.56012,74.35608,1.21809,13.64920,90.08591,0,3,1,402.000000,1,89,12.839000,2022
172031,3049.0,3305754,60828.473189,258.29111,2.60312,3.49087,74.35608,5.08707,13.64920,8.37185,2,3,2,689.000000,1,504,12.839000,2022
172032,759.0,3305754,98756.492866,9.47077,140.56888,2.45011,74.35608,1.33931,1.97636,3.63291,2,1,1,3656.000000,1,190,12.839000,2022
172033,1788.0,3305754,71305.473808,1.84933,140.56888,0.65199,74.35608,1.10438,13.64920,1.87840,0,0,1,5791.000000,0,133,12.839000,2022


In [10]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
final_model = sm.OLS(y_train, X_train).fit()
final_model.summary()

0,1,2,3
Dep. Variable:,weekly_rent,R-squared (uncentered):,0.998
Model:,OLS,Adj. R-squared (uncentered):,0.998
Method:,Least Squares,F-statistic:,4987000.0
Date:,"Sun, 09 Oct 2022",Prob (F-statistic):,0.0
Time:,18:27:53,Log-Likelihood:,-15072.0
No. Observations:,172018,AIC:,30180.0
Df Residuals:,172000,BIC:,30360.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
crime_cases,-1.629e-05,3.29e-07,-49.430,0.000,-1.69e-05,-1.56e-05
gdp,3.366e-08,3.03e-09,11.111,0.000,2.77e-08,3.96e-08
income_per_person,6.122e-06,4.31e-08,142.050,0.000,6.04e-06,6.21e-06
min_distance_to_cbd,-0.0008,9.04e-06,-88.494,0.000,-0.001,-0.001
min_distance_to_hosp,-0.0001,1.12e-05,-9.463,0.000,-0.000,-8.43e-05
min_distance_to_park,-0.0009,3.84e-05,-22.215,0.000,-0.001,-0.001
min_distance_to_poli,-0.0004,2.17e-05,-16.577,0.000,-0.000,-0.000
min_distance_to_prim,-5.304e-05,9e-05,-0.589,0.556,-0.000,0.000
min_distance_to_shop,-0.0022,0.000,-11.151,0.000,-0.003,-0.002

0,1,2,3
Omnibus:,59018.398,Durbin-Watson:,1.848
Prob(Omnibus):,0.0,Jarque-Bera (JB):,712988.392
Skew:,1.306,Prob(JB):,0.0
Kurtosis:,12.625,Cond. No.,8120000.0


In [8]:
# create path
new_path = '../data/curated/2023_2027_rental_prediction_lr/'

if not os.path.exists(new_path):
    os.makedirs(new_path)

YEARS = [2023, 2024, 2025, 2026, 2027]

LOG_FEATURES = ['saving_rate', 'min_distance_to_prim', 
'min_distance_to_poli', 'min_distance_to_park', 'min_distance_to_second', 'min_distance_to_hosp', 'min_distance_to_cbd', 
'min_distance_to_shop', 'population_density', 'income_per_person', 
'crime_cases', 'min_distance_to_train', 'gdp']

for year in YEARS:
    prediction_set = pd.read_csv(f'../data/curated/2023_2027_data/{year}_data.csv')
    prediction_set.rename({'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'}, axis=1, inplace=True)
    final_prediction_set = prediction_set.copy(deep=True)

    # Convert the categorical variables to numerical
    for i in range(len(categorical_features)):
        new = le.fit_transform(prediction_set[categorical_features[i]])
        prediction_set[categorical_features[i]] = new
    
    # log transformation
    for log_feature in LOG_FEATURES:
        prediction_set[log_feature] = np.log(prediction_set[log_feature])
    
    prediction_set.drop(drop, axis=1, inplace=True)
    #prediction_set.drop(['sa2_2021', 'residence_type'], axis=1, inplace=True)
    prediction_set.dropna(inplace=True)

    # add missing sa2 that were present in training but not in testing, assign with 0s
    #missing = set(X_train.columns) - set(prediction_set.columns)
    #for c in missing:
    #    prediction_set[c] = 0
    
    # Ensure the order of column in the test set is in the same order than in train set
    prediction_set = prediction_set[X_train.columns]

    price_predictions = final_model.predict(prediction_set) # in log
    price_predictions = np.exp(price_predictions) # in actual price

    final_prediction_set['predicted_price'] = price_predictions
    final_prediction_set.to_csv(f'../data/curated/2023_2027_rental_prediction_lr/{year}_rental.csv', index=False)

### Add missing SA2 that were present in training but not in testing

In [5]:
"""
missing = set(X_train.columns) - set(prediction_set.columns)
for c in missing:
    prediction_set[c] = 0
# Ensure the order of column in the test set is in the same order than in train set
prediction_set = prediction_set[X_train.columns]
prediction_set
"""

'\nmissing = set(X_train.columns) - set(prediction_set.columns)\nfor c in missing:\n    prediction_set[c] = 0\n# Ensure the order of column in the test set is in the same order than in train set\nprediction_set = prediction_set[X_train.columns]\nprediction_set\n'

In [6]:
"""
price_predictions = final_model.predict(prediction_set)
prediction = prediction_set.copy(deep=True)
prediction['predicted_price'] = price_predictions
prediction
"""

"\nprice_predictions = final_model.predict(prediction_set)\nprediction = prediction_set.copy(deep=True)\nprediction['predicted_price'] = price_predictions\nprediction\n"

In [7]:
"""
prediction_sa2 = pd.DataFrame({'sa2_2021': sorted(prediction_set['sa2_2021'].astype(str).unique())})
prediction_sa2
dummies = pd.get_dummies(prediction_sa2, prefix='sa2')
dummies
dummies = dummies.reindex(columns=possible_sa2, fill_value=0)
dummies
"""

"\nprediction_sa2 = pd.DataFrame({'sa2_2021': sorted(prediction_set['sa2_2021'].astype(str).unique())})\nprediction_sa2\ndummies = pd.get_dummies(prediction_sa2, prefix='sa2')\ndummies\ndummies = dummies.reindex(columns=possible_sa2, fill_value=0)\ndummies\n"

In [8]:
"""
dtype = pd.CategoricalDtype(categories=possible_sa2)
cat = pd.Series(sorted(prediction_set['sa2_2021'].astype(str).unique()), dtype=dtype)
cat
pd.get_dummies(cat, prefix='sa2')
"""

"\ndtype = pd.CategoricalDtype(categories=possible_sa2)\ncat = pd.Series(sorted(prediction_set['sa2_2021'].astype(str).unique()), dtype=dtype)\ncat\npd.get_dummies(cat, prefix='sa2')\n"