### Fit the selected features to a linear regression model to predict future rental prices

In [1]:
import pandas as pd
import glob
import os
import numpy as np

# Folder that holds the merged CSV extracts used for training.
path = r'../data/curated/merged_dataset/' # use your path
all_files = glob.glob(os.path.join(path , "*.csv"))

# Read every extract in sorted filename order and stack them into a single frame.
li = [pd.read_csv(csv_path, index_col=None, header=0) for csv_path in sorted(all_files)]
merged_df = pd.concat(li, axis=0, ignore_index=True)

# Shorten the two macro-economic column names, then drop the address,
# coordinate, postcode and sa2_2016 columns.
merged_df = merged_df.rename(
    columns={'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'}
)
merged_df = merged_df.drop(columns=['address', 'latitude', 'longitude', 'postcode', 'sa2_2016'])

# Candidate predictors: every remaining column except the target, the year
# and the two categorical columns.
ALL_FEATURES = set(
    merged_df.drop(columns=['weekly_rent', 'year', 'residence_type', 'sa2_2021']).columns
)
SELECTED_FEATURES = {
    'crime_cases',
    'income_per_person',
    'min_distance_to_cbd',
    'min_distance_to_park',
    'min_distance_to_poli',
    'min_distance_to_prim',
    'min_distance_to_second',
    'min_distance_to_shop',
    'min_distance_to_train',
    'nbath',
    'nbed',
    'ncar',
    'population_density',
    'saving_rate',
}  # dropped gdp and distance to hospital

# Remove the candidate predictors that were not selected.
drop = ALL_FEATURES - SELECTED_FEATURES
merged_df = merged_df.drop(columns=list(drop))

# Distinct SA2 codes (as strings) present in the training data.
possible_sa2 = sorted(merged_df['sa2_2021'].astype(str).unique())
print(len(possible_sa2))

# Natural-log transform every selected feature except the three room/parking
# counts, and the target itself.
LOG_FEATURES = SELECTED_FEATURES - {'nbed', 'nbath', 'ncar'}
log_columns = [*LOG_FEATURES, 'weekly_rent']
merged_df[log_columns] = np.log(merged_df[log_columns])


506


In [2]:
# One-hot encode the two categorical columns in a single pass.
# dtype is pinned to uint8 (0/1) because the get_dummies default changed to bool
# in pandas 2.0; a frame mixing bool and float columns converts to an object
# array, which statsmodels' OLS refuses to fit.
# NOTE(review): every level of both categoricals is kept and the fit below adds
# no constant, so the two dummy groups are linearly dependent on each other;
# consider drop_first=True on one group before interpreting coefficients.
merged_df = pd.get_dummies(
    merged_df,
    columns=['sa2_2021', 'residence_type'],
    prefix={'sa2_2021': 'sa2', 'residence_type': 'resiType'},
    dtype='uint8',
)
# Discard rows with any missing value; returns a new frame instead of mutating in place.
merged_df = merged_df.dropna()

# Target (log weekly rent) and design matrix with columns in alphabetical order,
# so the prediction step can reproduce the exact same column layout.
y_train = merged_df['weekly_rent']
X_train = merged_df.drop(columns=['weekly_rent'])
X_train = X_train[sorted(X_train.columns)]
X_train

Unnamed: 0,crime_cases,income_per_person,min_distance_to_cbd,min_distance_to_park,min_distance_to_poli,min_distance_to_prim,min_distance_to_second,min_distance_to_shop,min_distance_to_train,nbath,...,sa2_217031473,sa2_217031474,sa2_217031475,sa2_217031476,sa2_217041477,sa2_217041478,sa2_217041479,sa2_217041480,saving_rate,year
0,4.454347,10.588692,5.429221,3.142442,3.093158,1.995716,2.831157,2.235600,3.571453,1.0,...,0,0,0,0,0,0,0,0,1.925910,2013
1,3.583519,10.762630,5.410131,1.743518,1.837652,1.872626,1.912197,2.235600,2.020693,1.0,...,0,0,0,0,0,0,0,0,1.925910,2013
2,4.454347,10.588692,5.494118,1.631634,-2.467696,-1.608089,3.603350,2.235600,3.928947,1.0,...,0,0,0,0,0,0,0,0,1.925910,2013
3,7.160846,10.681809,4.944198,4.360868,4.436437,2.366989,2.478133,2.235600,2.422061,2.0,...,0,0,0,0,0,0,0,0,1.925910,2013
4,7.561642,11.363304,2.629104,-0.069886,4.436437,0.284660,1.250400,1.377508,0.792087,1.0,...,0,0,0,0,0,0,0,0,1.925910,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172030,5.638355,10.903481,5.681130,-0.579604,4.308865,0.197284,4.742932,2.613681,4.500764,1.0,...,0,0,0,0,0,0,0,0,2.552487,2022
172031,8.022569,11.015813,5.554087,1.250151,4.308865,1.626702,1.282516,2.613681,2.124875,2.0,...,0,0,0,0,0,0,1,0,2.552487,2022
172032,6.632002,11.500412,2.248210,0.896133,4.308865,0.292155,0.484412,0.681257,1.290034,2.0,...,0,0,0,0,0,0,0,0,2.552487,2022
172033,7.488853,11.174728,0.614823,-0.427726,4.308865,0.099284,0.246391,2.613681,0.630420,1.0,...,0,0,0,0,0,0,0,0,2.552487,2022


In [3]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Ordinary least squares of the log weekly rent on the encoded design matrix.
ols_spec = sm.OLS(endog=y_train, exog=X_train)
final_model = ols_spec.fit()
final_model.summary()

0,1,2,3
Dep. Variable:,weekly_rent,R-squared:,0.616
Model:,OLS,Adj. R-squared:,0.615
Method:,Least Squares,F-statistic:,530.1
Date:,"Wed, 05 Oct 2022",Prob (F-statistic):,0.0
Time:,00:51:12,Log-Likelihood:,3965.2
No. Observations:,172018,AIC:,-6888.0
Df Residuals:,171497,BIC:,-1650.0
Df Model:,520,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
crime_cases,-0.0020,0.001,-2.031,0.042,-0.004,-7.06e-05
income_per_person,0.4667,0.024,19.232,0.000,0.419,0.514
min_distance_to_cbd,-0.0533,0.007,-7.805,0.000,-0.067,-0.040
min_distance_to_park,-0.0010,0.001,-1.434,0.152,-0.002,0.000
min_distance_to_poli,-0.0085,0.001,-8.108,0.000,-0.010,-0.006
min_distance_to_prim,0.0086,0.001,10.840,0.000,0.007,0.010
min_distance_to_second,0.0039,0.001,4.957,0.000,0.002,0.005
min_distance_to_shop,-0.0328,0.003,-12.270,0.000,-0.038,-0.028
min_distance_to_train,0.0039,0.001,4.016,0.000,0.002,0.006

0,1,2,3
Omnibus:,74060.809,Durbin-Watson:,1.86
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1396374.2
Skew:,1.605,Prob(JB):,0.0
Kurtosis:,16.584,Cond. No.,3.61e+20


In [4]:
# Destination folder for the per-year prediction files.
new_path = '../data/curated/2023_2027_rental_prediction/'
# exist_ok makes the cell safely re-runnable without a separate existence check.
os.makedirs(new_path, exist_ok=True)

YEARS = [2023, 2024, 2025, 2026, 2027]

# Candidate features that were left out of the fitted model.
drop = ALL_FEATURES - SELECTED_FEATURES
for year in YEARS:
    prediction_set = pd.read_csv(f'../data/curated/2023_2027_data/{year}_data.csv')
    prediction_set = prediction_set.rename(
        columns={'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'}
    )
    prediction_set = prediction_set.drop(columns=list(drop))

    # log transformation (same features as in training)
    for log_feature in LOG_FEATURES:
        prediction_set[log_feature] = np.log(prediction_set[log_feature])

    # Snapshot taken before encoding; this is the frame written to disk.
    final_prediction_set = prediction_set.copy(deep=True)

    # Encode the categoricals exactly as in training. dtype is pinned so the
    # design matrix stays numeric on pandas >= 2.0, where the default is bool.
    prediction_set = pd.get_dummies(
        prediction_set,
        columns=['sa2_2021', 'residence_type'],
        prefix={'sa2_2021': 'sa2', 'residence_type': 'resiType'},
        dtype='uint8',
    )

    # Align with the training design matrix in one step: columns seen in training
    # but absent here are added as all-zero, columns unknown to the model are
    # discarded, and the column order matches the training set.
    prediction_set = prediction_set.reindex(columns=X_train.columns, fill_value=0)

    # Drop unscorable rows only after narrowing to the model columns, so a gap in
    # an unused column does not cost a row its prediction. np.log(0) yields -inf,
    # which dropna alone would let through, so infinities are treated as missing.
    prediction_set = prediction_set.replace([np.inf, -np.inf], np.nan).dropna()

    price_predictions = final_model.predict(prediction_set) # in log
    price_predictions = np.exp(price_predictions) # in actual price

    # Index-aligned assignment: rows dropped above end up with NaN predicted_price.
    final_prediction_set['predicted_price'] = price_predictions
    final_prediction_set.to_csv(os.path.join(new_path, f'{year}_rental.csv'), index=False)

### Add missing SA2 dummy columns that were present in training but absent from the prediction set

In [5]:
# Disabled scratch code: the triple-quoted block below is a bare string
# expression, so running this cell only echoes the text and executes none of it.
# The column alignment it describes is performed inside the per-year prediction loop.
"""
missing = set(X_train.columns) - set(prediction_set.columns)
for c in missing:
    prediction_set[c] = 0
# Ensure the order of column in the test set is in the same order than in train set
prediction_set = prediction_set[X_train.columns]
prediction_set
"""

'\nmissing = set(X_train.columns) - set(prediction_set.columns)\nfor c in missing:\n    prediction_set[c] = 0\n# Ensure the order of column in the test set is in the same order than in train set\nprediction_set = prediction_set[X_train.columns]\nprediction_set\n'

In [6]:
# Disabled scratch code kept as a bare string literal; nothing inside it runs.
# Draft of a one-off prediction: unlike the per-year prediction loop, it stores
# the raw model output without applying np.exp.
"""
price_predictions = final_model.predict(prediction_set)
prediction = prediction_set.copy(deep=True)
prediction['predicted_price'] = price_predictions
prediction
"""

"\nprice_predictions = final_model.predict(prediction_set)\nprediction = prediction_set.copy(deep=True)\nprediction['predicted_price'] = price_predictions\nprediction\n"

In [7]:
# Disabled scratch code kept as a bare string literal; nothing inside it runs.
# Experiment: one-hot encode the distinct SA2 codes and reindex the columns to possible_sa2.
# NOTE(review): get_dummies(prefix='sa2') names columns 'sa2_<code>' whereas
# possible_sa2 holds the bare codes, so the reindex looks like it would match
# nothing - confirm before reviving this.
"""
prediction_sa2 = pd.DataFrame({'sa2_2021': sorted(prediction_set['sa2_2021'].astype(str).unique())})
prediction_sa2
dummies = pd.get_dummies(prediction_sa2, prefix='sa2')
dummies
dummies = dummies.reindex(columns=possible_sa2, fill_value=0)
dummies
"""

"\nprediction_sa2 = pd.DataFrame({'sa2_2021': sorted(prediction_set['sa2_2021'].astype(str).unique())})\nprediction_sa2\ndummies = pd.get_dummies(prediction_sa2, prefix='sa2')\ndummies\ndummies = dummies.reindex(columns=possible_sa2, fill_value=0)\ndummies\n"

In [8]:
# Disabled scratch code kept as a bare string literal; nothing inside it runs.
# Experiment: cast the distinct SA2 codes to a CategoricalDtype built from
# possible_sa2 before calling get_dummies - presumably so that every SA2 seen in
# training gets a dummy column even when it is absent from the prediction data.
"""
dtype = pd.CategoricalDtype(categories=possible_sa2)
cat = pd.Series(sorted(prediction_set['sa2_2021'].astype(str).unique()), dtype=dtype)
cat
pd.get_dummies(cat, prefix='sa2')
"""

"\ndtype = pd.CategoricalDtype(categories=possible_sa2)\ncat = pd.Series(sorted(prediction_set['sa2_2021'].astype(str).unique()), dtype=dtype)\ncat\npd.get_dummies(cat, prefix='sa2')\n"