### Fit the selected features to a linear regression model to predict future rental prices

In [1]:
import pandas as pd
import glob
import os

# Folder containing the curated CSV files that are stacked into the training frame.
path = r'../data/curated/merged_dataset/' # use your path
all_files = glob.glob(os.path.join(path , "*.csv"))

# Read the files in sorted-name order and concatenate them row-wise with a fresh 0..n-1 index.
frames = [
    pd.read_csv(csv_file, index_col=None, header=0)
    for csv_file in sorted(all_files)
]
merged_df = pd.concat(frames, axis=0, ignore_index=True)

# Columns removed from the merged frame before any encoding or modelling.
dropped_columns = [
    'address', 'latitude', 'longitude', 'postcode', 'sa2_2016',
    'min_distance_to_park', 'min_distance_to_second',
]
merged_df = merged_df.drop(columns=dropped_columns)

# Sorted distinct SA2 codes (as strings) found in the merged data; the count is printed below.
possible_sa2 = sorted(merged_df['sa2_2021'].astype(str).unique())
print(len(possible_sa2))

# Feature shortlist; distance to park and to secondary school were dropped.
# NOTE(review): this list is not referenced by the later cells shown in this notebook.
SELECTED_FEATURES = [
    'nbed', 'nbath', 'ncar', 'saving_rate', 'min_distance_to_prim',
    'min_distance_to_poli', 'min_distance_to_hosp', 'min_distance_to_cbd',
    'min_distance_to_shop', 'population_density', 'income_per_person',
    'crime_cases', 'min_distance_to_train', 'gdp',
]



506


In [2]:
# Prepare the training frame in one chain:
#   1. shorten the two macro-economic column names (the keys are presumably the raw
#      CSV header spellings, so they are left exactly as written),
#   2. one-hot encode the SA2 code and the residence type, keeping every level,
#   3. discard every row that still contains a missing value.
merged_df = (
    merged_df
    .rename(columns={'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'})
    .pipe(pd.get_dummies, columns=['sa2_2021'], prefix='sa2')
    .pipe(pd.get_dummies, columns=['residence_type'], prefix='resiType')
    .dropna()
)

# Target: weekly rent. Features: every other column, in sorted-name order so the
# prediction frames can later be aligned to the same column layout.
y_train = merged_df['weekly_rent']
feature_names = sorted(merged_df.columns.drop('weekly_rent'))
X_train = merged_df[feature_names]
X_train

Unnamed: 0,crime_cases,gdp,income_per_person,min_distance_to_cbd,min_distance_to_hosp,min_distance_to_poli,min_distance_to_prim,min_distance_to_shop,min_distance_to_train,nbath,...,sa2_217031473,sa2_217031474,sa2_217031475,sa2_217031476,sa2_217041477,sa2_217041478,sa2_217041479,sa2_217041480,saving_rate,year
0,86.0,1536454,39683.563449,227.97163,21.35025,22.04660,7.35747,9.35209,35.56825,1.0,...,0,0,0,0,0,0,0,0,6.861393,2013
1,36.0,1536454,47222.702327,223.66084,7.42972,6.28177,6.50536,9.35209,7.54355,1.0,...,0,0,0,0,0,0,0,0,6.861393,2013
2,86.0,1536454,39683.563449,243.25680,36.63541,0.08478,0.20027,9.35209,50.85341,1.0,...,0,0,0,0,0,0,0,0,6.861393,2013
3,1288.0,1536454,43556.283562,140.35827,177.44731,84.47341,10.66523,9.35209,11.26906,2.0,...,0,0,0,0,0,0,0,0,6.861393,2013
4,1923.0,1536454,86103.411528,13.86135,177.44731,84.47341,1.32931,3.96501,2.20800,1.0,...,0,0,0,0,0,0,0,0,6.861393,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172030,281.0,3305754,54365.266130,293.28053,140.56888,74.35608,1.21809,13.64920,90.08591,1.0,...,0,0,0,0,0,0,0,0,12.839000,2022
172031,3049.0,3305754,60828.473189,258.29111,2.60312,74.35608,5.08707,13.64920,8.37185,2.0,...,0,0,0,0,0,0,1,0,12.839000,2022
172032,759.0,3305754,98756.492866,9.47077,140.56888,74.35608,1.33931,1.97636,3.63291,2.0,...,0,0,0,0,0,0,0,0,12.839000,2022
172033,1788.0,3305754,71305.473808,1.84933,140.56888,74.35608,1.10438,13.64920,1.87840,1.0,...,0,0,0,0,0,0,0,0,12.839000,2022


In [3]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Ordinary least squares of weekly rent on every column of X_train.
# sm.OLS does not add an intercept column by itself and none is added here.
# NOTE(review): the saved summary reports Cond. No. 6.22e+23; every dummy level is kept
# during encoding, so the regressors are strongly collinear and individual coefficients
# should be read with care.
ols_spec = sm.OLS(endog=y_train, exog=X_train)
final_model = ols_spec.fit()
final_model.summary()

0,1,2,3
Dep. Variable:,weekly_rent,R-squared:,0.455
Model:,OLS,Adj. R-squared:,0.454
Method:,Least Squares,F-statistic:,275.6
Date:,"Tue, 04 Oct 2022",Prob (F-statistic):,0.0
Time:,21:04:17,Log-Likelihood:,-1109000.0
No. Observations:,172018,AIC:,2219000.0
Df Residuals:,171497,BIC:,2224000.0
Df Model:,520,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
crime_cases,-0.0014,0.000,-2.890,0.004,-0.002,-0.000
gdp,-5.733e-06,2.11e-06,-2.714,0.007,-9.87e-06,-1.59e-06
income_per_person,0.0009,0.000,3.292,0.001,0.000,0.001
min_distance_to_cbd,0.3880,0.061,6.330,0.000,0.268,0.508
min_distance_to_hosp,0.2905,0.041,7.059,0.000,0.210,0.371
min_distance_to_poli,-0.3692,0.070,-5.290,0.000,-0.506,-0.232
min_distance_to_prim,0.8830,0.106,8.365,0.000,0.676,1.090
min_distance_to_shop,-1.5834,0.408,-3.881,0.000,-2.383,-0.784
min_distance_to_train,-0.0863,0.031,-2.797,0.005,-0.147,-0.026

0,1,2,3
Omnibus:,151106.953,Durbin-Watson:,1.865
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5784158.202
Skew:,4.175,Prob(JB):,0.0
Kurtosis:,30.153,Cond. No.,6.22e+23


In [4]:
# Predict weekly rent for each forecast year with the fitted model and write one CSV per year.
new_path = '../data/curated/2023_2027_rental_prediction/'

# exist_ok=True creates the folder when needed without a separate existence check.
os.makedirs(new_path, exist_ok=True)

YEARS = [2023, 2024, 2025, 2026, 2027]

for year in YEARS:
    prediction_set = pd.read_csv(f'../data/curated/2023_2027_data/{year}_data.csv')
    prediction_set = prediction_set.drop(columns=['min_distance_to_park', 'min_distance_to_second'])
    prediction_set = prediction_set.rename(
        columns={'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'}
    )

    # Keep the un-encoded rows: this copy is what gets written out, with the prediction attached.
    final_prediction_set = prediction_set.copy(deep=True)

    # Same encoding steps as the training frame: one-hot SA2 and residence type, then drop
    # rows with any missing value.
    prediction_set = pd.get_dummies(data=prediction_set, columns=['sa2_2021'], prefix='sa2')
    prediction_set = pd.get_dummies(data=prediction_set, columns=['residence_type'], prefix='resiType')
    prediction_set = prediction_set.dropna()

    # Align to the training layout in a single step: columns seen in training but absent
    # here are added as all-zero columns, columns unknown to the model are discarded, and
    # the column order matches the training set. One reindex replaces inserting hundreds of
    # columns one at a time, which is slow and fragments the frame.
    prediction_set = prediction_set.reindex(columns=X_train.columns, fill_value=0)

    price_predictions = final_model.predict(prediction_set)
    # Assignment aligns on the row index, so rows removed by dropna() above are expected to
    # get a missing predicted_price in the output. NOTE(review): confirm predict() returns
    # a Series indexed like prediction_set.
    final_prediction_set['predicted_price'] = price_predictions
    # Reuse new_path so the folder that is created and the folder written to cannot diverge.
    final_prediction_set.to_csv(os.path.join(new_path, f'{year}_rental.csv'), index=False)

### Add missing SA2 dummy columns that were present in the training set but not in the prediction set

In [5]:
"""
missing = set(X_train.columns) - set(prediction_set.columns)
for c in missing:
    prediction_set[c] = 0
# Ensure the order of column in the test set is in the same order than in train set
prediction_set = prediction_set[X_train.columns]
prediction_set
"""

'\nmissing = set(X_train.columns) - set(prediction_set.columns)\nfor c in missing:\n    prediction_set[c] = 0\n# Ensure the order of column in the test set is in the same order than in train set\nprediction_set = prediction_set[X_train.columns]\nprediction_set\n'

In [6]:
"""
price_predictions = final_model.predict(prediction_set)
prediction = prediction_set.copy(deep=True)
prediction['predicted_price'] = price_predictions
prediction
"""

"\nprice_predictions = final_model.predict(prediction_set)\nprediction = prediction_set.copy(deep=True)\nprediction['predicted_price'] = price_predictions\nprediction\n"

In [7]:
"""
prediction_sa2 = pd.DataFrame({'sa2_2021': sorted(prediction_set['sa2_2021'].astype(str).unique())})
prediction_sa2
dummies = pd.get_dummies(prediction_sa2, prefix='sa2')
dummies
dummies = dummies.reindex(columns=possible_sa2, fill_value=0)
dummies
"""

"\nprediction_sa2 = pd.DataFrame({'sa2_2021': sorted(prediction_set['sa2_2021'].astype(str).unique())})\nprediction_sa2\ndummies = pd.get_dummies(prediction_sa2, prefix='sa2')\ndummies\ndummies = dummies.reindex(columns=possible_sa2, fill_value=0)\ndummies\n"

In [8]:
"""
dtype = pd.CategoricalDtype(categories=possible_sa2)
cat = pd.Series(sorted(prediction_set['sa2_2021'].astype(str).unique()), dtype=dtype)
cat
pd.get_dummies(cat, prefix='sa2')
"""

"\ndtype = pd.CategoricalDtype(categories=possible_sa2)\ncat = pd.Series(sorted(prediction_set['sa2_2021'].astype(str).unique()), dtype=dtype)\ncat\npd.get_dummies(cat, prefix='sa2')\n"