### Linear Regression Model to Predict Future Year Prices (selecting from all features)
#### Read in all merged CSV data files

In [1]:
import pandas as pd
import glob
import os

# Directory that holds the merged CSV files; adjust to your own checkout if needed.
path = r'../data/curated/merged_dataset/'
all_files = glob.glob(os.path.join(path, "*.csv"))

# Read every file in sorted filename order and stack the frames row-wise
# under a fresh 0..n-1 index.
yearly_frames = [
    pd.read_csv(csv_file, index_col=None, header=0)
    for csv_file in sorted(all_files)
]
merged_df = pd.concat(yearly_frames, axis=0, ignore_index=True)

# Discard the columns that none of the models below use.
merged_df = merged_df.drop(columns=['address', 'latitude', 'longitude', 'postcode', 'sa2_2016'])


In [2]:
# Show the raw column names before any renaming or encoding.
print(merged_df.columns)

# Shorten the two macro-economic column names so they are easy to reference later.
merged_df = merged_df.rename(
    columns={'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'}
)

# Drop incomplete rows BEFORE one-hot encoding: get_dummies (dummy_na=False by default)
# turns a missing category into an all-zero dummy row, which a later dropna() can no
# longer detect, so such rows would silently stay in the training data.
merged_df = merged_df.dropna()

# One-hot encode both categorical columns in one pass (same resulting column order as two
# successive calls). dtype is pinned to uint8 so the dummies are numeric on every pandas
# version: pandas >= 2.0 defaults to bool, and a frame mixing bool and numeric columns is
# converted to an object array, which statsmodels refuses to fit.
merged_df = pd.get_dummies(
    data=merged_df,
    columns=['sa2_2021', 'residence_type'],
    prefix=['sa2', 'resiType'],
    dtype='uint8',
)

# Response variable shared by every model in this notebook.
y = merged_df['weekly_rent']
merged_df

Index(['year', 'sa2_2021', 'residence_type', 'nbed', 'nbath', 'ncar',
       'min_distance_to_cbd', 'min_distance_to_park', 'min_distance_to_prim',
       'min_distance_to_second', 'min_distance_to_train',
       'min_distance_to_hosp', 'min_distance_to_poli', 'min_distance_to_shop',
       'weekly_rent', 'gdp(USD Millioins)', 'saving_rate(% of GDP)',
       'income_per_person', 'population_density', 'crime_cases'],
      dtype='object')


Unnamed: 0,year,nbed,nbath,ncar,min_distance_to_cbd,min_distance_to_park,min_distance_to_prim,min_distance_to_second,min_distance_to_train,min_distance_to_hosp,...,sa2_217031473,sa2_217031474,sa2_217031475,sa2_217031476,sa2_217041477,sa2_217041478,sa2_217041479,sa2_217041480,resiType_Apartment,resiType_House
0,2013,2.0,1.0,0,227.97163,23.16035,7.35747,16.96507,35.56825,21.35025,...,0,0,0,0,0,0,0,0,0,1
1,2013,2.0,1.0,0,223.66084,5.71742,6.50536,6.76794,7.54355,7.42972,...,0,0,0,0,0,0,0,0,0,1
2,2013,2.0,1.0,0,243.25680,5.11222,0.20027,36.72106,50.85341,36.63541,...,0,0,0,0,0,0,0,0,0,1
3,2013,4.0,2.0,0,140.35827,78.32509,10.66523,11.91899,11.26906,177.44731,...,0,0,0,0,0,0,0,0,0,1
4,2013,1.0,1.0,0,13.86135,0.93250,1.32931,3.49174,2.20800,177.44731,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172030,2022,3.0,1.0,1,293.28053,0.56012,1.21809,114.77016,90.08591,140.56888,...,0,0,0,0,0,0,0,0,0,1
172031,2022,3.0,2.0,2,258.29111,3.49087,5.08707,3.60570,8.37185,2.60312,...,0,0,0,0,0,0,1,0,0,1
172032,2022,2.0,2.0,1,9.47077,2.45011,1.33931,1.62322,3.63291,140.56888,...,0,0,0,0,0,0,0,0,0,1
172033,2022,1.0,1.0,1,1.84933,0.65199,1.10438,1.27940,1.87840,140.56888,...,0,0,0,0,0,0,0,0,1,0


#### Prepare a null model that uses only Year, SA2 code, and Residence Type to predict weekly rent

In [3]:
# Features that forward selection may add on top of the null predictors.
# The list order is the order in which they are tried.
all_candidates = [
    'nbed', 'nbath', 'ncar',
    'min_distance_to_cbd', 'min_distance_to_park',
    'min_distance_to_prim', 'min_distance_to_second',
    'min_distance_to_train', 'min_distance_to_hosp', 'min_distance_to_poli',
    'min_distance_to_shop',
    'gdp', 'saving_rate', 'income_per_person',
    'population_density', 'crime_cases',
]

# Null predictors: the year plus every column whose name contains 'sa2' or 'resiType'
# (i.e. the one-hot dummies created for SA2 code and residence type).
sa2_columns = [col for col in merged_df.columns if 'sa2' in col]
resi_type_columns = [col for col in merged_df.columns if 'resiType' in col]
all_X = merged_df[['year'] + sa2_columns + resi_type_columns]
all_X

Unnamed: 0,year,sa2_201011001,sa2_201011002,sa2_201011005,sa2_201011006,sa2_201011007,sa2_201011008,sa2_201011481,sa2_201011482,sa2_201011483,...,sa2_217031473,sa2_217031474,sa2_217031475,sa2_217031476,sa2_217041477,sa2_217041478,sa2_217041479,sa2_217041480,resiType_Apartment,resiType_House
0,2013,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2013,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2013,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2013,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2013,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172030,2022,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
172031,2022,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
172032,2022,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
172033,2022,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [4]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Fit the null model: weekly rent regressed on the null predictors in all_X.
# No constant column is added explicitly; the design matrix is used as-is.
null_ols = sm.OLS(endog=y, exog=all_X)
all_model = null_ols.fit()

# Keep the summary table so it can be printed in a separate cell.
all_summary = all_model.summary()

In [5]:
# Report the null model's fit. Its AIC is the starting value that the
# forward-selection loop compares candidate models against.
null_aic = all_model.aic
print(all_summary)
print(f'AIC = {null_aic}')

                            OLS Regression Results                            
Dep. Variable:            weekly_rent   R-squared:                       0.344
Model:                            OLS   Adj. R-squared:                  0.342
Method:                 Least Squares   F-statistic:                     177.4
Date:                Sat, 01 Oct 2022   Prob (F-statistic):               0.00
Time:                        12:55:52   Log-Likelihood:            -1.1250e+06
No. Observations:              172018   AIC:                         2.251e+06
Df Residuals:                  171511   BIC:                         2.256e+06
Df Model:                         506                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
year                  15.7899      0

#### (All Features) Forward Selection: at each iteration, add the candidate predictor that gives the lowest AIC

In [6]:
# Greedy forward selection on AIC.
#
# Starting from the null design matrix `all_X`, each round fits one OLS model per remaining
# candidate (candidate column + current design) and keeps the candidate with the lowest AIC,
# provided it beats the best AIC so far. The loop stops when no candidate improves the AIC
# or when there are no candidates left.
#
# NOTE: this cell extends `all_X` and shrinks `all_candidates` as it goes, so re-run the
# cell that defines them before re-running this one.
AIC_dict = {}
last_min = all_model.aic  # AIC of the null model is the value to beat
candidates = []           # accepted features, in the order they were selected

while all_candidates:
    # Score the remaining candidates from scratch so that AICs of already-selected
    # features from earlier rounds cannot take part in this round's comparison.
    AIC_dict = {}
    for x in all_candidates:
        print(f"trying feature {x}")
        forward_X = pd.concat([merged_df[x], all_X], axis=1)
        AIC_dict[x] = sm.OLS(y, forward_X).fit().aic
        print(f"AIC = {AIC_dict[x]}")

    # Ties resolve to the candidate tried first (dict insertion order).
    min_aic_key = min(AIC_dict, key=AIC_dict.get)
    min_aic = AIC_dict[min_aic_key]

    if min_aic >= last_min:
        # No remaining feature lowers the AIC: selection is finished.
        break

    candidates.append(min_aic_key)
    all_candidates.remove(min_aic_key)
    last_min = min_aic
    # Prepend the accepted feature so the next round builds on the enlarged design.
    all_X = pd.concat([merged_df[min_aic_key], all_X], axis=1)

    print('step: ' + str(len(candidates)))
    print(candidates)
    print('new AIC: ' + str(min_aic))
    print('===============')

# Summarise the model that was actually selected. Printing the last trial fit instead would
# show a model containing a candidate that was just rejected.
model = sm.OLS(y, all_X).fit()
print(model.summary())

trying feature nbed
AIC = 2228061.226176497
trying feature nbath
AIC = 2231950.7863880983
trying feature ncar
AIC = 2247480.8785356395
trying feature min_distance_to_cbd
AIC = 2251020.022487711
trying feature min_distance_to_park
AIC = 2251025.6663582614
trying feature min_distance_to_prim
AIC = 2250864.5711536095
trying feature min_distance_to_second
AIC = 2251004.0412225747
trying feature min_distance_to_train
AIC = 2251037.603824883
trying feature min_distance_to_hosp
AIC = 2251014.0363956895
trying feature min_distance_to_poli
AIC = 2251003.839566804
trying feature min_distance_to_shop
AIC = 2250988.378571484
trying feature gdp
AIC = 2250842.530379278
trying feature saving_rate
AIC = 2250750.75297408
trying feature income_per_person
AIC = 2251053.0494748796
trying feature population_density
AIC = 2251042.840718355
trying feature crime_cases
AIC = 2251015.7235069326
step: 1
['nbed']
new AIC: 2228061.226176497
trying feature nbath
AIC = 2219723.777660318
trying feature ncar
AIC = 222

In [6]:
# Expose the forward-selection result under a constant-style name.
# This is the same list object as `candidates`, not a copy.
SELECTED_FEATURES = candidates
# Display the features alphabetically; sorted() returns a new list, so the
# selection order stored in SELECTED_FEATURES is left untouched.
sorted(SELECTED_FEATURES)

['crime_cases',
 'gdp',
 'income_per_person',
 'min_distance_to_cbd',
 'min_distance_to_hosp',
 'min_distance_to_poli',
 'min_distance_to_prim',
 'min_distance_to_shop',
 'min_distance_to_train',
 'nbath',
 'nbed',
 'ncar',
 'population_density',
 'saving_rate']

In [7]:
# Build the final design matrix and fit the model that gets exported.
#
# The forward-selection loop prepends every accepted feature to `all_X`, so concatenating
# SELECTED_FEATURES unconditionally would put each selected regressor into the model twice
# (perfectly collinear duplicate columns with split coefficients). Only features that are
# not in `all_X` yet are added, and any duplicates left by an earlier run of this cell are
# removed, which also makes the cell safe to re-run.
# NOTE(review): anything that feeds this model at prediction time must supply exactly
# these columns in this order -- confirm against the consumer of the exported model.
missing_features = [feat for feat in SELECTED_FEATURES if feat not in all_X.columns]
if missing_features:
    all_X = pd.concat([merged_df[missing_features], all_X], axis=1)
all_X = all_X.loc[:, ~all_X.columns.duplicated()]

lm = sm.OLS(y, all_X).fit()

In [9]:
import pickle
# Persist the fitted model, then load it back as a round-trip check.
# Context managers close the file handles deterministically; the write handle in
# particular is flushed and closed before the file is opened again for reading.
MODEL_PATH = '../web/lr_rental_model.pkl'

with open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(lm, model_file)

# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that you created yourself.
with open(MODEL_PATH, 'rb') as model_file:
    model = pickle.load(model_file)