### Linear Regression Model to Predict Future Year Prices
#### Read in all merged csv data files

In [1]:
import pandas as pd
import glob
import os

# Folder that holds the merged CSV files.
path = r'../data/curated/merged_dataset/' # use your path
all_files = glob.glob(os.path.join(path , "*.csv"))

# Read the files in sorted filename order so the stacked row order is reproducible.
li = [
    pd.read_csv(csv_path, index_col=None, header=0)
    for csv_path in sorted(all_files)
]

# Stack every frame under a fresh 0..n-1 index, then discard the
# address / coordinate / postcode / sa2_2016 columns.
merged_df = pd.concat(li, axis=0, ignore_index=True)
merged_df = merged_df.drop(
    columns=['address', 'latitude', 'longitude', 'postcode', 'sa2_2016']
)


In [2]:
# Show the raw column names before anything is renamed or encoded.
print(merged_df.columns)

# NOTE: this cell consumes the 'sa2_2021' and 'residence_type' columns, so it can
# only run once per load of merged_df; re-run the loading cell before re-running it.

# Shorten the two macro-economic column names (the keys are the exact names
# found in the CSV files, typo included).
merged_df = merged_df.rename(
    columns={'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'}
)

# One-hot encode the two categorical columns.
# dtype is pinned to uint8 on purpose: from pandas 2.0 get_dummies defaults to
# bool, and a frame mixing bool and float columns is converted to an object
# array, which sm.OLS refuses. uint8 reproduces the 0/1 columns of older pandas.
merged_df = pd.get_dummies(data=merged_df, columns=['sa2_2021'], prefix='sa2', dtype='uint8')
merged_df = pd.get_dummies(data=merged_df, columns=['residence_type'], prefix='resiType', dtype='uint8')

# Drop every row that still has a missing value.
# To inspect the per-column NaN counts first, run:
#   with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#       print(merged_df.isna().sum())
merged_df = merged_df.dropna()

# Response variable, taken after dropna so it stays aligned with the predictors.
y = merged_df['weekly_rent']
merged_df

Index(['year', 'sa2_2021', 'residence_type', 'nbed', 'nbath', 'ncar',
       'min_distance_to_cbd', 'min_distance_to_park', 'min_distance_to_prim',
       'min_distance_to_second', 'min_distance_to_train',
       'min_distance_to_hosp', 'min_distance_to_poli', 'min_distance_to_shop',
       'weekly_rent', 'gdp(USD Millioins)', 'saving_rate(% of GDP)',
       'income_per_person', 'population_density', 'crime_cases'],
      dtype='object')


Unnamed: 0,year,nbed,nbath,ncar,min_distance_to_cbd,min_distance_to_park,min_distance_to_prim,min_distance_to_second,min_distance_to_train,min_distance_to_hosp,...,sa2_217031473,sa2_217031474,sa2_217031475,sa2_217031476,sa2_217041477,sa2_217041478,sa2_217041479,sa2_217041480,resiType_Apartment,resiType_House
0,2013,2.0,1.0,0,227.97163,23.16035,7.35747,16.96507,35.56825,21.35025,...,0,0,0,0,0,0,0,0,0,1
1,2013,2.0,1.0,0,223.66084,5.71742,6.50536,6.76794,7.54355,7.42972,...,0,0,0,0,0,0,0,0,0,1
2,2013,2.0,1.0,0,243.25680,5.11222,0.20027,36.72106,50.85341,36.63541,...,0,0,0,0,0,0,0,0,0,1
3,2013,4.0,2.0,0,140.35827,78.32509,10.66523,11.91899,11.26906,177.44731,...,0,0,0,0,0,0,0,0,0,1
4,2013,1.0,1.0,0,13.86135,0.93250,1.32931,3.49174,2.20800,177.44731,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172030,2022,3.0,1.0,1,293.28053,0.56012,1.21809,114.77016,90.08591,140.56888,...,0,0,0,0,0,0,0,0,0,1
172031,2022,3.0,2.0,2,258.29111,3.49087,5.08707,3.60570,8.37185,2.60312,...,0,0,0,0,0,0,1,0,0,1
172032,2022,2.0,2.0,1,9.47077,2.45011,1.33931,1.62322,3.63291,140.56888,...,0,0,0,0,0,0,0,0,0,1
172033,2022,1.0,1.0,1,1.84933,0.65199,1.10438,1.27940,1.87840,140.56888,...,0,0,0,0,0,0,0,0,1,0


#### Prepare a null model that uses only SA2 code and Year to predict weekly rent

In [3]:
# External predictors that the forward-selection step is allowed to add.
external_candidates = [
    'min_distance_to_cbd',
    'min_distance_to_park',
    'min_distance_to_prim',
    'min_distance_to_second',
    'min_distance_to_train',
    'min_distance_to_hosp',
    'min_distance_to_poli',
    'min_distance_to_shop',
    'gdp',
    'saving_rate',
    'income_per_person',
    'population_density',
    'crime_cases',
]

# Null design matrix: 'year' followed by every column whose name contains 'sa2'.
sa2_columns = merged_df.filter(regex='sa2').columns.tolist()
external_X = merged_df[['year'] + sa2_columns]  # null predictors
external_X

Unnamed: 0,year,sa2_201011001,sa2_201011002,sa2_201011005,sa2_201011006,sa2_201011007,sa2_201011008,sa2_201011481,sa2_201011482,sa2_201011483,...,sa2_217031471,sa2_217031472,sa2_217031473,sa2_217031474,sa2_217031475,sa2_217031476,sa2_217041477,sa2_217041478,sa2_217041479,sa2_217041480
0,2013,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2013,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2013,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2013,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2013,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172030,2022,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
172031,2022,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
172032,2022,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
172033,2022,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Ordinary least squares fit of y on the null predictors in external_X.
external_model = sm.OLS(endog=y, exog=external_X).fit()
# Keep the summary table so a later cell can print it.
ex_summary = external_model.summary()

In [5]:
# Print the null-model summary table, then its AIC on the following line.
print(ex_summary, f'AIC = {external_model.aic}', sep='\n')

                            OLS Regression Results                            
Dep. Variable:            weekly_rent   R-squared:                       0.276
Model:                            OLS   Adj. R-squared:                  0.274
Method:                 Least Squares   F-statistic:                     129.3
Date:                Sat, 01 Oct 2022   Prob (F-statistic):               0.00
Time:                        00:14:01   Log-Likelihood:            -1.1335e+06
No. Observations:              172018   AIC:                         2.268e+06
Df Residuals:                  171512   BIC:                         2.273e+06
Df Model:                         505                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
year             16.4989      0.150    110.352

#### (External Features) Forward Selection - to add the most useful predictor that gives the lowest AIC at each iteration

In [6]:
# Forward selection over the external candidates.
# Each round fits one OLS model per remaining candidate (candidate + current
# predictors), keeps the candidate with the lowest AIC, and stops as soon as no
# candidate lowers the AIC of the model accepted so far.
#
# NOTE: the loop rebinds external_X and removes names from external_candidates,
# so re-run the cell that defines them before running this cell a second time.
AIC_dict = {}
last_min = external_model.aic
candidates = []
model = external_model  # model accepted so far; starts as the null model

while external_candidates:  # also stops cleanly once every candidate is used
    # Start from empty scores every round so AICs recorded for features that
    # were already selected cannot take part in this round's minimum.
    AIC_dict = {}
    best_key, best_model = None, None

    for x in external_candidates:
        print(f"trying feature {x}")
        forward_X = pd.concat([merged_df[x], external_X], axis=1)
        trial = sm.OLS(y, forward_X).fit()
        AIC_dict[x] = trial.aic
        print(f"AIC = {trial.aic}")
        # Strict '<' keeps the earliest candidate on ties.
        if best_model is None or trial.aic < best_model.aic:
            best_key, best_model = x, trial

    min_aic_key, min_aic = best_key, best_model.aic

    if not (min_aic < last_min):
        # No remaining candidate improves on the accepted model.
        break

    candidates.append(min_aic_key)
    external_candidates.remove(min_aic_key)
    last_min = min_aic
    external_X = pd.concat([merged_df[min_aic_key], external_X], axis=1)
    model = best_model

    print('step: ' + str(len(candidates)))
    print(candidates)
    print('new AIC: ' + str(min_aic))
    print('===============')

# Report the model that was actually selected (not the last rejected trial).
print(model.summary())

trying feature min_distance_to_cbd
AIC = 2267906.5610746816
trying feature min_distance_to_park
AIC = 2267916.2974868347
trying feature min_distance_to_prim
AIC = 2267759.032077996
trying feature min_distance_to_second
AIC = 2267900.2442881893
trying feature min_distance_to_train
AIC = 2267942.524172069
trying feature min_distance_to_hosp
AIC = 2267895.1244748244
trying feature min_distance_to_poli
AIC = 2267909.274542001
trying feature min_distance_to_shop
AIC = 2267925.44202691
trying feature gdp
AIC = 2267858.2622013083
trying feature saving_rate
AIC = 2267753.4502940155
trying feature income_per_person
AIC = 2267939.327232675
trying feature population_density
AIC = 2267942.052153629
trying feature crime_cases
AIC = 2267915.7511970247
step: 1
['saving_rate']
new AIC: 2267753.4502940155
trying feature min_distance_to_cbd
AIC = 2267716.5494933375
trying feature min_distance_to_park
AIC = 2267734.651811574
trying feature min_distance_to_prim
AIC = 2267585.948688329
trying feature min_d

#### (Internal Features) Forward Selection - to add the most useful predictor that gives the lowest AIC at each iteration

In [None]:
# Property-level predictors that the forward-selection step is allowed to add.
internal_candidates = ['nbed', 'nbath', 'ncar']

# Baseline design matrix: 'year', then every column whose name contains 'sa2',
# then every column whose name contains 'resiType'.
baseline_columns = (
    ['year']
    + merged_df.filter(regex='sa2').columns.tolist()
    + merged_df.filter(regex='resiType').columns.tolist()
)
internal_X = merged_df[baseline_columns]  # null predictors

# Fit the baseline model and keep its summary table.
internal_model = sm.OLS(y, internal_X).fit()
in_summary = internal_model.summary()
internal_X

Unnamed: 0,resiType_Apartment,resiType_House,nbed,nbath,ncar
0,0,1,2.0,1.0,0
1,0,1,2.0,1.0,0
2,0,1,2.0,1.0,0
3,0,1,4.0,2.0,0
4,1,0,1.0,1.0,0
...,...,...,...,...,...
172030,0,1,3.0,1.0,1
172031,0,1,3.0,2.0,2
172032,0,1,2.0,2.0,1
172033,1,0,1.0,1.0,1


In [None]:
# Forward selection over the internal candidates.
# Each round fits one OLS model per remaining candidate (candidate + current
# predictors), keeps the candidate with the lowest AIC, and stops as soon as no
# candidate lowers the AIC of the model accepted so far.
#
# NOTE: the loop rebinds internal_X and removes names from internal_candidates,
# so re-run the cell that defines them before running this cell a second time.
AIC_dict = {}
last_min =  internal_model.aic
candidates = []
model = internal_model  # model accepted so far; starts as the baseline model

while internal_candidates:  # also stops cleanly once every candidate is used
    # Start from empty scores every round so AICs recorded for features that
    # were already selected cannot take part in this round's minimum.
    AIC_dict = {}
    best_key, best_model = None, None

    for x in internal_candidates:
        print(f"trying feature {x}")
        forward_X = pd.concat([merged_df[x], internal_X], axis=1)
        trial = sm.OLS(y, forward_X).fit()
        AIC_dict[x] = trial.aic
        print(f"AIC = {trial.aic}")
        # Strict '<' keeps the earliest candidate on ties.
        if best_model is None or trial.aic < best_model.aic:
            best_key, best_model = x, trial

    min_aic_key, min_aic = best_key, best_model.aic

    if not (min_aic < last_min):
        # No remaining candidate improves on the accepted model.
        break

    candidates.append(min_aic_key)
    internal_candidates.remove(min_aic_key)
    last_min = min_aic
    internal_X = pd.concat([merged_df[min_aic_key], internal_X], axis=1)
    model = best_model

    print('step: ' + str(len(candidates)))
    print(candidates)
    print('new AIC: ' + str(min_aic))
    print('===============')

# Report the model that was actually selected (not the last rejected trial).
print(model.summary())

In [None]:
# Disabled cell: everything below is a single triple-quoted string, so none of
# it executes. The quoted loop reads each file in parent_dir, applies the
# rename / one-hot / dropna steps, and calls ols_regression(), which in the
# visible part of this notebook is only defined inside another string literal.
"""
import os
from collections import defaultdict
parent_dir = "../data/curated/merged_dataset"
ex_predictor_dict = defaultdict(int)
in_predictor_dict = defaultdict(int)
for filename in sorted(os.listdir(parent_dir)):
    print(filename, "----------------------------------------------------------------------------------------------------------------")
    merged_df = pd.read_csv(parent_dir + '/' + filename).drop(['address', 'latitude', 'longitude', 'postcode', 'sa2_2016'], axis=1)
    #print(merged_df.columns)
    #pd.get_dummies(merged_df['sa2_2021'])
    merged_df.rename({'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'}, axis=1, inplace=True)
    merged_df = pd.get_dummies(data=merged_df, columns=['sa2_2021'], prefix='sa2')
    merged_df = pd.get_dummies(data=merged_df, columns=['residence_type'], prefix='resiType') 
    #merged_df.drop(['sa2_2021', 'residence_type'], axis=1, inplace=True)
    merged_df.dropna(inplace=True)

    # Implement ols regression to analyze the data
    ex_pvalues, in_pvalues = ols_regression(merged_df)
    
    new_csv_name = "../data/curated/significant/" + filename

    #put_signigicants_csv(external_preds, ex_pvalues, internal_preds, in_pvalues, new_csv_name, ex_predictor_dict, in_predictor_dict)
"""

In [None]:
# Disabled cell: the whole body is a single triple-quoted string, so
# ols_regression and put_signigicants_csv are never actually defined and the
# quoted imports never run.
"""
import regex
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Define external predictors
external_preds = ['sa2_2021', 'min_distance_to_cbd','min_distance_to_park', 'min_distance_to_prim', 'min_distance_to_second'\
    , 'min_distance_to_train', 'min_distance_to_hosp', 'min_distance_to_poli', 'min_distance_to_shop', 'gdp', 'saving_rate', 'income_per_person',\
        'crime_cases', 'population_density']

# Define internal predictors
internal_preds = ['residence_type', 'nbed', 'nbath', 'ncar']

# OLS regression function for Data analysis
def ols_regression(merged_df):
    # OLS Regression
    
    # Test Set
    y = merged_df['weekly_rent']
    # External ----------------------------------------------------------------------------------------------------
    external_X = merged_df.drop(list(merged_df.filter(regex='resiType')), axis=1)
    external_X.drop(['nbed', 'nbath', 'ncar', 'weekly_rent'], axis=1, inplace=True)

    external_model = sm.OLS(y, external_X).fit()

    # Summary of external factors
    ex_summary = external_model.summary()

    print("Summary of external factors: ")
    print(ex_summary)

    # Anova table of external factors
    #ex_anova_table = sm.stats.anova_lm(external_model, typ=2)

    #print("Anova table of external factors: ")
    #print(ex_anova_table)

    # Extract p-values for the predictors
    ex_pvals = external_model.pvalues[:]

    # Internal ----------------------------------------------------------------------------------------------------
    internal_X = merged_df[list(merged_df.filter(regex='resiType'))+['nbed', 'nbath', 'ncar']]
    internal_model = sm.OLS(y, internal_X).fit()

    # Summary of internal factors
    in_summary = internal_model.summary()

    print("Summary of internal factors: ")
    print(in_summary)

    # Anova table of internal factors
    #in_anova_table = sm.stats.anova_lm(internal_model, typ=2)

    #print("Anova table of internal factors: ")
    #print(in_anova_table)

    # Extract p-values for the predictors
    in_pvals = internal_model.pvalues[:]

    return ex_pvals, in_pvals

# csv file function
def put_signigicants_csv(external_preds, ex_pvalues, internal_preds, in_pvalues, new_csv, ex_predictor_dict, in_predictor_dict):
    print(ex_pvalues, in_pvalues)
    sign_ex_predictors = [external_preds[(pre-1)] for pre in range(1, len(ex_pvalues)) if ex_pvalues[pre] < 0.05]

    sign_in_predictors = [internal_preds[(pre-1)] for pre in range(1, len(in_pvalues)) if in_pvalues[pre] < 0.05]
    
    external_df = pd.DataFrame({'Significant External predictors': sign_ex_predictors})
    internal_df = pd.DataFrame({'Significant Internal predictors': sign_in_predictors})
    new = pd.concat([external_df, internal_df], axis=1) 

    print(new_csv)
    new.to_csv(new_csv, index=False)
    # record the frequencies of significant predictors
    for predictor in sign_ex_predictors:
        ex_predictor_dict[predictor] += 1
    for predictor in sign_in_predictors:
        in_predictor_dict[predictor] += 1
"""

In [None]:
# Disabled cell: everything below is a single triple-quoted string, so none of
# it executes. The quoted loop reads each file in parent_dir, applies the
# rename / one-hot / dropna steps, and calls ols_regression(), which in the
# visible part of this notebook is only defined inside another string literal.
"""
import os
from collections import defaultdict
parent_dir = "../data/curated/merged_dataset"
ex_predictor_dict = defaultdict(int)
in_predictor_dict = defaultdict(int)
for filename in sorted(os.listdir(parent_dir)):
    print(filename, "----------------------------------------------------------------------------------------------------------------")
    merged_df = pd.read_csv(parent_dir + '/' + filename).drop(['address', 'latitude', 'longitude', 'postcode', 'sa2_2016'], axis=1)
    #print(merged_df.columns)
    #pd.get_dummies(merged_df['sa2_2021'])
    merged_df.rename({'gdp(USD Millioins)': 'gdp', 'saving_rate(% of GDP)': 'saving_rate'}, axis=1, inplace=True)
    merged_df = pd.get_dummies(data=merged_df, columns=['sa2_2021'], prefix='sa2')
    merged_df = pd.get_dummies(data=merged_df, columns=['residence_type'], prefix='resiType') 
    #merged_df.drop(['sa2_2021', 'residence_type'], axis=1, inplace=True)
    merged_df.dropna(inplace=True)

    # Implement ols regression to analyze the data
    ex_pvalues, in_pvalues = ols_regression(merged_df)
    
    new_csv_name = "../data/curated/significant/" + filename

    #put_signigicants_csv(external_preds, ex_pvalues, internal_preds, in_pvalues, new_csv_name, ex_predictor_dict, in_predictor_dict)

"""

2013_merged_data.csv ----------------------------------------------------------------------------------------------------------------
Summary of external factors: 
                            OLS Regression Results                            
Dep. Variable:            weekly_rent   R-squared:                       0.404
Model:                            OLS   Adj. R-squared:                  0.370
Method:                 Least Squares   F-statistic:                     11.59
Date:                Fri, 30 Sep 2022   Prob (F-statistic):               0.00
Time:                        20:23:57   Log-Likelihood:                -52688.
No. Observations:                8566   AIC:                         1.063e+05
Df Residuals:                    8091   BIC:                         1.097e+05
Df Model:                         474                                         
Covariance Type:            nonrobust                                         
                             coef    std err  