In [33]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d

import statsmodels.api as sm
from statsmodels.formula.api import ols

import geopandas as gpd

output_dir = '../data/curated/'


In [2]:
properties_df = pd.read_csv("../data/curated/properties_processed.csv", index_col=0)
properties_df['Postcode'] = properties_df['Postcode'].astype(str)

# read in census dataframes
census_df = pd.read_csv("../data/curated/census_data.csv")
sa2_postcode_map = pd.read_csv("../data/curated/sa2_postcode_mapping_2021.csv")

In [3]:
def convert_census_to_postcode(census_df, sa2_postcode_map, agg_func):
    ''' Inputs census data as indexed by SA2 and converts it to postcode through aggregation
    '''

    census_df_postcode = sa2_postcode_map.merge(census_df, on='sa2_2021').drop('sa2_2021', axis=1)
    census_df_postcode = census_df_postcode[census_df_postcode['postcode_2021'] >= 3000]

    census_df_postcode_agg = census_df_postcode.groupby('postcode_2021').agg(
        tot_population_11 = pd.NamedAgg(column='Tot_persons_C11_P', aggfunc=sum),
        tot_population_16 = pd.NamedAgg(column='Tot_persons_C16_P', aggfunc=sum),
        tot_population_21 = pd.NamedAgg(column='Tot_persons_C21_P', aggfunc=sum),
        avg_med_mortg_rep_11 = pd.NamedAgg(column='Med_mortg_rep_mon_C2011', aggfunc=agg_func),
        avg_med_mortg_rep_16 = pd.NamedAgg(column='Med_mortg_rep_mon_C2016', aggfunc=agg_func),
        avg_med_mortg_rep_21 = pd.NamedAgg(column='Med_mortg_rep_mon_C2021', aggfunc=agg_func),
        avg_med_person_inc_11 = pd.NamedAgg(column='Med_person_inc_we_C2011', aggfunc=agg_func),
        avg_med_person_inc_16 = pd.NamedAgg(column='Med_person_inc_we_C2016', aggfunc=agg_func),
        avg_med_person_inc_21 = pd.NamedAgg(column='Med_person_inc_we_C2021', aggfunc=agg_func),
        avg_med_rent_16 = pd.NamedAgg(column='Med_rent_weekly_C2011', aggfunc=agg_func),
        avg_med_rent_11 = pd.NamedAgg(column='Med_rent_weekly_C2016', aggfunc=agg_func),
        avg_med_rent_21 = pd.NamedAgg(column='Med_rent_weekly_C2021', aggfunc=agg_func),
        avg_med_hh_inc_16 = pd.NamedAgg(column='Med_tot_hh_inc_wee_C2011', aggfunc=agg_func),
        avg_med_hh_inc_11 = pd.NamedAgg(column='Med_tot_hh_inc_wee_C2016', aggfunc=agg_func),
        avg_med_hh_inc_21 = pd.NamedAgg(column='Med_tot_hh_inc_wee_C2021', aggfunc=agg_func),
        tot_avg_hh_size_16 = pd.NamedAgg(column='Average_hh_size_C2011', aggfunc=agg_func),
        tot_avg_hh_size_11 = pd.NamedAgg(column='Average_hh_size_C2016', aggfunc=agg_func),
        tot_avg_hh_size_21 = pd.NamedAgg(column='Average_hh_size_C2021', aggfunc=agg_func),
    ).reset_index()

    return census_df_postcode_agg

In [4]:
mean_no_zero = lambda lst: round(np.mean([x for x in lst if x > 0]), 2)
census_df_postcode = convert_census_to_postcode(census_df, sa2_postcode_map, mean_no_zero)

In [5]:
median_rent = properties_df.groupby(["Postcode"])["Cost"].median()


In [46]:
impute_features = ['tot_population', 'avg_med_mortg_rep', 'avg_med_person_inc', 'avg_med_rent', 'avg_med_hh_inc', 'tot_avg_hh_size']
init_years = [11, 16, 21]
new_years = list(range(11, 22))

for i in range(len(census_df_postcode['postcode_2021'])):
    data = census_df_postcode.iloc[i]

    # create interpolated dataframe 
    interpolated_dict = dict()
    interpolated_dict['date'] = [f'20{year}' for year in new_years]
    for feature in impute_features:
        init_y = data[[f'{feature}_{init_years[0]}', f'{feature}_{init_years[1]}', f'{feature}_{init_years[2]}']].tolist()
        
        interp_func = interp1d(init_years, init_y)
        new_y = list(interp_func(new_years))

        interpolated_dict[feature] = new_y
    interpolated_df = pd.DataFrame(interpolated_dict)
    
    print(interpolated_df)
    
    break


    date  tot_population  avg_med_mortg_rep  avg_med_person_inc  avg_med_rent  \
0   2011        124551.0           2213.380             862.180       447.060   
1   2012        133074.0           2178.780            1786.508       436.800   
2   2013        141597.0           2144.180            2710.836       426.540   
3   2014        150120.0           2109.580            3635.164       416.280   
4   2015        158643.0           2074.980            4559.492       406.020   
5   2016        167166.0           2040.380            5483.820       395.760   
6   2017        169417.6           2040.342            5680.608       400.246   
7   2018        171669.2           2040.304            5877.396       404.732   
8   2019        173920.8           2040.266            6074.184       409.218   
9   2020        176172.4           2040.228            6270.972       413.704   
10  2021        178424.0           2040.190            6467.760       418.190   

    avg_med_hh_inc  tot_avg

In [14]:
median_rent.to_csv(f'{output_dir}median_rental_postcode.csv')