## This notebook merges external attributes for 2023 - 2027

In [1]:
import pandas as pd
import os
import geopandas as gpd

In [1]:
# Directory that receives the merged per-year CSV files written by this notebook.
new_path = '../../data/curated/2023_2027_data/'

# exist_ok=True makes this a no-op when the directory is already present, so no
# separate existence check (and no check-then-create race) is needed.
os.makedirs(new_path, exist_ok=True)

In [11]:
# The 2022 dataset has the most values of postcode, sa2_2021 and sa2_2016, and
# includes all values of those attributes from previous years, so its
# postcode / SA2 combinations are used as the base table for further prediction.
data = pd.read_csv('../../data/curated/merged_dataset/2022_merged_data.csv', index_col=[0])

# Keep one row per distinct (postcode, sa2_2021, sa2_2016) combination.
# The index read from the file is discarded (rather than turned into a column
# and dropped by name) and the de-duplicated rows are renumbered from 0.
post_sa2 = (
    data[['postcode', 'sa2_2021', 'sa2_2016']]
    .reset_index(drop=True)
    .drop_duplicates()
    .reset_index(drop=True)
)
post_sa2

Unnamed: 0,postcode,sa2_2021,sa2_2016
0,3012,213031351,213031351
1,3664,204011058,204011058
2,3124,207011149,207011149
3,3269,217031475,217031475
4,3048,210051247,210051247
...,...,...,...
1261,3123,207011149,207011149
1262,3335,211041270,211041270
1263,3127,207031165,207031165
1264,3068,207031163,207031163


In [12]:
# Read the predicted external attributes, one CSV per feature.

# GDP / saving predictions: read as-is, without an index column.
pred_gdp_sr = pd.read_csv('../../data/curated/feature_prediction/21_27_gdp_with_saving_predicted.csv')

# Income per person predictions: first CSV column becomes the index.
pred_income = pd.read_csv(
    '../../data/curated/feature_prediction/20_27_income_per_person_2016sa2.csv',
    index_col=[0],
)

# Population predictions: rename the SA2 key so it matches the `sa2_2021` column name.
pred_population = pd.read_csv(
    '../../data/curated/feature_prediction/22_27_population.csv',
    index_col=[0],
).rename(columns={'SA2 code': 'sa2_2021'})

# Crime case predictions: rename the postcode key to lower case for the same reason.
pred_crime_cases = pd.read_csv(
    '../../data/curated/feature_prediction/23_27_crime_case.csv',
    index_col=[0],
).rename(columns={'Postcode': 'postcode'})

In [13]:
year = [2023, 2024, 2025, 2026, 2027]


def _value_for_year(frame, year_col, target_year, value_col):
    """Return the first `value_col` entry of `frame` whose `year_col` equals `target_year`.

    Raises:
        ValueError: if `frame` has no row for `target_year`.
    """
    matches = frame.loc[frame[year_col] == target_year, value_col]
    if matches.empty:
        raise ValueError(f"no '{value_col}' value found for year {target_year}")
    return matches.iloc[0]


def _values_by_key(base, lookup, key, value_col):
    """Look up `value_col` in `lookup` for every row of `base`, matching on `key`.

    Returns:
        A NumPy array with one entry per row of `base`, in `base` row order.
        Rows whose key does not occur in `lookup` get a missing value.

    Raises:
        pandas.errors.MergeError: if `key` is duplicated in `lookup`. A duplicated
            key would expand the left merge and shift values onto the wrong rows.
    """
    merged = base[[key]].merge(
        lookup[[key, value_col]],
        on=key,
        how='left',
        validate='many_to_one',
    )
    # validate guarantees one output row per `base` row, so positional
    # assignment is safe and does not depend on index alignment.
    return merged[value_col].to_numpy()


def build_external_attributes(target_year, base, gdp_sr, income, population, crime_cases):
    """Build the merged external-attribute table for a single year.

    Args:
        target_year: year to select from each prediction table.
        base: frame holding the `postcode`, `sa2_2021` and `sa2_2016` columns.
        gdp_sr: frame with `year`, `gdp` and `saving` columns.
        income: frame with `Year`, `sa2_2016` and `income_per_person_sa2` columns.
        population: frame with `year`, `sa2_2021` and `pred` columns.
        crime_cases: frame with `Year`, `postcode` and `crime_count` columns.

    Returns:
        A new DataFrame; `base` is not modified.
    """
    # Work on a copy of only the key columns, so the shared base table never
    # accumulates columns from a previous year or an earlier run of this cell.
    result = base[['postcode', 'sa2_2021', 'sa2_2016']].copy()
    result['year'] = target_year

    # GDP and saving rate are single values per year, broadcast to every row.
    # NOTE(review): 'Millioins' looks like a typo for 'Millions'; the column name is
    # kept verbatim because other files may match on it -- confirm before renaming.
    result['gdp(USD Millioins)'] = _value_for_year(gdp_sr, 'year', target_year, 'gdp')
    result['saving_rate(% of GDP)'] = _value_for_year(gdp_sr, 'year', target_year, 'saving')

    # Income per person, keyed by the 2016 SA2 code.
    income_data = income.loc[income['Year'] == target_year]
    result['income_per_person'] = _values_by_key(
        result, income_data, 'sa2_2016', 'income_per_person_sa2'
    )

    # Population prediction, keyed by the 2021 SA2 code.
    population_data = population.loc[population['year'] == target_year]
    result['population_density'] = _values_by_key(result, population_data, 'sa2_2021', 'pred')

    # Crime cases, keyed by postcode.
    crime_cases_data = crime_cases.loc[crime_cases['Year'] == target_year]
    result['crime_cases'] = _values_by_key(result, crime_cases_data, 'postcode', 'crime_count')

    return result


for i in year:
    data = build_external_attributes(
        i, post_sa2, pred_gdp_sr, pred_income, pred_population, pred_crime_cases
    )

    # output csv file of merged external data
    data.to_csv(f'../../data/curated/2023_2027_data/{i}_data.csv')