In [121]:
import pandas as pd
from pathlib import Path

In [122]:
# Columns kept from the population and income tables: the postcode key plus
# one column per year.
_kept_cols = ['POSTCODE', '2016', '2017', '2018', '2019',
              '2020', '2021', '2022', '2023', '2024', '2025']

# The suburb column is called 'Sub' in this file; rename it to 'suburb'.
ddf = pd.read_csv('../data/curated/historical_data.csv').rename(columns={'Sub': 'suburb'})

# Population and income tables, reduced to the columns listed above.
popudf = pd.read_csv('../data/curated/subpopu.csv')[_kept_cols]
incomedf = pd.read_csv('../data/curated/subincome.csv')[_kept_cols]

In [123]:
# Year column labels shared by the wide population and income tables.
years = ['2016', '2017', '2018', '2019',
         '2020', '2021', '2022', '2023', '2024', '2025']

# Reshape the population table from wide (one column per year) to long
# (one row per POSTCODE and Year). A single melt replaces growing a frame
# with pd.concat inside a loop, which copied the accumulated data on every
# pass, relied on concatenating an empty placeholder frame (deprecated dtype
# handling) and assigned a column on a slice (SettingWithCopyWarning).
# Rows come out year by year in the original row order, columns are
# POSTCODE, Year, population, and Year holds the string labels from `years`.
# The result carries a fresh RangeIndex instead of repeated per-year labels.
new_popudf = popudf.melt(
    id_vars='POSTCODE',
    value_vars=years,
    var_name='Year',
    value_name='population',
)

In [124]:
# Reshape the income table from wide (one column per year) to long
# (one row per POSTCODE and Year). A single melt replaces the concat-in-a-loop
# pattern, which copied the accumulated frame on every pass, depended on
# concatenating an empty placeholder frame (deprecated dtype handling) and
# assigned a column on a slice (SettingWithCopyWarning).
# Rows come out year by year in the original row order, columns are
# POSTCODE, Year, income, and Year holds the string labels from `years`.
# The result carries a fresh RangeIndex instead of repeated per-year labels.
new_incomedf = incomedf.melt(
    id_vars='POSTCODE',
    value_vars=years,
    var_name='Year',
    value_name='income',
)

In [125]:
# Pair each income row with the population row for the same POSTCODE and Year;
# keys present in only one of the two tables are dropped (inner join).
df_merged = new_incomedf.merge(
    new_popudf,
    how='inner',
    on=['POSTCODE', 'Year'],
)

In [126]:
# Preview the income/population table joined on POSTCODE and Year.
df_merged

Unnamed: 0,POSTCODE,Year,income,population
0,3000,2016,57070.694268,41683.457670
1,3002,2016,124053.718371,3504.936595
2,3003,2016,101795.473902,6036.425502
3,3004,2016,101932.807511,8278.999222
4,3006,2016,75651.415597,20030.152420
...,...,...,...,...
6935,3990,2025,62340.958316,1060.507244
6936,3991,2025,62313.065693,1985.820064
6937,3992,2025,62337.260110,3952.866861
6938,3995,2025,62165.586604,8241.683396


In [127]:
# Only the suburb and postcode columns of the raw file are needed here.
sub_postcodeinfo = pd.read_csv('../data/raw/realestate.csv')[['suburb', 'postcode']]

# Grouping on both columns and resetting the index leaves one row per
# (suburb, postcode) pair found in the file; the postcode column is then
# renamed to POSTCODE to match the key used by the population/income tables.
sub_postcode = (
    sub_postcodeinfo
    .groupby(['suburb', 'postcode'])
    .count()
    .reset_index()
    .rename(columns={'postcode': 'POSTCODE'})
)

In [128]:
# Preview the suburb-to-POSTCODE lookup built from realestate.csv.
sub_postcode

Unnamed: 0,suburb,POSTCODE
0,Abbotsford,3067
1,Aberfeldie,3040
2,Adelaide,5000
3,Airport West,3042
4,Albanvale,3021
...,...,...
309,Windsor,3181
310,Wonga Park,3115
311,Woori Yallock,3139
312,Wyndham Vale,3024


In [129]:
# Attach suburb names through POSTCODE, then collapse to one row per
# (suburb, Year): population is summed and income is averaged over the
# postcodes mapped to each suburb.
df_merged = (
    df_merged
    .merge(sub_postcode, how='inner', on='POSTCODE')
    .loc[:, ['suburb', 'Year', 'population', 'income']]
    .groupby(['suburb', 'Year'])
    .agg(population=('population', 'sum'), income=('income', 'mean'))
    .reset_index()
)


In [130]:

# Convert the Year column to integers so it can be compared with numeric years.
df_merged = df_merged.astype({'Year': int})
df_merged

Unnamed: 0,suburb,Year,population,income
0,Abbotsford,2016,8843.391594,72463.341789
1,Abbotsford,2017,9360.969572,75929.483052
2,Abbotsford,2018,9595.265665,78784.652725
3,Abbotsford,2019,9661.113233,81883.887314
4,Abbotsford,2020,9738.337037,86715.715407
...,...,...,...,...
3115,Yarraville,2021,15910.505666,91911.558135
3116,Yarraville,2022,15908.663387,96980.309295
3117,Yarraville,2023,15907.936920,102997.167982
3118,Yarraville,2024,15906.499919,110186.147542


In [131]:
# Rows with Year up to and including 2020 go to df_model;
# rows with a later Year go to df_predict.
_cutoff_year = 2020
_year_values = df_merged['Year']
df_model = df_merged.loc[_year_values.le(_cutoff_year)]
df_predict = df_merged.loc[_year_values.gt(_cutoff_year)]

In [132]:
# Sorted, de-duplicated suburb names that occur in ddf, as a Series named
# 'suburb' with a fresh 0..n-1 index.
subdf = (
    ddf['suburb']
    .dropna()
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)
subdf

0        Abbotsford
1       Albert Park
2         Alfredton
3        Alphington
4            Altona
           ...     
211    Williamstown
212         Windsor
213         Wodonga
214    Yarra Ranges
215      Yarraville
Name: suburb, Length: 216, dtype: object

In [133]:

# Keep only the prediction rows whose suburb also appears in subdf (inner join).
df_predict = df_predict.merge(subdf, how='inner', on='suburb')

In [134]:
# Sorted list of the distinct values found in ddf's Type column.
typelist = sorted(ddf['Type'].dropna().unique())
typelist

['1 bedroom flat',
 '2 bedroom flat',
 '2 bedroom house',
 '3 bedroom flat',
 '3 bedroom house',
 '4 bedroom house',
 'All properties']

In [135]:
# Repeat the prediction rows once per entry in typelist, tagging each copy
# with that entry in a 'Type' column. The copies are stacked in typelist
# order, so the row order matches the previous loop-based version.
#
# The frames are built in a comprehension and concatenated once. This avoids
# rebinding the builtin name `type` in the notebook namespace, re-copying the
# accumulated frame on every iteration, and concatenating an empty placeholder
# frame (deprecated dtype handling in pandas).
_typed_frames = [
    df_predict.assign(Type=property_type) for property_type in typelist
]

if _typed_frames:
    newdf_all = pd.concat(_typed_frames, axis=0, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame with the
    # expected columns, which is what the loop produced for an empty typelist.
    newdf_all = pd.DataFrame(
        columns=['suburb', 'Year', 'population', 'income', 'Type']
    )


In [136]:
# Write the prediction rows to CSV without the index column, then show the
# number of rows written.
filepath = Path('../data/curated/ques2predictdata.csv')
newdf_all.to_csv(path_or_buf=filepath, index=False)
newdf_all.shape[0]

5040

In [137]:

# Join the rows of ddf onto the suburb/year rows in df_model; only
# (Year, suburb) pairs present in both frames are kept (inner join).
df_model = df_model.merge(ddf, how='inner', on=['Year', 'suburb'])

In [138]:
# Write the modelling rows to CSV without the index column, then show the
# number of rows written.
filepath = Path('../data/curated/ques2data.csv')
df_model.to_csv(path_or_buf=filepath, index=False)
df_model.shape[0]

5040