In [568]:
import pandas as pd
import numpy as np

In [569]:
df = pd.read_csv('../../data/curated/AllProperties.csv')
df.loc[df['Suburb'] == 'Wanagaratta', 'Suburb'] = 'Wangaratta'

In [570]:
df.columns = df.columns.str.strip()

years = [2018, 2019, 2020, 2021, 2022, 2023]
df_weekly_rent = df[['Suburb']].copy()

for year in years:
    year_cols = [col for col in df.columns if col.startswith(str(year))]
    df[year_cols] = df[year_cols].apply(pd.to_numeric, errors='coerce')

    df_temp = df[year_cols].replace(0, np.nan)
    avg_without_zeros = df_temp.mean(axis=1, skipna=True)

    all_zero_mask = (df[year_cols] == 0).all(axis=1)
    column_name = f"{year}_weekly_rent"
    df_weekly_rent[column_name] = np.where(all_zero_mask, 0, avg_without_zeros)

df_weekly_rent

Unnamed: 0,Suburb,2018_weekly_rent,2019_weekly_rent,2020_weekly_rent,2021_weekly_rent,2022_weekly_rent,2023_weekly_rent
0,Albert Park-Middle Park-West St Kilda,737.375,720.875,702.750,711.250,684.375,665.5
1,Armadale,702.125,624.000,621.500,628.000,642.250,620.5
2,Carlton North,535.625,537.875,564.875,587.625,573.000,555.0
3,Carlton-Parkville,2418.875,2458.875,2663.875,2615.250,3372.250,3545.0
4,CBD-St Kilda Rd,5352.125,5629.750,6095.625,8814.125,9021.500,8963.0
...,...,...,...,...,...,...,...
141,Traralgon,691.375,621.500,594.375,593.750,633.500,644.0
142,Wangaratta,451.875,460.375,473.125,450.250,457.375,449.0
143,Warragul,398.875,420.625,465.875,459.750,465.250,468.5
144,Warrnambool,751.125,695.750,663.000,614.750,633.375,643.5


In [571]:
df_new = df_weekly_rent.copy()

population_years = [2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025, 2026]
for year in population_years:
    # initialise population columns
    population_column_name = f"{year}_population"
    df_new[population_column_name] = 0
    
    # initialise income columns
    # income_column_name = f"{year}_income"
    # df_new[income_column_name] = 0

df_new

Unnamed: 0,Suburb,2018_weekly_rent,2019_weekly_rent,2020_weekly_rent,2021_weekly_rent,2022_weekly_rent,2023_weekly_rent,2018_population,2019_population,2020_population,2021_population,2022_population,2023_population,2024_population,2025_population,2026_population
0,Albert Park-Middle Park-West St Kilda,737.375,720.875,702.750,711.250,684.375,665.5,0,0,0,0,0,0,0,0,0
1,Armadale,702.125,624.000,621.500,628.000,642.250,620.5,0,0,0,0,0,0,0,0,0
2,Carlton North,535.625,537.875,564.875,587.625,573.000,555.0,0,0,0,0,0,0,0,0,0
3,Carlton-Parkville,2418.875,2458.875,2663.875,2615.250,3372.250,3545.0,0,0,0,0,0,0,0,0,0
4,CBD-St Kilda Rd,5352.125,5629.750,6095.625,8814.125,9021.500,8963.0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,Traralgon,691.375,621.500,594.375,593.750,633.500,644.0,0,0,0,0,0,0,0,0,0
142,Wangaratta,451.875,460.375,473.125,450.250,457.375,449.0,0,0,0,0,0,0,0,0,0
143,Warragul,398.875,420.625,465.875,459.750,465.250,468.5,0,0,0,0,0,0,0,0,0
144,Warrnambool,751.125,695.750,663.000,614.750,633.375,643.5,0,0,0,0,0,0,0,0,0


In [572]:
numeric_cols = df_weekly_rent.select_dtypes(include=['number']).columns
has_zeros = (df_weekly_rent[numeric_cols] == 0).any().any()

print(f"Does df_weekly_rent have any zero values? {has_zeros}")

Does df_weekly_rent have any zero values? False


In [573]:
df_population = pd.read_csv('../../data/curated/population_by_sa2.csv')
df_income = pd.read_csv('../../data/curated/income_by_sa2.csv')

In [574]:
df_population

Unnamed: 0,Label,estimated_population_2018,estimated_population_2019,estimated_population_2020,estimated_population_2021,estimated_population_2022,estimated_population_2023,estimated_population_2024,estimated_population_2025,estimated_population_2026
0,Abbotsford,9527,9594,9672,9258,9513,10008,10527.676824,10914.578045,11189.069984
1,Airport West,8169,8390,8362,8240,8295,8464,8683.181491,8896.540332,9093.092637
2,Albert Park,16728,17081,16955,16011,16177,16861,17665.062455,18280.665009,18706.847174
3,Alexandra,6646,6687,6690,6771,6794,6836,6915.938566,7038.105197,7186.668076
4,Alfredton,13537,14434,15507,16841,18002,18997,19771.949269,20395.671019,20940.982750
...,...,...,...,...,...,...,...,...,...,...
512,Yarram,5437,5474,5545,5555,5588,5580,5594.979256,5653.608883,5754.511887
513,Yarraville,15991,16092,16068,15651,15661,16020,16523.217961,16987.482063,17369.407751
514,Yarrawonga,8297,8418,8508,8593,8727,8812,8901.992448,9023.455120,9184.454667
515,Yarriambiack,6639,6617,6583,6453,6376,6327,6344.344429,6418.401396,6530.575937


In [575]:
df_income

Unnamed: 0,Label,median_income_2016,median_income_2017,median_income_2018,median_income_2019,median_income_2020,median_income_2021,median_income_2022,median_income_2023,median_income_2024,median_income_2025,median_income_2026
0,Abbotsford,56001,56242,59923,61918,66091,69104.795585,71219.795864,74636.644356,77788.073345,80427.684311,83781.391430
1,Airport West,54708,56619,58350,60992,62920,65268.165154,68154.903250,70695.851426,73394.212934,76451.213723,79397.004301
2,Albert Park,64048,62922,64900,65880,68836,71758.449466,73918.640871,77000.919533,80239.921519,83055.592588,86331.495226
3,Alexandra,38251,39811,41230,42447,44033,46010.900933,47833.306333,49778.505143,51958.291329,54092.617842,56294.447393
4,Alfredton,49071,50119,52709,53765,55019,57905.793758,60042.535187,62020.613758,64882.959903,67482.620117,69928.862355
...,...,...,...,...,...,...,...,...,...,...,...,...
394,Yarram,40266,39960,42650,43228,44904,47529.419325,49053.429695,50953.269734,53495.237847,55533.119845,57691.004992
395,Yarraville,61906,63996,66465,69379,72428,75150.947880,78146.977115,81342.059481,84431.867671,87696.030832,91150.354200
396,Yarrawonga,40234,40400,41809,43914,45708,47484.605261,49661.654459,51747.588040,53812.175594,56123.344343,58452.379863
397,Yarriambiack,35901,37036,39000,42076,43823,45495.709284,47986.025739,49976.076330,51873.803656,54280.164801,56537.349739


In [576]:
# for i in df_population.Label:
#     if i not in df_income.Label.values:
#         print(i)

In [577]:
# for i in df_population.Label:
#     print(i, df_population[df_population.Label == i].estimated_population_2018)

In [578]:
with open('missing_suburbs.txt') as f:
    missing_suburbs = f.read().splitlines()

#### Add population data to new_df for each suburb

In [579]:
for label in df_population.Label:
    if label in missing_suburbs:
        continue
    found = False
    for suburb in df_weekly_rent.Suburb.values:
        if label == 'Carlton' and suburb == 'Carlton-Parkville' or \
        label == 'Keilor' and suburb == 'Keilor' or \
        label == 'Pascoe Vale' and suburb == 'Pascoe Vale-Coburg North' or \
        label == 'Bendigo Surrounds - North' and suburb == 'North Bendigo' or \
        label == 'East Bendigo - Kennington' and suburb == 'Flora Hill-Bendigo East' or \
        label == 'Brunswick East' and suburb == 'East Brunswick' or \
        label == 'Brunswick West' and suburb == 'West Brunswick' or \
        label == 'Carlton North - Princes Hill' and suburb == 'Carlton North' or \
        label == 'Dandenong - North' and suburb == 'Dandenong North-Endeavour Hills' or \
        label == 'Dandenong - South' and suburb == 'Dandenong' or \
        label == 'Gladstone Park - Westmeadows' and suburb == 'Gladstone Park-Tullamarine' or \
        label == 'Hawthorn East' and suburb == 'East Hawthorn' or \
        label == 'Highett (West) - Cheltenham' and suburb == 'Cheltenham' or \
        label == 'Ivanhoe East - Eaglemont' and suburb == 'Ivanhoe-Ivanhoe East' or \
        label == 'North Geelong - Bell Park' and suburb == 'North Geelong' or \
        label == 'Research - North Warrandyte' and suburb == 'Eltham-Research-Montmorency' or \
        label == 'Geelong West - Hamlyn Heights' and suburb == 'Herne Hill-Geelong West' or \
        label == 'St Kilda - Central' and suburb == 'St Kilda' or \
        label == 'St Kilda - West' and suburb == 'Albert Park-Middle Park-West St Kilda' or \
        label == 'St Kilda East' and suburb == 'East St Kilda' or \
        label == 'West Footscray - Tottenham' and suburb == 'West Footscray' or \
        'Berwick' in label and 'Berwick' in suburb or \
        'Bundoora' in label and 'Bundoora' in suburb or \
        'Clayton' in label and 'Clayton' in suburb or \
        'Craigieburn' in label and 'Craigieburn' in suburb or \
        'Croydon' in label and 'Croydon' in suburb or \
        'Doncaster East' in label and 'Doncaster East' in suburb or \
        'Endeavour Hills' in label and 'Endeavour Hills' in suburb or \
        'Epping' in label and 'Epping' in suburb or \
        'Essendon' in label and 'Essendon' in suburb or \
        'Ferntree Gully' in label and 'Ferntree Gully' in suburb or \
        'Glen Waverley' in label and 'Glen Waverley' in suburb or \
        'Glenroy' in label and 'Glenroy' in suburb or \
        'Hampton Park' in label and 'Hampton Park' in suburb or \
        'Hoppers Crossing' in label and 'Hoppers Crossing' in suburb or \
        'Kew' in label and 'Kew' in suburb or \
        'Lalor' in label and 'Lalor' in suburb or \
        'CBD' in label and 'CBD' in suburb or \
        'Mildura' in label and 'Mildura' in suburb or \
        'Mill Park' in label and 'Mill Park' in suburb or \
        'Mornington' in label and 'Mornington' in suburb or \
        'Mount Waverley' in label and 'Mount Waverley' in suburb or \
        'Narre Warren' in label and 'Narre Warren' in suburb or \
        'Noble Park' in label and 'Noble Park' in suburb or \
        'Northcote' in label and 'Northcote' in suburb or \
        'Pakenham' in label and 'Pakenham' in suburb or \
        'Preston' in label and 'Preston' in suburb or \
        'Reservoir' in label and 'Reservoir' in suburb or \
        'Richmond' in label and 'Richmond' in suburb or \
        'Rowville' in label and 'Rowville' in suburb or \
        'Roxburgh' in label and 'Roxburgh' in suburb or \
        'Shepparton' in label and 'Shepparton' in suburb or \
        'South Yarra' in label and 'South Yarra' in suburb or \
        'Southbank' in label and 'Southbank' in suburb or \
        'Albans' in label and 'Albans' in suburb or \
        'Surrey' in label and 'Surrey' in suburb or \
        'Traralgon' in label and 'Traralgon' in suburb or \
        'Warrnambool' in label and 'Warrnambool' in suburb or \
        'Werribee' in label and 'Werribee' in suburb or \
        'West Melbourne' in label and 'West Melbourne' in suburb or \
        label in suburb:
            found = True
            df_new.loc[df_new['Suburb'] == suburb, '2018_population'] += df_population.loc[df_population['Label'] == label, 'estimated_population_2018'].values[0]
            df_new.loc[df_new['Suburb'] == suburb, '2019_population'] += df_population.loc[df_population['Label'] == label, 'estimated_population_2019'].values[0]
            df_new.loc[df_new['Suburb'] == suburb, '2020_population'] += df_population.loc[df_population['Label'] == label, 'estimated_population_2020'].values[0]
            df_new.loc[df_new['Suburb'] == suburb, '2021_population'] += df_population.loc[df_population['Label'] == label, 'estimated_population_2021'].values[0]
            df_new.loc[df_new['Suburb'] == suburb, '2022_population'] += df_population.loc[df_population['Label'] == label, 'estimated_population_2022'].values[0]
            df_new.loc[df_new['Suburb'] == suburb, '2023_population'] += df_population.loc[df_population['Label'] == label, 'estimated_population_2023'].values[0]
            df_new.loc[df_new['Suburb'] == suburb, '2024_population'] += df_population.loc[df_population['Label'] == label, 'estimated_population_2024'].values[0]
            df_new.loc[df_new['Suburb'] == suburb, '2025_population'] += df_population.loc[df_population['Label'] == label, 'estimated_population_2025'].values[0]
            df_new.loc[df_new['Suburb'] == suburb, '2026_population'] += df_population.loc[df_population['Label'] == label, 'estimated_population_2026'].values[0]
            break
    if not found:
        if 'west' in label.lower() or 'north' in label.lower() or 'south' in label.lower() or 'east' in label.lower() or 'central' in label.lower():
            print(label)

  df_new.loc[df_new['Suburb'] == suburb, '2024_population'] += df_population.loc[df_population['Label'] == label, 'estimated_population_2024'].values[0]
  df_new.loc[df_new['Suburb'] == suburb, '2025_population'] += df_population.loc[df_population['Label'] == label, 'estimated_population_2025'].values[0]
  df_new.loc[df_new['Suburb'] == suburb, '2026_population'] += df_population.loc[df_population['Label'] == label, 'estimated_population_2026'].values[0]


In [580]:
df_new

Unnamed: 0,Suburb,2018_weekly_rent,2019_weekly_rent,2020_weekly_rent,2021_weekly_rent,2022_weekly_rent,2023_weekly_rent,2018_population,2019_population,2020_population,2021_population,2022_population,2023_population,2024_population,2025_population,2026_population
0,Albert Park-Middle Park-West St Kilda,737.375,720.875,702.750,711.250,684.375,665.5,32378,32848,32603,30101,30459,32341,34541.805547,36158.063285,37163.846977
1,Armadale,702.125,624.000,621.500,628.000,642.250,620.5,88068,91543,94970,97644,100849,105094,109278.935733,112809.297134,115653.092418
2,Carlton North,535.625,537.875,564.875,587.625,573.000,555.0,29856,30023,29764,25354,26215,29907,33989.768124,36749.740277,38192.896506
3,Carlton-Parkville,2418.875,2458.875,2663.875,2615.250,3372.250,3545.0,8242,8291,8318,7428,7475,8003,8655.475805,9134.517212,9424.459783
4,CBD-St Kilda Rd,5352.125,5629.750,6095.625,8814.125,9021.500,8963.0,47554,49645,50297,43577,46906,54001,61054.901629,65537.764434,67852.135457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,Traralgon,691.375,621.500,594.375,593.750,633.500,644.0,28054,28371,28632,28744,29096,29284,29501.579354,29828.458316,30291.782863
142,Wangaratta,451.875,460.375,473.125,450.250,457.375,449.0,19471,19624,19716,19884,19969,19969,20034.054500,20227.345652,20533.557675
143,Warragul,398.875,420.625,465.875,459.750,465.250,468.5,20501,21283,22099,23098,23787,24417,25003.530068,25586.242179,26163.283094
144,Warrnambool,751.125,695.750,663.000,614.750,633.375,643.5,35180,35448,35762,35768,35909,36238,36769.093166,37420.458172,38125.251112


# Model

In [582]:
from sklearn.linear_model import LinearRegression

df = df_new.copy()
years = [2018, 2019, 2020, 2021, 2022, 2023]

# Melt the DataFrame to long format
rent_columns = [f"{year}_weekly_rent" for year in years]
population_columns = [f"{year}_population" for year in years + [2024, 2025, 2026]]

# Melt weekly rent data
df_long = pd.melt(
    df,
    id_vars=['Suburb'] + population_columns,
    value_vars=rent_columns,
    var_name='Year',
    value_name='Weekly_Rent'
)

# Extract year from the 'Year' column
df_long['Year'] = df_long['Year'].str.extract('(\d+)').astype(int)

  df_long['Year'] = df_long['Year'].str.extract('(\d+)').astype(int)


In [583]:
# Melt population data to long format
population_columns_years = [f"{year}_population" for year in years]
df_population_long = pd.melt(
    df,
    id_vars=['Suburb'],
    value_vars=population_columns_years,
    var_name='Year',
    value_name='Population'
)

# Extract year from the 'Year' column
df_population_long['Year'] = df_population_long['Year'].str.extract('(\d+)').astype(int)

# Merge the population data with the rent data
df_long = pd.merge(
    df_long,
    df_population_long,
    on=['Suburb', 'Year'],
    how='left'
)

# Sort the DataFrame
df_long = df_long.sort_values(by=['Suburb', 'Year']).reset_index(drop=True)


  df_population_long['Year'] = df_population_long['Year'].str.extract('(\d+)').astype(int)


In [584]:
# Years for prediction
future_years = [2024, 2025, 2026]

# Create a DataFrame to store predictions
predictions = []

# Group data by Suburb
for suburb, group in df_long.groupby('Suburb'):
    # Drop rows with missing values
    group = group.dropna(subset=['Weekly_Rent', 'Population'])
    
    # Check if we have enough data points
    if len(group) >= 2:
        # Prepare features (Year and Population)
        X = group[['Year', 'Population']]
        y = group['Weekly_Rent']
        
        # Fit linear regression model
        model = LinearRegression()
        model.fit(X, y)
        
        # Prepare future data
        future_population = df.loc[df['Suburb'] == suburb, [f"{year}_population" for year in future_years]].values.flatten()
        
        # Check if we have future population data
        if len(future_population) == len(future_years):
            # Create DataFrame for future predictions
            X_future = pd.DataFrame({
                'Year': future_years,
                'Population': future_population
            })
            
            # Predict future weekly rents
            y_pred = model.predict(X_future)
            
            # Store predictions
            for year, rent in zip(future_years, y_pred):
                predictions.append({
                    'Suburb': suburb,
                    'Year': year,
                    'Predicted_Weekly_Rent': rent
                })
    else:
        # Handle cases with insufficient data
        print(f"Not enough data for suburb: {suburb}")


In [588]:
df_predictions = pd.DataFrame(predictions)

# Pivot the predictions DataFrame to wide format
df_predictions_wide = df_predictions.pivot(index='Suburb', columns='Year', values='Predicted_Weekly_Rent').reset_index()

# Rename columns
df_predictions_wide.columns = ['Suburb'] + [f"{year}_weekly_rent" for year in future_years]

# Merge with the original DataFrame
df_final = pd.merge(df, df_predictions_wide, on='Suburb', how='left')

df_final

Unnamed: 0,Suburb,2018_weekly_rent,2019_weekly_rent,2020_weekly_rent,2021_weekly_rent,2022_weekly_rent,2023_weekly_rent,2018_population,2019_population,2020_population,2021_population,2022_population,2023_population,2024_population,2025_population,2026_population,2024_weekly_rent,2025_weekly_rent,2026_weekly_rent
0,Albert Park-Middle Park-West St Kilda,737.375,720.875,702.750,711.250,684.375,665.5,32378,32848,32603,30101,30459,32341,34541.805547,36158.063285,37163.846977,634.104394,608.999393,587.738863
1,Armadale,702.125,624.000,621.500,628.000,642.250,620.5,88068,91543,94970,97644,100849,105094,109278.935733,112809.297134,115653.092418,575.739949,560.950301,561.113479
2,Carlton North,535.625,537.875,564.875,587.625,573.000,555.0,29856,30023,29764,25354,26215,29907,33989.768124,36749.740277,38192.896506,532.903811,517.169798,510.541325
3,Carlton-Parkville,2418.875,2458.875,2663.875,2615.250,3372.250,3545.0,8242,8291,8318,7428,7475,8003,8655.475805,9134.517212,9424.459783,3937.241896,4312.341623,4644.769914
4,CBD-St Kilda Rd,5352.125,5629.750,6095.625,8814.125,9021.500,8963.0,47554,49645,50297,43577,46906,54001,61054.901629,65537.764434,67852.135457,8397.040589,8529.147286,9070.160329
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,Traralgon,691.375,621.500,594.375,593.750,633.500,644.0,28054,28371,28632,28744,29096,29284,29501.579354,29828.458316,30291.782863,616.119078,595.999368,553.045033
142,Wangaratta,451.875,460.375,473.125,450.250,457.375,449.0,19471,19624,19716,19884,19969,19969,20034.054500,20227.345652,20533.557675,449.222182,450.477226,455.046782
143,Warragul,398.875,420.625,465.875,459.750,465.250,468.5,20501,21283,22099,23098,23787,24417,25003.530068,25586.242179,26163.283094,478.781968,482.387650,485.735656
144,Warrnambool,751.125,695.750,663.000,614.750,633.375,643.5,35180,35448,35762,35768,35909,36238,36769.093166,37420.458172,38125.251112,582.713437,552.387693,521.107661
