In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression

In [2]:
# Load the curated per-suburb history table. future_median_predict_linear
# reads its 'Suburb', 'date' and 'median_rental_price' columns.
subs_w_hist = pd.read_csv("../data/curated/suburb_w_hist.csv")

In [3]:
def future_median_predict_linear(df, forecast_years=(2025, 2026, 2027),
                                 start_date='2021-01-01', min_entries=10):
    """Forecast each suburb's median rental price with a per-suburb linear trend.

    For every suburb, an ordinary least-squares line is fitted to
    ``median_rental_price`` against the calendar year of ``date``, using only
    observations on or after ``start_date``. The line is then extrapolated to
    each year in ``forecast_years``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Suburb'``, ``'date'`` and
        ``'median_rental_price'``. Prices that cannot be converted to numbers
        and dates that cannot be parsed are dropped.
    forecast_years : iterable of int, default (2025, 2026, 2027)
        Years to forecast. Each becomes one output column, labelled with the
        year as a string (e.g. ``'2025'``).
    start_date : str or datetime-like, default '2021-01-01'
        Only observations dated on or after this value are used for fitting.
    min_entries : int, default 10
        Minimum number of usable observations (after cleaning and after the
        ``start_date`` filter) a suburb needs in order to be forecast.

    Returns
    -------
    pandas.DataFrame
        One row per forecast suburb, sorted by suburb, with the columns
        ``'Suburb'`` followed by one float column per forecast year. If no
        suburb qualifies, an empty frame with those columns is returned.

    Notes
    -----
    Suburbs are skipped (not forecast) when they have fewer than
    ``min_entries`` rows in the fitting window, or when those rows all fall in
    a single calendar year, because a trend over years cannot be estimated
    from one year.
    """
    forecast_years = [int(year) for year in forecast_years]
    year_labels = [str(year) for year in forecast_years]
    output_columns = ['Suburb'] + year_labels

    # Same feature name as the training frame so the fitted model sees
    # consistent column names at predict time.
    future_X = pd.DataFrame({'year': forecast_years})

    # Parse prices and dates once for the whole table instead of once per
    # suburb; .assign returns a new frame, so the caller's df is not mutated.
    cleaned = df.assign(
        median_rental_price=pd.to_numeric(df['median_rental_price'], errors='coerce'),
        date=pd.to_datetime(df['date'], errors='coerce'),
    ).dropna(subset=['median_rental_price', 'date'])

    # Keep only the fitting window, then derive the regressor. Using .assign
    # on the filtered frame avoids writing into a slice (SettingWithCopyWarning).
    recent = cleaned[cleaned['date'] >= start_date]
    recent = recent.assign(year=recent['date'].dt.year)

    records = []
    # groupby sorts suburbs and ignores rows whose 'Suburb' is missing.
    for suburb, suburb_df in recent.groupby('Suburb', sort=True):
        # The size guard must apply to the rows that are actually fitted;
        # checking before the date filter lets stale suburbs reach fit()
        # with too few (or zero) samples.
        if len(suburb_df) < min_entries:
            continue

        # A single distinct year gives a constant regressor: no slope can be
        # identified, so extrapolating would just repeat that year's mean.
        if suburb_df['year'].nunique() < 2:
            continue

        model = LinearRegression()
        model.fit(suburb_df[['year']], suburb_df['median_rental_price'])
        predicted = model.predict(future_X)

        record = {'Suburb': suburb}
        record.update(zip(year_labels, (float(price) for price in predicted)))
        records.append(record)

    # Passing columns explicitly keeps the schema stable even when every
    # suburb was skipped and records is empty.
    return pd.DataFrame(records, columns=output_columns)

In [4]:
# Forecast table: one row per suburb, one column per forecast year.
tmp = future_median_predict_linear(subs_w_hist)

In [5]:
# Bare expression so the notebook renders the forecast frame as the cell output.
tmp

Unnamed: 0,Suburb,2025,2026,2027
0,Abbotsford,651.526316,707.684211,763.842105
1,Albert Park,665.000000,710.000000,755.000000
2,Alfredton,446.217105,466.644737,487.072368
3,Alphington,488.802632,513.868421,538.934211
4,Altona,527.171053,561.447368,595.723684
...,...,...,...,...
225,Williamstown,651.513158,689.342105,727.171053
226,Windsor,594.986842,641.657895,688.328947
227,Wodonga,484.868421,516.578947,548.289474
228,Yarra Glen,546.006579,578.171053,610.335526
