**MODELLING**
-----------

In this notebook, we will build a model based on the historical rental data 

In [1]:
import pandas as pd

In [30]:
import pandas as pd

# Load the Excel file with multiple sheets
file_path = '../data/landing/Moving annual rent by suburb - March quarter 2023.xlsx'

sheets_to_load = ['1 bedroom flat', '2 bedroom flat', '3 bedroom house', '4 bedroom house', 'All properties']

# Read the Excel file into a dictionary of DataFrames without headers
dfs = pd.read_excel(file_path, sheet_name=sheets_to_load, header=None)

# Initialize an empty list to store reshaped DataFrames
df_list = []

# Iterate through each sheet and process the data
for sheet_name, df in dfs.items():
    print(f"Processing sheet: {sheet_name}")
    
    # Identify the start of the data
    start_row = 3  # Suburb names start from the 4th row (index 3)
    date_row = 1  # Dates are in the 2nd row (index 1)
    median_row = 2  # 'Median' labels are in the 3rd row (index 2)

    # Extract relevant parts of the DataFrame
    df_suburbs = df.iloc[start_row:, 1].reset_index(drop=True)  # Suburbs in the second column
    df_dates = df.iloc[date_row, 2::2].reset_index(drop=True)  # Dates start from the third column, every second column
    df_medians = df.iloc[median_row, 3::2].reset_index(drop=True)  # 'Median' labels are after each date column

    # Repeat the dates for each associated median column
    repeated_dates = df_dates.repeat(1).reset_index(drop=True)

    # Combine date and median columns into a multi-level index
    date_median_pairs = pd.MultiIndex.from_tuples(zip(repeated_dates, df_medians), names=["Date", "Metric"])

    # Select the data values starting from the 4th row and all date/median columns
    df_values = df.iloc[start_row:, 2:].reset_index(drop=True)

    # Ensure the correct number of columns is matched by dropping extra or filling missing data
    df_values = df_values.iloc[:, :len(date_median_pairs)]
    df_values.columns = date_median_pairs

    # Combine suburb and reshaped data
    df_values.insert(0, 'Suburb', df_suburbs)  # Ensure 'Suburb' is inserted as the first column

    # Flatten the MultiIndex columns to single-level if necessary
    df_values.columns = ['Suburb' if col[0] == 'Suburb' else col for col in df_values.columns]

    # Debugging: Check if 'Suburb' is present in DataFrame
    print("Columns after insertion:", df_values.columns.tolist())
    if 'Suburb' not in df_values.columns:
        print("Suburb column is not found in DataFrame after insertion.")
        continue

    # Convert MultiIndex columns to a list of tuples
    value_vars = list(date_median_pairs)

    # Melt the DataFrame to a long format
    df_long = pd.melt(df_values, id_vars=['Suburb'], value_vars=value_vars, 
                      var_name='Date_Metric', value_name='Rent')

    # Split the 'Date_Metric' into 'Date' and 'Metric'
    df_long[['Date', 'Metric']] = df_long['Date_Metric'].apply(pd.Series)

    # Filter out only 'Median' values and rename the columns appropriately
    df_long = df_long[df_long['Metric'] == 'Median'].drop('Metric', axis=1)

    # Convert 'Date' to a datetime format
    df_long['Date'] = pd.to_datetime(df_long['Date'], format='%b %Y', errors='coerce')

    # Add a column for the property type (from the sheet name)
    df_long['Property_Type'] = sheet_name

    # Append the reshaped DataFrame to the list
    df_list.append(df_long)

    print(f"Processed data for sheet: {sheet_name}")
    print(df_long.head())  # Print the processed data to check the output

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(df_list, ignore_index=True)

# Display the combined DataFrame
print("Combined DataFrame:")
print(combined_df.head())

Processing sheet: 1 bedroom flat
Columns after insertion: ['Suburb', ('Mar 2000', 'Median'), ('Jun 2000', 'Median'), ('Sep 2000', 'Median'), ('Dec 2000', 'Median'), ('Mar 2001', 'Median'), ('Jun 2001', 'Median'), ('Sep 2001', 'Median'), ('Dec 2001', 'Median'), ('Mar 2002', 'Median'), ('Jun 2002', 'Median'), ('Sep 2002', 'Median'), ('Dec 2003', 'Median'), ('Mar 2003', 'Median'), ('Jun 2003', 'Median'), ('Sep 2003', 'Median'), ('Dec 2003', 'Median'), ('Mar 2004', 'Median'), ('Jun 2004', 'Median'), ('Sep 2004', 'Median'), ('Dec 2004', 'Median'), ('Mar 2005', 'Median'), ('Jun 2005', 'Median'), ('Sep 2005', 'Median'), ('Dec 2005', 'Median'), ('Mar 2006', 'Median'), ('Jun 2006', 'Median'), ('Sep 2006', 'Median'), ('Dec 2006', 'Median'), ('Mar 2007', 'Median'), ('Jun 2007', 'Median'), ('Sep 2007', 'Median'), ('Dec 2007', 'Median'), ('Mar 2008', 'Median'), ('Jun 2008', 'Median'), ('Sep 2008', 'Median'), ('Dec 2008', 'Median'), ('Mar 2009', 'Median'), ('Jun 2009', 'Median'), ('Sep 2009', 'Media