### Import Libraries & Read Data

In [77]:
import warnings
import pandas as pd
import numpy as np
import os

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation / chained-assignment
# warnings raised by the cells below - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Each read below loads one rent-history CSV from the landing folder into its
# own dataframe (one file per dwelling type, going by the file names).

# Flats
one_bed_flat = pd.read_csv('../data/landing/rent_history_1 bedroom flat.csv')
two_bed_flat = pd.read_csv('../data/landing/rent_history_2 bedroom flat.csv')
three_bed_flat = pd.read_csv('../data/landing/rent_history_3 bedroom flat.csv')

# Houses
two_bed_house = pd.read_csv('../data/landing/rent_history_2 bedroom house.csv')
three_bed_house = pd.read_csv('../data/landing/rent_history_3 bedroom house.csv')
four_bed_house = pd.read_csv('../data/landing/rent_history_4 bedroom house.csv')

# All properties
all_properties = pd.read_csv('../data/landing/rent_history_All properties.csv')

### Format Column Names Correctly

In [78]:
def format_df(df):
    """
    Preprocess a raw rent-history dataframe into consistently named columns.

    The dataframe (originally from an xlsx file) carries its real header in
    its first two rows: row 0 holds a period label that only appears above the
    first column of each group (e.g. 'Mar 2016'), and row 1 holds the measure
    name. The two rows are combined into snake_case column names, the first
    two columns are named 'vic_region' and 'suburb', and only the non-count
    columns for 2016-2024 are kept.

    Parameters:
    - df (pd.DataFrame): The raw dataframe. It is not modified.

    Returns:
    - pd.DataFrame: A new dataframe with 'vic_region', 'suburb' and the
      retained period columns. The original row labels are preserved.
    """

    # Duplicate each period label onto the blank cells to its right so it can
    # be combined with the entry below. The first column is left out of the
    # fill, and the caller's dataframe is never written to.
    first_row = df.iloc[0]
    period_row = pd.concat([first_row.iloc[:1], first_row.iloc[1:].ffill()])
    measure_row = df.iloc[1]

    # Combine both header rows and standardise the resulting names
    names = [
        f"{period} {measure}".strip().lower().replace(' ', '_')
        for period, measure in zip(period_row, measure_row)
    ]

    # Name the first two columns by position. Renaming by label would hit
    # both columns at once when their placeholder names are identical
    # (e.g. two blank header cells both become 'nan_nan').
    names[0] = 'vic_region'
    names[1] = 'suburb'

    # Drop the first two rows as they were used for column names
    body = df.iloc[2:].copy()
    body.columns = names

    # The region is only written on the first row of each group, so carry it
    # down to the rows beneath it
    body['vic_region'] = body['vic_region'].ffill()

    #--- column formatting

    # Keep columns that mention a year in the required range and are not
    # '_count' columns
    year_tokens = [str(year) for year in range(2016, 2025)]
    value_columns = [
        col for col in names[2:]
        if not col.endswith('_count')
        and any(token in col for token in year_tokens)
    ]

    # Copy so the assignment below works on an independent frame rather than
    # a slice of `body`
    filtered_df = body[['vic_region', 'suburb'] + value_columns].copy()

    # Change the cbd name to melbourne (exact, case-sensitive match only)
    filtered_df['suburb'] = filtered_df['suburb'].replace('cbd', 'melbourne')

    return filtered_df

# Clean the headers of every rental table, one dwelling group at a time
one_bed_flat, two_bed_flat, three_bed_flat = [
    format_df(flat) for flat in (one_bed_flat, two_bed_flat, three_bed_flat)
]
two_bed_house, three_bed_house, four_bed_house = [
    format_df(house) for house in (two_bed_house, three_bed_house, four_bed_house)
]
all_properties = format_df(all_properties)

# Show the combined table as the cell output
all_properties

Unnamed: 0,vic_region,suburb,mar_2016_median,jun_2016_median,sep_2016_median,dec_2016_median,mar_2017_median,jun_2017_median,sep_2017_median,dec_2017_median,...,dec_2021_median,mar_2022_median,jun_2022_median,sep_2022_median,dec_2022_median,mar_2023_median,jun_2023_median,sep_2023_median,dec_2023_median,mar_2024_median
2,Inner Melbourne,Albert Park-Middle Park-West St Kilda,520,500,520,520,520,531,530,530,...,495,500,515,500,525,545,550,600,600,650
3,Inner Melbourne,Armadale,400,400,400,410,420,440,450,470,...,420,430,450,450,460,490,500,525,560,560
4,Inner Melbourne,Carlton North,530,530,520,530,530,540,550,560,...,580,580,595,600,600,620,630,650,670,680
5,Inner Melbourne,Carlton-Parkville,450,450,440,450,450,450,460,460,...,370,380,400,425,450,500,530,550,550,570
6,Inner Melbourne,CBD-St Kilda Rd,450,460,465,475,480,490,500,500,...,355,375,400,450,480,550,580,600,620,640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,Other Regional Centres,Wanagaratta,260,260,260,260,260,265,270,270,...,360,370,380,380,380,380,390,390,395,400
157,Other Regional Centres,Warragul,290,290,295,300,300,300,310,310,...,390,400,400,420,430,440,450,450,460,470
158,Other Regional Centres,Warrnambool,290,290,285,290,290,290,290,290,...,380,400,400,420,420,420,430,450,460,460
159,Other Regional Centres,Wodonga,290,290,290,290,295,300,300,300,...,370,380,390,400,410,410,420,420,430,450


### Format Suburb Column Correctly

In [79]:
def format_suburb_col(df, col='suburb', separator='-'):
    """
    Give every suburb its own row by splitting combined suburb names.

    A value such as 'Albert Park-Middle Park' in `col` is split on `separator`
    and the row is repeated once per resulting suburb, with all other columns
    copied unchanged and surrounding whitespace removed from each name.

    Parameters:
    - df (pd.DataFrame): The input dataframe. It is not modified.
    - col (str): The column holding the (possibly combined) suburb names.
    - separator (str): The literal text that separates combined names.

    Returns:
    - pd.DataFrame: A new dataframe with one row per suburb. Repeated rows keep
      the row label of the row they came from, rows with a missing suburb are
      kept as they are, and an empty input keeps its columns.
    """

    expanded = df.copy()

    # Split on the literal separator (regex=False stops a multi-character
    # separator from being interpreted as a regular expression)
    expanded[col] = expanded[col].str.split(separator, regex=False)

    # One row per list element; the other columns are repeated alongside
    expanded = expanded.explode(col)

    # Remove the whitespace left around each name by the split
    expanded[col] = expanded[col].str.strip()

    return expanded

In [80]:
# Split combined suburb names into one row per suburb for every rental table
one_bed_flat, two_bed_flat, three_bed_flat = [
    format_suburb_col(flat) for flat in (one_bed_flat, two_bed_flat, three_bed_flat)
]
two_bed_house, three_bed_house, four_bed_house = [
    format_suburb_col(house)
    for house in (two_bed_house, three_bed_house, four_bed_house)
]
all_properties = format_suburb_col(all_properties)

In [81]:
def lowercase_string_cols(df):
    """
    Lowercase the 'vic_region' and 'suburb' columns of a dataframe.

    Parameters:
    - df (pd.DataFrame): The input dataframe. It is not modified.

    Returns:
    - pd.DataFrame: A copy of `df` in which the 'vic_region' and 'suburb'
      columns are lowercased. A column that is absent, or that does not hold
      text, is left untouched.
    """
    result = df.copy()

    for col in ('vic_region', 'suburb'):
        if col not in result.columns:
            continue

        values = result[col]
        # Accept both plain object columns and pandas' dedicated string dtype;
        # checking for 'object' alone would skip the latter
        if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            result[col] = values.str.lower()

    return result

# Lowercase the region and suburb text in every rental table
one_bed_flat, two_bed_flat, three_bed_flat = [
    lowercase_string_cols(flat)
    for flat in (one_bed_flat, two_bed_flat, three_bed_flat)
]
two_bed_house, three_bed_house, four_bed_house = [
    lowercase_string_cols(house)
    for house in (two_bed_house, three_bed_house, four_bed_house)
]
all_properties = lowercase_string_cols(all_properties)

# Show the combined table as the cell output
all_properties

Unnamed: 0,vic_region,suburb,mar_2016_median,jun_2016_median,sep_2016_median,dec_2016_median,mar_2017_median,jun_2017_median,sep_2017_median,dec_2017_median,...,dec_2021_median,mar_2022_median,jun_2022_median,sep_2022_median,dec_2022_median,mar_2023_median,jun_2023_median,sep_2023_median,dec_2023_median,mar_2024_median
2,inner melbourne,albert park,520,500,520,520,520,531,530,530,...,495,500,515,500,525,545,550,600,600,650
2,inner melbourne,middle park,520,500,520,520,520,531,530,530,...,495,500,515,500,525,545,550,600,600,650
2,inner melbourne,west st kilda,520,500,520,520,520,531,530,530,...,495,500,515,500,525,545,550,600,600,650
3,inner melbourne,armadale,400,400,400,410,420,440,450,470,...,420,430,450,450,460,490,500,525,560,560
4,inner melbourne,carlton north,530,530,520,530,530,540,550,560,...,580,580,595,600,600,620,630,650,670,680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,other regional centres,wanagaratta,260,260,260,260,260,265,270,270,...,360,370,380,380,380,380,390,390,395,400
157,other regional centres,warragul,290,290,295,300,300,300,310,310,...,390,400,400,420,430,440,450,450,460,470
158,other regional centres,warrnambool,290,290,285,290,290,290,290,290,...,380,400,400,420,420,420,430,450,460,460
159,other regional centres,wodonga,290,290,290,290,295,300,300,300,...,370,380,390,400,410,410,420,420,430,450


In [82]:
def format_suburb_names(df, column_name="suburb"):
    """
    Rename suburbs to their corrected names and drop unwanted entries.

    Parameters:
    - df (pd.DataFrame): The input DataFrame containing suburb names. It is
      not modified.
    - column_name (str): The column name that contains the suburb names.
      Default is 'suburb'.

    Returns:
    - pd.DataFrame: A new DataFrame in which the suburb names listed in the
      rename table are replaced (exact matches only) and rows whose suburb is
      in the unwanted list are removed.
    """

    # Dictionary for renaming the suburbs
    rename_dict = {
        "west st kilda": "st kilda west",
        "cbd": "melbourne",
        "east st kilda": "st kilda east",
        "east hawthorn": "hawthorn east",
        "east brunswick": "brunswick east",
        "west brunswick": "brunswick west",
        "mt eliza": "mount eliza",
        "mt martha": "mount martha",
        "newcombe": "newcomb",
        "bendigo east": "east bendigo",
        "wanagaratta": "wangaratta",
        "ballarat": "ballarat central",
        "st kilda rd": "st kilda"
    }

    # List of values to be removed
    unwanted_values = ["yarra ranges"]

    # Apply the renaming on a new frame so the caller's dataframe is not left
    # renamed-but-unfiltered
    renamed = df.assign(**{column_name: df[column_name].replace(rename_dict)})

    # Remove rows that are in unwanted_values. The mask is applied by
    # position because the row labels may repeat after suburbs were split.
    keep = ~renamed[column_name].isin(unwanted_values)

    return renamed.loc[keep.to_numpy()].copy()

# Correct the suburb names in every rental table
one_bed_flat, two_bed_flat, three_bed_flat = [
    format_suburb_names(flat)
    for flat in (one_bed_flat, two_bed_flat, three_bed_flat)
]
two_bed_house, three_bed_house, four_bed_house = [
    format_suburb_names(house)
    for house in (two_bed_house, three_bed_house, four_bed_house)
]
all_properties = format_suburb_names(all_properties)

# Show the combined table as the cell output
all_properties

Unnamed: 0,vic_region,suburb,mar_2016_median,jun_2016_median,sep_2016_median,dec_2016_median,mar_2017_median,jun_2017_median,sep_2017_median,dec_2017_median,...,dec_2021_median,mar_2022_median,jun_2022_median,sep_2022_median,dec_2022_median,mar_2023_median,jun_2023_median,sep_2023_median,dec_2023_median,mar_2024_median
2,inner melbourne,albert park,520,500,520,520,520,531,530,530,...,495,500,515,500,525,545,550,600,600,650
2,inner melbourne,middle park,520,500,520,520,520,531,530,530,...,495,500,515,500,525,545,550,600,600,650
2,inner melbourne,st kilda west,520,500,520,520,520,531,530,530,...,495,500,515,500,525,545,550,600,600,650
3,inner melbourne,armadale,400,400,400,410,420,440,450,470,...,420,430,450,450,460,490,500,525,560,560
4,inner melbourne,carlton north,530,530,520,530,530,540,550,560,...,580,580,595,600,600,620,630,650,670,680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,other regional centres,wangaratta,260,260,260,260,260,265,270,270,...,360,370,380,380,380,380,390,390,395,400
157,other regional centres,warragul,290,290,295,300,300,300,310,310,...,390,400,400,420,430,440,450,450,460,470
158,other regional centres,warrnambool,290,290,285,290,290,290,290,290,...,380,400,400,420,420,420,430,450,460,460
159,other regional centres,wodonga,290,290,290,290,295,300,300,300,...,370,380,390,400,410,410,420,420,430,450


### Creating a Year Column

In [83]:
# TODO: review create_year_column below (originally flagged "EDIT THIS")


def create_year_column(df):
    """
    Reshape period columns into one row per suburb and year.

    Columns named '{month}_{year}_...' (e.g. 'mar_2016_median') are turned into
    one row per ('suburb', 'year') with a '{month}_median' column per month.
    Values are converted to numbers, duplicate suburb/year entries are
    averaged by the pivot, missing months are filled with the median of the
    other months in the same row, and rows that still contain a missing value
    are dropped.

    Parameters:
    - df (pd.DataFrame): A dataframe with a 'suburb' column and period columns.
      It is not modified.

    Returns:
    - pd.DataFrame: Columns 'suburb', 'year' (as text) and one '{month}_median'
      column per month, in alphabetical month order.
    """
    # Only melt real period columns, i.e. names whose second '_'-separated
    # part is a year. This keeps descriptive columns such as 'vic_region' out
    # of the reshape instead of relying on them being discarded later.
    period_cols = []
    for name in df.columns:
        parts = str(name).split('_')
        if name != 'suburb' and len(parts) > 1 and parts[1].isdigit():
            period_cols.append(name)

    # Melt the dataframe to one row per suburb and period column
    df_melted = pd.melt(df, id_vars=['suburb'], value_vars=period_cols,
                        var_name='month_year', value_name='median_value')

    # Extract the month and year from the 'month_year' column
    name_parts = df_melted['month_year'].str.split('_')
    df_melted['month'] = name_parts.str[0]
    df_melted['year'] = name_parts.str[1]

    # Convert median_value to numeric; anything unparseable becomes NaN
    df_melted['median_value'] = pd.to_numeric(df_melted['median_value'], errors='coerce')

    # Pivot the dataframe to get one column per month
    df_transformed = df_melted.pivot_table(index=['suburb', 'year'],
                                           columns='month',
                                           values='median_value').reset_index()

    # Rename columns to have the format {month}_median
    df_transformed.columns = ['suburb', 'year'] + [
        f"{col}_median" for col in df_transformed.columns if col not in ['suburb', 'year']
    ]

    # Identify columns ending in _median
    median_cols = [col for col in df_transformed.columns if col.endswith('_median')]

    # Fill NaN values with the median of the same row's month columns
    row_median = df_transformed[median_cols].median(axis=1)
    df_transformed[median_cols] = df_transformed[median_cols].apply(
        lambda column: column.fillna(row_median)
    )

    # Remove any row that still has a missing value after the fill
    df_transformed = df_transformed.dropna()

    return df_transformed

# Reshape every rental table to one row per suburb and year
one_bed_flat, two_bed_flat, three_bed_flat = [
    create_year_column(flat)
    for flat in (one_bed_flat, two_bed_flat, three_bed_flat)
]
two_bed_house, three_bed_house, four_bed_house = [
    create_year_column(house)
    for house in (two_bed_house, three_bed_house, four_bed_house)
]
all_properties = create_year_column(all_properties)


### Save Feature Set

In [84]:
# Save df to a CSV file
def save_rental_data(df, file_path):
    """
    Save a dataframe as a CSV file, creating the target folder if needed.

    Parameters:
    - df (pd.DataFrame): The dataframe to save. The index is not written.
    - file_path (str): Where to write the CSV file.

    Returns:
    - None
    """

    # Retrieve the directory
    directory = os.path.dirname(file_path)

    # Create the directory if it does not exist. A bare file name has no
    # directory part (''), which os.makedirs would reject, so skip it then.
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Save the dataframe to CSV
    df.to_csv(file_path, index=False)

# Save all dataframes in the raw folder.
# save_rental_data returns None, so its result is deliberately not assigned
# back: doing so would overwrite every dataframe with None.
save_rental_data(one_bed_flat, '../data/raw/rental_history/one_bed_flat.csv')
save_rental_data(two_bed_flat, '../data/raw/rental_history/two_bed_flat.csv')
save_rental_data(three_bed_flat, '../data/raw/rental_history/three_bed_flat.csv')

save_rental_data(two_bed_house, '../data/raw/rental_history/two_bed_house.csv')
save_rental_data(three_bed_house, '../data/raw/rental_history/three_bed_house.csv')
save_rental_data(four_bed_house, '../data/raw/rental_history/four_bed_house.csv')

save_rental_data(all_properties, '../data/raw/rental_history/all_properties.csv')

# Report success only once every file has actually been written
print("All Saved!")

All Saved!
