### Import Libraries & Read Data

In [10]:
import warnings
import pandas as pd
import numpy as np
import os

# Silence every warning category for the rest of the run
warnings.filterwarnings('ignore')

# Flats: 1-, 2- and 3-bedroom rent history tables
one_bed_flat, two_bed_flat, three_bed_flat = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/landing/rent_history_1 bedroom flat.csv',
        '../data/landing/rent_history_2 bedroom flat.csv',
        '../data/landing/rent_history_3 bedroom flat.csv',
    )
)

# Houses: 2-, 3- and 4-bedroom rent history tables
two_bed_house, three_bed_house, four_bed_house = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/landing/rent_history_2 bedroom house.csv',
        '../data/landing/rent_history_3 bedroom house.csv',
        '../data/landing/rent_history_4 bedroom house.csv',
    )
)

# Combined table covering all property types
all_properties = pd.read_csv('../data/landing/rent_history_All properties.csv')

### Format Column Names Correctly

In [11]:
def format_df(df):
    """
    Turn a raw rent-history table into a frame with flat, snake_case columns.

    The first two rows of ``df`` are treated as a two-level header: row 0
    holds a label that is only present on the first column of each group
    (the original author notes the data came from an xlsx export), row 1
    holds the per-column label. Both are combined into names such as
    ``mar_2000_count``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table whose first column is the region and whose second column
        is the suburb. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the two header rows (original index labels are
        kept), with the first two columns named ``vic_region`` and
        ``suburb`` and the region forward-filled down the rows.
    """
    # Carry each top-level label rightwards so every column in a group gets
    # it. The first column is left out of the fill, as in the raw layout.
    top_row = df.iloc[0].copy()
    top_row.iloc[1:] = top_row.iloc[1:].ffill()
    bottom_row = df.iloc[1]

    # Combine both header rows and standardise to lower snake_case
    names = [
        f"{first} {second}".strip().lower().replace(' ', '_')
        for first, second in zip(top_row, bottom_row)
    ]

    # Name the identifier columns by position: renaming by label would hit
    # both columns at once when their combined header text is identical.
    names[0] = 'vic_region'
    names[1] = 'suburb'

    # Drop the two header rows; copy so the caller's frame stays untouched
    body = df.iloc[2:].copy()
    body.columns = names

    # The region is only written on the first row of each group
    body['vic_region'] = body['vic_region'].ffill()

    return body

# Run the header clean-up over every table: flats, then houses, then the
# combined table
one_bed_flat, two_bed_flat, three_bed_flat = (
    format_df(frame) for frame in (one_bed_flat, two_bed_flat, three_bed_flat)
)

two_bed_house, three_bed_house, four_bed_house = (
    format_df(frame) for frame in (two_bed_house, three_bed_house, four_bed_house)
)

all_properties = format_df(all_properties)
all_properties

Unnamed: 0,vic_region,suburb,mar_2000_count,mar_2000_median,jun_2000_count,jun_2000_median,sep_2000_count,sep_2000_median,dec_2000_count,dec_2000_median,...,mar_2023_count,mar_2023_median,jun_2023_count,jun_2023_median,sep_2023_count,sep_2023_median,dec_2023_count,dec_2023_median,mar_2024_count,mar_2024_median
2,Inner Melbourne,Albert Park-Middle Park-West St Kilda,1143,260,1134,260,1177,270,1178,275,...,796,545,740,550,730,600,720,600,671,650
3,Inner Melbourne,Armadale,733,200,737,200,738,205,739,210,...,757,490,687,500,639,525,594,560,566,560
4,Inner Melbourne,Carlton North,864,260,814,260,799,265,736,270,...,497,620,495,630,467,650,418,670,384,680
5,Inner Melbourne,Carlton-Parkville,1303,251,1278,260,1280,260,1301,260,...,2953,500,2755,530,2687,550,2662,550,2543,570
6,Inner Melbourne,CBD-St Kilda Rd,2132,320,2264,320,2358,320,2361,320,...,13568,550,13505,580,13552,600,13564,620,13582,640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,Other Regional Centres,Wanagaratta,705,125,671,125,631,130,623,130,...,535,380,555,390,565,390,593,395,580,400
157,Other Regional Centres,Warragul,385,130,367,135,382,135,366,135,...,507,440,542,450,558,450,543,460,541,470
158,Other Regional Centres,Warrnambool,1266,130,1229,135,1204,135,1135,135,...,881,420,861,430,846,450,844,460,840,460
159,Other Regional Centres,Wodonga,1446,145,1439,145,1468,150,1449,150,...,1205,410,1187,420,1164,420,1155,430,1139,450


### Format Suburb Column Correctly

In [12]:
def format_suburb_col(df, col='suburb', separator='-'):
    """
    Give every suburb its own row.

    A value in ``col`` such as ``"Albert Park-Middle Park"`` is split on
    ``separator`` and the row is repeated once per piece, with all other
    columns copied unchanged. Each piece is stripped of surrounding
    whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one (possibly combined) suburb name per row. It is not
        modified.
    col : str
        Name of the column holding the suburb names.
    separator : str
        Literal text that separates combined suburb names.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns. Rows produced from one input row
        share that row's index label. A missing suburb value is kept as a
        single row with a missing suburb.
    """
    # regex=False keeps the separator literal, like str.split on a plain string
    pieces = df[col].str.split(separator, regex=False)

    # explode repeats the other columns once per list element
    expanded = df.assign(**{col: pieces}).explode(col)
    expanded[col] = expanded[col].str.strip()

    return expanded

In [13]:
# Split combined suburb names into one row per suburb in every table
one_bed_flat, two_bed_flat, three_bed_flat = (
    format_suburb_col(frame)
    for frame in (one_bed_flat, two_bed_flat, three_bed_flat)
)

two_bed_house, three_bed_house, four_bed_house = (
    format_suburb_col(frame)
    for frame in (two_bed_house, three_bed_house, four_bed_house)
)

all_properties = format_suburb_col(all_properties)

In [14]:
def lowercase_string_cols(df):
    """
    Lowercase the 'vic_region' and 'suburb' columns of a dataframe.

    A column is only touched when it exists and has object dtype. The
    dataframe is changed in place and the same object is returned.
    """
    for name in ('vic_region', 'suburb'):
        # Skip columns that are absent
        if name not in df.columns:
            continue
        # Skip columns that are not stored as object dtype
        if df[name].dtype != 'object':
            continue
        df[name] = df[name].str.lower()

    return df

# Lowercase the region and suburb text in every table
one_bed_flat, two_bed_flat, three_bed_flat = (
    lowercase_string_cols(frame)
    for frame in (one_bed_flat, two_bed_flat, three_bed_flat)
)

two_bed_house, three_bed_house, four_bed_house = (
    lowercase_string_cols(frame)
    for frame in (two_bed_house, three_bed_house, four_bed_house)
)

all_properties = lowercase_string_cols(all_properties)

all_properties

Unnamed: 0,vic_region,suburb,mar_2000_count,mar_2000_median,jun_2000_count,jun_2000_median,sep_2000_count,sep_2000_median,dec_2000_count,dec_2000_median,...,mar_2023_count,mar_2023_median,jun_2023_count,jun_2023_median,sep_2023_count,sep_2023_median,dec_2023_count,dec_2023_median,mar_2024_count,mar_2024_median
2,inner melbourne,albert park,1143,260,1134,260,1177,270,1178,275,...,796,545,740,550,730,600,720,600,671,650
2,inner melbourne,middle park,1143,260,1134,260,1177,270,1178,275,...,796,545,740,550,730,600,720,600,671,650
2,inner melbourne,west st kilda,1143,260,1134,260,1177,270,1178,275,...,796,545,740,550,730,600,720,600,671,650
3,inner melbourne,armadale,733,200,737,200,738,205,739,210,...,757,490,687,500,639,525,594,560,566,560
4,inner melbourne,carlton north,864,260,814,260,799,265,736,270,...,497,620,495,630,467,650,418,670,384,680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,other regional centres,wanagaratta,705,125,671,125,631,130,623,130,...,535,380,555,390,565,390,593,395,580,400
157,other regional centres,warragul,385,130,367,135,382,135,366,135,...,507,440,542,450,558,450,543,460,541,470
158,other regional centres,warrnambool,1266,130,1229,135,1204,135,1135,135,...,881,420,861,430,846,450,844,460,840,460
159,other regional centres,wodonga,1446,145,1439,145,1468,150,1449,150,...,1205,410,1187,420,1164,420,1155,430,1139,450


### Handling Missing Values

In [15]:
# List the distinct regions present in the combined table
pd.unique(all_properties['vic_region'])

array(['inner melbourne', 'inner eastern melbourne', 'southern melbourne',
       'outer western melbourne', 'north western melbourne',
       'north eastern melbourne', 'outer eastern melbourne',
       'south eastern melbourne', 'mornington peninsula', 'geelong',
       'ballarat', 'bendigo', 'other regional centres'], dtype=object)

In [16]:
def null_imputer(df, region_col='vic_region', exclude_cols=('vic_region', 'suburb')):
    """
    Fill missing values with the median of the row's region.

    Cells equal to '-' are treated as missing. Every column that is not
    excluded is converted to numeric (unparseable values become missing),
    and each missing value is replaced by the median of that column over
    the rows sharing the same ``region_col`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean. It is not modified.
    region_col : str
        Column that defines the groups used for the medians. It is never
        converted to numeric, even if absent from ``exclude_cols``.
    exclude_cols : iterable of str
        Columns to leave out of the numeric conversion and imputation.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. Values stay missing when the
        whole region has no observation for a column, or when the row has
        no region. Observed values in such rows are kept.
    """
    # Fresh positional index so group-wise medians line up row by row even
    # when the incoming index labels repeat
    df = df.reset_index(drop=True)

    # '-' is the marker for "no data"
    df = df.replace('-', np.nan)

    # The grouping column must keep its labels, so it is always skipped
    skipped = set(exclude_cols) | {region_col}
    fill_cols = [col for col in df.columns if col not in skipped]
    if not fill_cols:
        return df

    # These columns must be numeric; anything unparseable becomes missing
    numeric = df[fill_cols].apply(pd.to_numeric, errors='coerce')

    # One grouped pass yields, for every cell, the median of its region
    region_medians = numeric.groupby(df[region_col]).transform('median')

    # Only missing cells are filled, so observed values are never overwritten
    df[fill_cols] = numeric.fillna(region_medians)

    return df

# Impute missing values in every table using the regional medians
one_bed_flat, two_bed_flat, three_bed_flat = (
    null_imputer(frame) for frame in (one_bed_flat, two_bed_flat, three_bed_flat)
)

two_bed_house, three_bed_house, four_bed_house = (
    null_imputer(frame) for frame in (two_bed_house, three_bed_house, four_bed_house)
)

all_properties = null_imputer(all_properties)

all_properties


Unnamed: 0,vic_region,suburb,mar_2000_count,mar_2000_median,jun_2000_count,jun_2000_median,sep_2000_count,sep_2000_median,dec_2000_count,dec_2000_median,...,mar_2023_count,mar_2023_median,jun_2023_count,jun_2023_median,sep_2023_count,sep_2023_median,dec_2023_count,dec_2023_median,mar_2024_count,mar_2024_median
0,inner melbourne,albert park,1143.0,260.0,1134,260,1177.0,270.0,1178.0,275.0,...,796,545,740,550,730,600,720,600,671,650
1,inner melbourne,middle park,1143.0,260.0,1134,260,1177.0,270.0,1178.0,275.0,...,796,545,740,550,730,600,720,600,671,650
2,inner melbourne,west st kilda,1143.0,260.0,1134,260,1177.0,270.0,1178.0,275.0,...,796,545,740,550,730,600,720,600,671,650
3,inner melbourne,armadale,733.0,200.0,737,200,738.0,205.0,739.0,210.0,...,757,490,687,500,639,525,594,560,566,560
4,inner melbourne,carlton north,864.0,260.0,814,260,799.0,265.0,736.0,270.0,...,497,620,495,630,467,650,418,670,384,680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,other regional centres,wanagaratta,705.0,125.0,671,125,631.0,130.0,623.0,130.0,...,535,380,555,390,565,390,593,395,580,400
224,other regional centres,warragul,385.0,130.0,367,135,382.0,135.0,366.0,135.0,...,507,440,542,450,558,450,543,460,541,470
225,other regional centres,warrnambool,1266.0,130.0,1229,135,1204.0,135.0,1135.0,135.0,...,881,420,861,430,846,450,844,460,840,460
226,other regional centres,wodonga,1446.0,145.0,1439,145,1468.0,150.0,1449.0,150.0,...,1205,410,1187,420,1164,420,1155,430,1139,450


### Save Feature Set

In [17]:
# Save df to a CSV file
def save_rental_data(df, file_path):
    """
    Write a dataframe to ``file_path`` as CSV, without the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to save.
    file_path : str
        Destination file. Missing parent directories are created.

    Returns
    -------
    None
    """
    # Retrieve the directory part of the destination
    directory = os.path.dirname(file_path)

    # A bare file name has no directory part, and os.makedirs('') would
    # raise. exist_ok avoids failing when the directory already exists.
    if directory:
        os.makedirs(directory, exist_ok=True)

    df.to_csv(file_path, index=False)

# Save all dataframes in the raw folder.
# The return value of save_rental_data is not assigned back: the function
# returns None, so rebinding would discard the dataframes.
save_rental_data(one_bed_flat, '../data/raw/rental_history/one_bed_flat.csv')
save_rental_data(two_bed_flat, '../data/raw/rental_history/two_bed_flat.csv')
save_rental_data(three_bed_flat, '../data/raw/rental_history/three_bed_flat.csv')

save_rental_data(two_bed_house, '../data/raw/rental_history/two_bed_house.csv')
save_rental_data(three_bed_house, '../data/raw/rental_history/three_bed_house.csv')
save_rental_data(four_bed_house, '../data/raw/rental_history/four_bed_house.csv')

save_rental_data(all_properties, '../data/raw/rental_history/all_properties.csv')

# Report success only after every file has been written
print("All Saved!")

All Saved!
