----

### Import Libraries and Data

In [1]:
import pandas as pd
import os

In [2]:
# Rental history data: one CSV per property type, plus a combined
# 'all_properties' file, read from the curated rental_history folder.
one_bed_flat = pd.read_csv('../data/curated/rental_history/one_bed_flat.csv')
two_bed_flat = pd.read_csv('../data/curated/rental_history/two_bed_flat.csv')
three_bed_flat = pd.read_csv('../data/curated/rental_history/three_bed_flat.csv')
two_bed_house = pd.read_csv('../data/curated/rental_history/two_bed_house.csv')
three_bed_house = pd.read_csv('../data/curated/rental_history/three_bed_house.csv')
four_bed_house = pd.read_csv('../data/curated/rental_history/four_bed_house.csv')
all_properties = pd.read_csv('../data/curated/rental_history/all_properties.csv')

# Domain rental data: same property-type breakdown as the rental history
# files, read from the curated domain folder.
domain_one_bed_flat = pd.read_csv('../data/curated/domain/domain_one_bed_flat_rent.csv')
domain_two_bed_flat = pd.read_csv('../data/curated/domain/domain_two_bed_flat_rent.csv')
domain_three_bed_flat = pd.read_csv('../data/curated/domain/domain_three_bed_flat_rent.csv')
domain_two_bed_house = pd.read_csv('../data/curated/domain/domain_two_bed_house_rent.csv')
domain_three_bed_house = pd.read_csv('../data/curated/domain/domain_three_bed_house_rent.csv')
domain_four_bed_house = pd.read_csv('../data/curated/domain/domain_four_bed_house_rent.csv')
domain_all_properties = pd.read_csv('../data/curated/domain/domain_all_properties_rent.csv')

# Other engineered feature sets. These are shared by every property type
# when the feature sets are combined further down.
crimes = pd.read_csv('../data/curated/crimes.csv')
population = pd.read_csv('../data/curated/population.csv')
education = pd.read_csv('../data/curated/education.csv')
urban_landmarks = pd.read_csv('../data/curated/urban_landmarks.csv')
pt_distances = pd.read_csv('../data/curated/suburb_transport_distances.csv')
income = pd.read_csv('../data/curated/income.csv')

# Livability index data: one CSV per property type, matching the rental
# history breakdown above.
livability_one_bed_flat = pd.read_csv('../data/curated/livability/livability_one_bed_flat.csv')
livability_two_bed_flat = pd.read_csv('../data/curated/livability/livability_two_bed_flat.csv')
livability_three_bed_flat = pd.read_csv('../data/curated/livability/livability_three_bed_flat.csv')
livability_two_bed_house = pd.read_csv('../data/curated/livability/livability_two_bed_house.csv')
livability_three_bed_house = pd.read_csv('../data/curated/livability/livability_three_bed_house.csv')
livability_four_bed_house = pd.read_csv('../data/curated/livability/livability_four_bed_house.csv')
livability_all_properties = pd.read_csv('../data/curated/livability/livability_all_properties.csv')

### Formatting Rental Dataframes

In [3]:
def clean_domain_df(df, year=2024):
    """
    Standardise a domain dataframe so it lines up with the rental
    history dataframes.

    The function drops every column whose name starts with 'Unnamed:',
    renames 'median_rent' to 'sep_median' when that column is present,
    sets a 'year' column to `year` on every row and moves 'year' to be
    the second column.

    Parameters
    ----------
    df : pandas.DataFrame
        Domain dataframe to clean. It is not modified.
    year : int, default 2024
        Value written to the 'year' column of every row.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned dataframe.
    """

    # Take an explicit copy of the column selection. Assigning a new column
    # straight onto the result of .loc can raise SettingWithCopyWarning.
    cleaned = df.loc[:, ~df.columns.str.contains('^Unnamed:')].copy()

    # rename() ignores missing labels, so no membership check is needed
    cleaned = cleaned.rename(columns={'median_rent': 'sep_median'})

    # Add (or overwrite) the 'year' column
    cleaned['year'] = year

    # Reorder columns to make 'year' the second column
    other_cols = [col for col in cleaned.columns if col != 'year']
    return cleaned[other_cols[:1] + ['year'] + other_cols[1:]]

# Run every domain dataframe through clean_domain_df and rebind each name
# to its cleaned version (results come back in the same order as the inputs)
(
    domain_one_bed_flat,
    domain_two_bed_flat,
    domain_three_bed_flat,
    domain_two_bed_house,
    domain_three_bed_house,
    domain_four_bed_house,
    domain_all_properties,
) = map(
    clean_domain_df,
    (
        domain_one_bed_flat,
        domain_two_bed_flat,
        domain_three_bed_flat,
        domain_two_bed_house,
        domain_three_bed_house,
        domain_four_bed_house,
        domain_all_properties,
    ),
)


Impute the September 2024 median rent from the scraped Domain properties into the rental history dataframes.

In [4]:
def impute_sep_2024_rental_data(rental_history_df, domain_df, year=2024, min_records=9):
    """
    Impute the domain September median rent into a rental history
    dataframe and strip the rent columns from the domain dataframe.

    For every row of `rental_history_df` whose 'year' equals `year`,
    'sep_median' is overwritten with the 'sep_median' of the same suburb
    in `domain_df`, when the domain value is not missing. Afterwards only
    suburbs with at least `min_records` rows are kept.

    Parameters
    ----------
    rental_history_df : pandas.DataFrame
        Must contain 'suburb', 'year' and 'sep_median'. Not modified.
    domain_df : pandas.DataFrame
        Must contain 'suburb' and 'sep_median'. Not modified.
    year : int, default 2024
        Year of the rental history rows that receive the domain value.
    min_records : int, default 9
        Minimum number of rows a suburb needs to be kept.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The imputed and filtered rental history dataframe, and the domain
        dataframe without 'year', 'sep_median' and 'num_properties'.
    """

    # Only the domain median is needed for imputation. Rename it up front so
    # the merge does not depend on suffixes being triggered by name clashes.
    # NOTE(review): a suburb listed more than once in domain_df duplicates its
    # rental history rows here - confirm domain_df has one row per suburb.
    domain_medians = domain_df[['suburb', 'sep_median']].rename(
        columns={'sep_median': 'sep_median_domain'}
    )

    # Left merge on 'suburb' keeps every year from rental_history_df
    merged_df = pd.merge(rental_history_df, domain_medians, on='suburb', how='left')

    # Replace sep_median only for the target year and only where domain has a value
    condition = (merged_df['year'] == year) & merged_df['sep_median_domain'].notna()
    merged_df.loc[condition, 'sep_median'] = merged_df.loc[condition, 'sep_median_domain']

    # Drop the helper column used for imputation
    merged_df = merged_df.drop(columns='sep_median_domain')

    # Keep only the suburbs that appear `min_records` or more times
    suburb_counts = merged_df['suburb'].value_counts()
    suburbs_to_keep = suburb_counts[suburb_counts >= min_records].index
    merged_df = merged_df[merged_df['suburb'].isin(suburbs_to_keep)]

    # Drop the year, rent and property-count columns from the domain dataframe
    domain_df = domain_df.drop(columns=['year', 'sep_median', 'num_properties'], errors='ignore')

    return merged_df, domain_df

# Impute each rental history dataframe from its matching domain dataframe.
# Every call returns a (rental_history, domain) pair, unpacked back onto
# the original names in the same order as the inputs.
(
    (one_bed_flat, domain_one_bed_flat),
    (two_bed_flat, domain_two_bed_flat),
    (three_bed_flat, domain_three_bed_flat),
    (two_bed_house, domain_two_bed_house),
    (three_bed_house, domain_three_bed_house),
    (four_bed_house, domain_four_bed_house),
    (all_properties, domain_all_properties),
) = (
    impute_sep_2024_rental_data(history_df, scraped_df)
    for history_df, scraped_df in (
        (one_bed_flat, domain_one_bed_flat),
        (two_bed_flat, domain_two_bed_flat),
        (three_bed_flat, domain_three_bed_flat),
        (two_bed_house, domain_two_bed_house),
        (three_bed_house, domain_three_bed_house),
        (four_bed_house, domain_four_bed_house),
        (all_properties, domain_all_properties),
    )
)

### Combining All Feature Sets

In [5]:
def _merge_feature_sets(rental_df, livability_df, domain_df):
    """
    Combine one property type's rental history with every feature set.

    The merge order is fixed: crimes, income and population are outer-merged
    on suburb and year; livability is inner-merged on suburb and year;
    education, urban landmarks, public transport distances and the domain
    dataframe are inner-merged on suburb. The module-level dataframes
    crimes, income, population, education, urban_landmarks and pt_distances
    are read directly, since they are shared by every property type.

    Parameters
    ----------
    rental_df : pandas.DataFrame
        Rental history dataframe for the property type.
    livability_df : pandas.DataFrame
        Livability dataframe for the same property type.
    domain_df : pandas.DataFrame
        Domain dataframe for the same property type.

    Returns
    -------
    pandas.DataFrame
        The merged dataframe, restricted to suburbs that appear in
        population['sa2_name'], with duplicate rows removed.
    """
    merged = pd.merge(rental_df, crimes, on=['suburb', 'year'], how='outer')
    merged = pd.merge(merged, income, on=['suburb', 'year'], how='outer')
    merged = pd.merge(merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
    merged = pd.merge(merged, livability_df, on=['suburb', 'year'], how='inner')
    merged = pd.merge(merged, education, on='suburb', how='inner')
    merged = pd.merge(merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
    merged = pd.merge(merged, pt_distances, on='suburb', how='inner')
    merged = pd.merge(merged, domain_df, on='suburb', how='inner')

    # Retain only suburbs that also appear in the population dataframe
    merged = merged[merged['suburb'].isin(population['sa2_name'])]
    return merged.drop_duplicates()


# Build the merged feature set for each property type
one_bed_flat_merged = _merge_feature_sets(one_bed_flat, livability_one_bed_flat, domain_one_bed_flat)
two_bed_flat_merged = _merge_feature_sets(two_bed_flat, livability_two_bed_flat, domain_two_bed_flat)
three_bed_flat_merged = _merge_feature_sets(three_bed_flat, livability_three_bed_flat, domain_three_bed_flat)
two_bed_house_merged = _merge_feature_sets(two_bed_house, livability_two_bed_house, domain_two_bed_house)
three_bed_house_merged = _merge_feature_sets(three_bed_house, livability_three_bed_house, domain_three_bed_house)
four_bed_house_merged = _merge_feature_sets(four_bed_house, livability_four_bed_house, domain_four_bed_house)
all_properties_merged = _merge_feature_sets(all_properties, livability_all_properties, domain_all_properties)

In [6]:
# Drop all other suburb column names. Only keep the first suburb column 
def clean_suburb_cols(df):
    """
    Return a copy of a merged dataframe without the redundant
    suburb-identifier columns left behind by the merges.

    Dropped columns are any whose name contains 'Unnamed', plus
    'sa2_name' and 'gazetted_locality' when they are present.
    """
    # Labels of every column whose name contains 'Unnamed'
    unnamed_labels = df.columns[df.columns.str.contains('Unnamed')]

    # errors='ignore' skips 'sa2_name' / 'gazetted_locality' if they are absent
    redundant = [*unnamed_labels, 'sa2_name', 'gazetted_locality']
    return df.drop(columns=redundant, errors='ignore')

# Strip the redundant suburb-identifier columns from every merged dataframe
# (results come back in the same order as the inputs)
(
    one_bed_flat_merged,
    two_bed_flat_merged,
    three_bed_flat_merged,
    two_bed_house_merged,
    three_bed_house_merged,
    four_bed_house_merged,
    all_properties_merged,
) = map(
    clean_suburb_cols,
    (
        one_bed_flat_merged,
        two_bed_flat_merged,
        three_bed_flat_merged,
        two_bed_house_merged,
        three_bed_house_merged,
        four_bed_house_merged,
        all_properties_merged,
    ),
)
all_properties_merged.shape

(2364, 88)

In [7]:
def remove_nan_before_2025(df, median_columns):
    """
    Drop the rows that are dated before 2025 and are missing at least
    one median rent value.

    A row is removed only when both conditions hold: its 'year' is less
    than 2025 and any of `median_columns` is NaN. All other rows are
    returned unchanged.
    """

    is_before_2025 = df['year'] < 2025
    has_missing_median = df[median_columns].isna().any(axis=1)

    # De Morgan: not (before 2025 and missing) == (not before 2025) or (not missing)
    keep_row = ~is_before_2025 | ~has_missing_median
    return df[keep_row]

# Median rent columns that must all be populated for rows before 2025
median_columns = ['dec_median', 'jun_median', 'mar_median', 'sep_median']

# Filter every merged dataframe and rebind each name to the filtered result
(
    one_bed_flat_merged,
    two_bed_flat_merged,
    three_bed_flat_merged,
    two_bed_house_merged,
    three_bed_house_merged,
    four_bed_house_merged,
    all_properties_merged,
) = (
    remove_nan_before_2025(merged_frame, median_columns)
    for merged_frame in (
        one_bed_flat_merged,
        two_bed_flat_merged,
        three_bed_flat_merged,
        two_bed_house_merged,
        three_bed_house_merged,
        four_bed_house_merged,
        all_properties_merged,
    )
)
all_properties_merged.shape

(2364, 88)

### Save All Merged Dataframes for Visualisation Purposes

In [8]:
def save_merged_dataframes():
    """
    Save every merged dataframe as a CSV file under
    '../data/curated/merged_feature_set'.

    The seven module-level '*_merged' dataframes are each written to
    '<name>.csv' without the index. The output directory is created if
    it does not exist yet.
    """
    # Define the base path
    base_path = '../data/curated/merged_feature_set'

    # exist_ok=True creates the directory only when needed, without the
    # race between a separate existence check and the creation call
    os.makedirs(base_path, exist_ok=True)

    # File stem -> dataframe; each file is named after its variable
    merged_frames = {
        'one_bed_flat_merged': one_bed_flat_merged,
        'two_bed_flat_merged': two_bed_flat_merged,
        'three_bed_flat_merged': three_bed_flat_merged,
        'two_bed_house_merged': two_bed_house_merged,
        'three_bed_house_merged': three_bed_house_merged,
        'four_bed_house_merged': four_bed_house_merged,
        'all_properties_merged': all_properties_merged,
    }

    # Save each dataframe to a CSV file
    for name, frame in merged_frames.items():
        frame.to_csv(os.path.join(base_path, f'{name}.csv'), index=False)

# Write all seven merged dataframes to CSV under ../data/curated/merged_feature_set
save_merged_dataframes()