### Import Libraries and Data

In [86]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [87]:
# Rental history data: one raw CSV per dwelling type, plus a combined
# "all properties" file. Paths are relative to the notebook's directory.
one_bed_flat = pd.read_csv('../data/raw/rental_history/one_bed_flat.csv')
two_bed_flat = pd.read_csv('../data/raw/rental_history/two_bed_flat.csv')
three_bed_flat = pd.read_csv('../data/raw/rental_history/three_bed_flat.csv')
two_bed_house = pd.read_csv('../data/raw/rental_history/two_bed_house.csv')
three_bed_house = pd.read_csv('../data/raw/rental_history/three_bed_house.csv')
four_bed_house = pd.read_csv('../data/raw/rental_history/four_bed_house.csv')
all_properties = pd.read_csv('../data/raw/rental_history/all_properties.csv')

# Domain rental data: curated CSVs, one per dwelling type, mirroring the
# rental-history set above.
domain_one_bed_flat = pd.read_csv('../data/curated/domain_one_bed_flat_rent.csv')
domain_two_bed_flat = pd.read_csv('../data/curated/domain_two_bed_flat_rent.csv')
domain_three_bed_flat = pd.read_csv('../data/curated/domain_three_bed_flat_rent.csv')
domain_two_bed_house = pd.read_csv('../data/curated/domain_two_bed_house_rent.csv')
domain_three_bed_house = pd.read_csv('../data/curated/domain_three_bed_house_rent.csv')
# NOTE(review): this is the only Domain file name without the '_rent' suffix.
# Confirm it is the intended file and not a stale/differently-shaped export.
domain_four_bed_house = pd.read_csv('../data/curated/domain_four_bed_house.csv')
domain_all_properties = pd.read_csv('../data/curated/domain_all_properties_rent.csv')

# Other engineered feature sets, joined onto the rental data further down.
crimes = pd.read_csv('../data/curated/crimes.csv')
population = pd.read_csv('../data/curated/final_population.csv')
education = pd.read_csv('../data/curated/education_df.csv')
# Unlike the other feature sets, this one is read from the raw directory.
urban_landmarks = pd.read_csv('../data/raw/urban_landmarks_features.csv')
pt_distances = pd.read_csv('../data/curated/suburb_transport_distances.csv')
income_2016 = pd.read_csv('../data/curated/income_2016.csv')
income_2021 = pd.read_csv('../data/curated/income_2021.csv')

### Combining All Feature Sets

In [79]:
def _merge_feature_sets(rental_history, domain_rent):
    """
    Join one dwelling type's rental data with every suburb-level feature set.

    The rental history frame is inner-joined, in order, with the matching
    Domain rent frame and then with the module-level ``crimes``,
    ``population``, ``education``, ``urban_landmarks``, ``pt_distances``,
    ``income_2016`` and ``income_2021`` frames. Because every join is an
    inner join, only suburbs present in *all* of the tables survive.

    Routing every dwelling type through this single function guarantees the
    seven merged frames are built identically. The previous hand-copied
    version built ``three_bed_house_merged`` from ``two_bed_house_merged`` in
    its last two steps, discarding the three-bed data and the 2016 income join.

    Parameters:
        rental_history: rental history frame with a 'suburb' column.
        domain_rent: Domain rent frame with a 'suburb' column.

    Returns:
        The merged DataFrame. The right-hand key columns 'sa2_name' and
        'gazetted_locality' are still present in the result.
    """
    merged = pd.merge(rental_history, domain_rent, on='suburb', how='inner')
    merged = pd.merge(merged, crimes, on='suburb', how='inner')
    # population is keyed by 'sa2_name' rather than 'suburb'
    merged = pd.merge(merged, population, left_on='suburb', right_on='sa2_name', how='inner')
    merged = pd.merge(merged, education, on='suburb', how='inner')
    # urban_landmarks is keyed by 'gazetted_locality' rather than 'suburb'
    merged = pd.merge(merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
    merged = pd.merge(merged, pt_distances, on='suburb', how='inner')
    merged = pd.merge(merged, income_2016, on='suburb', how='inner')
    merged = pd.merge(merged, income_2021, on='suburb', how='inner')
    return merged


# Flats
one_bed_merged = _merge_feature_sets(one_bed_flat, domain_one_bed_flat)
two_bed_merged = _merge_feature_sets(two_bed_flat, domain_two_bed_flat)
three_bed_merged = _merge_feature_sets(three_bed_flat, domain_three_bed_flat)

# Houses
two_bed_house_merged = _merge_feature_sets(two_bed_house, domain_two_bed_house)
three_bed_house_merged = _merge_feature_sets(three_bed_house, domain_three_bed_house)
four_bed_house_merged = _merge_feature_sets(four_bed_house, domain_four_bed_house)

# All dwelling types combined
all_properties_merged = _merge_feature_sets(all_properties, domain_all_properties)

In [80]:
# Tidy a merged frame: drop leftover 'Unnamed' columns and redundant join keys
def clean_merged_df(df):
    """
    Return a copy of ``df`` without merge leftovers.

    Two kinds of columns are removed:
      * any column whose name contains 'Unnamed'
      * the right-hand join keys 'sa2_name' and 'gazetted_locality', when
        present (the 'suburb' column is left in place).
    """
    unnamed_mask = df.columns.str.contains('Unnamed')
    trimmed = df.loc[:, ~unnamed_mask]

    # Only drop the join keys that actually exist, so this never raises
    redundant_keys = [
        key for key in ('sa2_name', 'gazetted_locality') if key in trimmed.columns
    ]
    return trimmed.drop(columns=redundant_keys)

# Run every merged frame through clean_merged_df and rebind the results
(
    one_bed_merged,
    two_bed_merged,
    three_bed_merged,
    two_bed_house_merged,
    three_bed_house_merged,
    four_bed_house_merged,
    all_properties_merged,
) = [
    clean_merged_df(frame)
    for frame in (
        one_bed_merged,
        two_bed_merged,
        three_bed_merged,
        two_bed_house_merged,
        three_bed_house_merged,
        four_bed_house_merged,
        all_properties_merged,
    )
]

### Create Train & Test Sets

In [82]:
def train_test_sets(df, is_crime_data=False, *, train_years=range(2016, 2022),
                    test_years=range(2022, 2025)):
    """
    Split a wide, one-row-per-suburb frame into train/test features and labels.

    Columns are assigned by name:
      * a column whose name contains one of ``train_years`` goes to the train
        set; one containing one of ``test_years`` goes to the test set;
      * a column whose name contains none of those years is treated as a
        general suburb feature and is added to BOTH sets;
      * the 'suburb' column itself is excluded;
      * within each set, columns whose name contains '_median' are the labels
        (y) and all remaining columns are the predictors (X).

    Caveats:
      * Matching is by substring, so a column for a year outside both windows
        (e.g. 2015) counts as a general feature and lands in both sets.
      * Only the '_median' substring marks a label; a name that merely starts
        with 'median_' is kept as a predictor.

    Parameters:
        df: merged DataFrame with string column names.
        is_crime_data: unused; retained so existing callers keep working.
        train_years: iterable of years for the train set (default 2016-2021).
        test_years: iterable of years for the test set (default 2022-2024).

    Returns:
        X_train, X_test, y_train, y_test as DataFrames (column subsets of
        ``df``, all rows retained).
    """
    train_tags = [str(year) for year in train_years]
    test_tags = [str(year) for year in test_years]
    known_tags = train_tags + test_tags

    def _mentions_any(col, tags):
        # True when the column name contains at least one of the year strings
        return any(tag in col for tag in tags)

    # Year-specific columns for each split
    train_columns = [col for col in df.columns if _mentions_any(col, train_tags)]
    test_columns = [col for col in df.columns if _mentions_any(col, test_tags)]

    # Columns tied to no known year are shared by both splits
    non_year_columns = [col for col in df.columns if not _mentions_any(col, known_tags)]

    # The suburb identifier is a key, not a feature
    if 'suburb' in non_year_columns:
        non_year_columns.remove('suburb')

    train_columns.extend(non_year_columns)
    test_columns.extend(non_year_columns)

    def _split(columns):
        # Separate label columns ('_median') from predictor columns
        features = [col for col in columns if '_median' not in col]
        labels = [col for col in columns if '_median' in col]
        return df[features], df[labels]

    X_train, y_train = _split(train_columns)
    X_test, y_test = _split(test_columns)

    return X_train, X_test, y_train, y_test


# Build X/y train and test frames for every dwelling type.
# train_test_sets returns (X_train, X_test, y_train, y_test) in that order.

# Flats
(X_train_one_bed, X_test_one_bed,
 y_train_one_bed, y_test_one_bed) = train_test_sets(one_bed_merged)
(X_train_two_bed, X_test_two_bed,
 y_train_two_bed, y_test_two_bed) = train_test_sets(two_bed_merged)
(X_train_three_bed, X_test_three_bed,
 y_train_three_bed, y_test_three_bed) = train_test_sets(three_bed_merged)

# Houses
(X_train_two_bed_house, X_test_two_bed_house,
 y_train_two_bed_house, y_test_two_bed_house) = train_test_sets(two_bed_house_merged)
(X_train_three_bed_house, X_test_three_bed_house,
 y_train_three_bed_house, y_test_three_bed_house) = train_test_sets(three_bed_house_merged)
(X_train_four_bed_house, X_test_four_bed_house,
 y_train_four_bed_house, y_test_four_bed_house) = train_test_sets(four_bed_house_merged)

# All dwelling types combined
(X_train_all_properties, X_test_all_properties,
 y_train_all_properties, y_test_all_properties) = train_test_sets(all_properties_merged)

In [83]:
# Display the predictor column names of the one-bed-flat training set as a sanity check
X_train_one_bed.columns

Index(['personal_income_count_2020', 'personal_total_income_millions_2020',
       'median_personal_total_income_2020', 'mean_personal_total_income_2020',
       'gini_coef_2020', 'hi_1_149_tot_2016', 'hi_150_299_tot_2016',
       'hi_300_399_tot_2016', 'hi_400_499_tot_2016', 'hi_500_649_tot_2016',
       ...
       'distance_to_hotel', 'distance_to_kindergarten', 'distance_to_library',
       'distance_to_mall', 'distance_to_park', 'distance_to_police',
       'distance_to_restaurant', 'distance_to_supermarket',
       'nearest_transport_avg_distance', 'distance_to_cbd'],
      dtype='object', length=109)

### Check Null Values

In [84]:
# Count missing entries per column of the one-bed-flat test predictors
missing_values = X_test_one_bed.isna().sum()

# Keep only the columns that have at least one missing entry
columns_with_missing = missing_values.loc[missing_values.gt(0)]

# Report the affected columns and how many values each is missing
print("Columns with missing values and their counts:", columns_with_missing, sep="\n")


Columns with missing values and their counts:
Series([], dtype: int64)
