### Import Libraries and Data

In [17]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [18]:
# Rental History Data
one_bed_flat = pd.read_csv('../data/raw/rental_history/one_bed_flat.csv')
two_bed_flat = pd.read_csv('../data/raw/rental_history/two_bed_flat.csv')
three_bed_flat = pd.read_csv('../data/raw/rental_history/three_bed_flat.csv')
two_bed_house = pd.read_csv('../data/raw/rental_history/two_bed_house.csv')
three_bed_house = pd.read_csv('../data/raw/rental_history/three_bed_house.csv')
four_bed_house = pd.read_csv('../data/raw/rental_history/four_bed_house.csv')
all_properties = pd.read_csv('../data/raw/rental_history/all_properties.csv')

# Domain Rental Data
domain_one_bed_flat = pd.read_csv('../data/curated/domain_one_bed_flat_rent.csv')
domain_two_bed_flat = pd.read_csv('../data/curated/domain_two_bed_flat_rent.csv')
domain_three_bed_flat = pd.read_csv('../data/curated/domain_three_bed_flat_rent.csv')
domain_two_bed_house = pd.read_csv('../data/curated/domain_two_bed_house_rent.csv')
domain_three_bed_house = pd.read_csv('../data/curated/domain_three_bed_house_rent.csv')
domain_four_bed_house = pd.read_csv('../data/curated/domain_four_bed_house.csv')
domain_all_properties = pd.read_csv('../data/curated/domain_all_properties_rent.csv')

# Other engineered feature sets 
crimes = pd.read_csv('../data/curated/crimes.csv')
demographics = pd.read_csv('../data/curated/demographics.csv')
education = pd.read_csv('../data/curated/education_df.csv')
urban_landmarks = pd.read_csv('../data/raw/urban_landmarks_features.csv')

### Combining All Feature Sets

In [19]:
# Merge one_bed_flat
one_bed_merged = pd.merge(one_bed_flat, domain_one_bed_flat, on='suburb', how='inner')
one_bed_merged = pd.merge(one_bed_merged, crimes, on='suburb', how='inner')
one_bed_merged = pd.merge(one_bed_merged, demographics, left_on='suburb', right_on='sa2_name', how='inner')
one_bed_merged = pd.merge(one_bed_merged, education, on='suburb', how='inner')
one_bed_merged = pd.merge(one_bed_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')

# Merge two_bed_flat
two_bed_merged = pd.merge(two_bed_flat, domain_two_bed_flat, on='suburb', how='inner')
two_bed_merged = pd.merge(two_bed_merged, crimes, on='suburb', how='inner')
two_bed_merged = pd.merge(two_bed_merged, demographics, left_on='suburb', right_on='sa2_name', how='inner')
two_bed_merged = pd.merge(two_bed_merged, education, on='suburb', how='inner')
two_bed_merged = pd.merge(two_bed_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')

# Merge three_bed_flat
three_bed_merged = pd.merge(three_bed_flat, domain_three_bed_flat, on='suburb', how='inner')
three_bed_merged = pd.merge(three_bed_merged, crimes, on='suburb', how='inner')
three_bed_merged = pd.merge(three_bed_merged, demographics, left_on='suburb', right_on='sa2_name', how='inner')
three_bed_merged = pd.merge(three_bed_merged, education, on='suburb', how='inner')
three_bed_merged = pd.merge(three_bed_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')

# Merge two_bed_house
two_bed_house_merged = pd.merge(two_bed_house, domain_two_bed_house, on='suburb', how='inner')
two_bed_house_merged = pd.merge(two_bed_house_merged, crimes, on='suburb', how='inner')
two_bed_house_merged = pd.merge(two_bed_house_merged, demographics, left_on='suburb', right_on='sa2_name', how='inner')
two_bed_house_merged = pd.merge(two_bed_house_merged, education, on='suburb', how='inner')
two_bed_house_merged = pd.merge(two_bed_house_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')

# Merge three_bed_house
three_bed_house_merged = pd.merge(three_bed_house, domain_three_bed_house, on='suburb', how='inner')
three_bed_house_merged = pd.merge(three_bed_house_merged, crimes, on='suburb', how='inner')
three_bed_house_merged = pd.merge(three_bed_house_merged, demographics, left_on='suburb', right_on='sa2_name', how='inner')
three_bed_house_merged = pd.merge(three_bed_house_merged, education, on='suburb', how='inner')
three_bed_house_merged = pd.merge(three_bed_house_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')

# Merge four_bed_house
four_bed_house_merged = pd.merge(four_bed_house, domain_four_bed_house, on='suburb', how='inner')
four_bed_house_merged = pd.merge(four_bed_house_merged, crimes, on='suburb', how='inner')
four_bed_house_merged = pd.merge(four_bed_house_merged, demographics, left_on='suburb', right_on='sa2_name', how='inner')
four_bed_house_merged = pd.merge(four_bed_house_merged, education, on='suburb', how='inner')
four_bed_house_merged = pd.merge(four_bed_house_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')

# Merge all_properties
all_properties_merged = pd.merge(all_properties, domain_all_properties, on='suburb', how='inner')
all_properties_merged = pd.merge(all_properties_merged, crimes, on='suburb', how='inner')
all_properties_merged = pd.merge(all_properties_merged, demographics, left_on='suburb', right_on='sa2_name', how='inner')
all_properties_merged = pd.merge(all_properties_merged, education, on='suburb', how='inner')
all_properties_merged = pd.merge(all_properties_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')


In [20]:
# Drop all other suburb column names. Only keep the first suburb column 
def clean_merged_df(df):
    
    df = df.loc[:, ~df.columns.str.contains('Unnamed')]  # removes the duplicate 'suburb' column
    columns_to_drop = ['sa2_name', 'gazetted_locality']
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

    return df

# Clean the column names
one_bed_merged = clean_merged_df(one_bed_merged)
two_bed_merged = clean_merged_df(two_bed_merged)
three_bed_merged = clean_merged_df(three_bed_merged)
two_bed_house_merged = clean_merged_df(two_bed_house_merged,)
three_bed_house_merged = clean_merged_df(three_bed_house_merged)
four_bed_house_merged = clean_merged_df(four_bed_house_merged)
all_properties_merged = clean_merged_df(all_properties_merged)

In [21]:
# Check columns and shape
print(list(all_properties_merged.columns))
all_properties_merged.shape

['suburb', 'mar_2016_median', 'jun_2016_median', 'sep_2016_median', 'dec_2016_median', 'mar_2017_median', 'jun_2017_median', 'sep_2017_median', 'dec_2017_median', 'mar_2018_median', 'jun_2018_median', 'sep_2018_median', 'dec_2018_median', 'mar_2019_median', 'jun_2019_median', 'sep_2019_median', 'dec_2019_median', 'mar_2020_median', 'jun_2020_median', 'sep_2020_median', 'dec_2020_median', 'mar_2021_median', 'jun_2021_median', 'sep_2021_median', 'dec_2021_median', 'mar_2022_median', 'jun_2022_median', 'sep_2022_median', 'dec_2022_median', 'mar_2023_median', 'jun_2023_median', 'sep_2023_median', 'dec_2023_median', 'mar_2024_median', 'median_rent', 'median_bath', 'median_parkings', 'furnished_count', 'unfurnished_count', 'pets_allowed', 'pets_not_allowed', 'num_properties', 'offence_division', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', 'erp_june_2022_count', 'erp_june_2023_count', 'erp_change_count', 'erp_change_percentage', 'natural_increase_count', 'net_internal_migr

(1110, 122)

### Create Train & Test Sets

In [22]:
def train_test_sets(df):
    """
    Input the dataframe. The train sets include the data from the year of 2016-2021 and 
    the test sets include years from 2022-2024. The label sets (y) should only include the 
    median house prices for each month and year, whereas the predictor variables (X) should 
    include any engineered features correspondeiing to the years of the train and test sets 
    whilst also including the general features we created relevant to each suburb. Return
    the following dataframes: X_train, X_test, y_train, y_test.
    """

    # Train set columns = 2016-2021, Test set columns = 2022-2024
    train_years = [str(year) for year in range(2016, 2022)]
    test_years = [str(year) for year in range(2022, 2025)]

    # Filter for the respective columns for the training and test set 
    train_columns = [col for col in df.columns if any(year in col for year in train_years)]
    test_columns = [col for col in df.columns if any(year in col for year in test_years)]

     # Identify columns that do not contain any year and add them to the train and test sets 
    non_year_columns = [col for col in df.columns if not any(str(year) in col for year in range(2016, 2025))]

     # Exclude the 'suburb' column if it exists
    if 'suburb' in non_year_columns:
        non_year_columns.remove('suburb')
        
    train_columns.extend(non_year_columns)
    test_columns.extend(non_year_columns)

    # Extract features and target columns
    X_train = df[[col for col in train_columns if '_median' not in col]]
    y_train = df[[col for col in train_columns if '_median' in col]]

    X_test = df[[col for col in test_columns if '_median' not in col]]
    y_test = df[[col for col in test_columns if '_median' in col]]

    return X_train, X_test, y_train, y_test


# Create training and test sets 
X_train_one_bed, X_test_one_bed, y_train_one_bed, y_test_one_bed = train_test_sets(one_bed_merged)
X_train_two_bed, X_test_two_bed, y_train_two_bed, y_test_two_bed = train_test_sets(two_bed_merged)
X_train_three_bed, X_test_three_bed, y_train_three_bed, y_test_three_bed = train_test_sets(three_bed_merged)
X_train_two_bed_house, X_test_two_bed_house, y_train_two_bed_house, y_test_two_bed_house = train_test_sets(two_bed_house_merged)
X_train_three_bed_house, X_test_three_bed_house, y_train_three_bed_house, y_test_three_bed_house = train_test_sets(three_bed_house_merged)
X_train_four_bed_house, X_test_four_bed_house, y_train_four_bed_house, y_test_four_bed_house = train_test_sets(four_bed_house_merged)
X_train_all_properties, X_test_all_properties, y_train_all_properties, y_test_all_properties = train_test_sets(all_properties_merged)

### Feature Selection

In [24]:
# Check for missing values in the DataFrame
missing_values = X_test_one_bed.isnull().sum()

# Filter columns that have missing values
columns_with_missing = missing_values[missing_values > 0]

# Display the columns with missing values and their counts
print("Columns with missing values and their counts:")
print(columns_with_missing)


Columns with missing values and their counts:
average_highschool_rank        144
median_highschool_rank         144
average_primary_school_rank     30
median_primary_school_rank      30
average_school_rank              6
median_school_rank               6
dtype: int64


In [8]:
def pca_feature_selection(X_train, X_test, variance_threshold=0.95):
    """
    The function applies PCA for dimensionality reduction by fitting on the
    training set and transforming both the training and test sets. It keeps 
    either a specified number of components or selects them based on a variance 
    threshold. It returns the reduced training and test sets.
    """

    # Standardise the data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initialise PCA, specifying the variance threshold
    pca_temp = PCA().fit(X_train_scaled)
    cumulative_variance = pca_temp.explained_variance_ratio_.cumsum()
    # Find the number of components to capture the specified variance
    n_components = next(i for i, total_variance in enumerate(cumulative_variance) if total_variance >= variance_threshold) + 1

    pca = PCA(n_components=n_components)

    # Fit PCA on the training set and transform both training and test sets
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    print(f"Number of components selected: {n_components}")
    print(f"Explained variance by selected components: {sum(pca.explained_variance_ratio_):.2f}")

    return X_train_pca, X_test_pca

# Perform feature selection with PCA on the X sets 
X_train_one_bed, X_test_one_bed = pca_feature_selection(X_train_one_bed, X_test_one_bed )
X_train_two_bed, X_test_two_bed = pca_feature_selection(X_train_two_bed, X_test_two_bed)
X_train_three_bed, X_test_three_bed = pca_feature_selection(X_train_three_bed, X_test_three_bed)
X_train_two_bed_house, X_test_two_bed_house = pca_feature_selection(X_train_two_bed_house, X_test_two_bed_house)
X_train_three_bed_house, X_test_three_bed_house = pca_feature_selection(X_train_three_bed_house, X_test_three_bed_house)
X_train_four_bed_house, X_test_four_bed_house = pca_feature_selection(X_train_four_bed_house, X_test_four_bed_house)
X_train_all_properties, X_test_all_properties = pca_feature_selection(X_train_all_properties, X_test_all_properties)


ValueError: Expected 2D array, got scalar array instead:
array=nan.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

### Save Combined Feature Set

In [None]:
# def save_feature_set(df, file_path):
#      # Retrieve the directory 
#     directory = os.path.dirname(file_path)
    
#     # Create the directory if it does not exist
#     if not os.path.exists(directory):
#         os.makedirs(directory)

#     # Save the filtered dataframe to CSV
#     df.to_csv(file_path, index=False)

# print("All Saved!")

# # Save all dataframes in the raw folder
# one_bed_merged = save_feature_set(one_bed_merged, '../data/curated/combined_feature_set/one_bed_merged.csv')
# two_bed_merged = save_feature_set(two_bed_merged, '../data/curated/combined_feature_set/two_bed_merged.csv')
# three_bed_merged = save_feature_set(three_bed_merged, '../data/curated/combined_feature_set/three_bed_merged.csv')

# two_bed_house_merged = save_feature_set(two_bed_house_merged, '../data/curated/combined_feature_set/two_bed_house_merged.csv')
# three_bed_house_merged = save_feature_set(three_bed_house_merged, '../data/curated/combined_feature_set/three_bed_house_merged.csv')
# four_bed_house_merged = save_feature_set(four_bed_house_merged, '../data/curated/combined_feature_set/four_bed_house_merged.csv')

# all_properties_merged = save_feature_set(all_properties_merged, '../data/curated/combined_feature_set/all_properties_merged.csv')

All Saved!
