### Import Libraries and Data

In [1237]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [1238]:
# Rental History Data
one_bed_flat = pd.read_csv('../data/raw/rental_history/one_bed_flat.csv')
two_bed_flat = pd.read_csv('../data/raw/rental_history/two_bed_flat.csv')
three_bed_flat = pd.read_csv('../data/raw/rental_history/three_bed_flat.csv')
two_bed_house = pd.read_csv('../data/raw/rental_history/two_bed_house.csv')
three_bed_house = pd.read_csv('../data/raw/rental_history/three_bed_house.csv')
four_bed_house = pd.read_csv('../data/raw/rental_history/four_bed_house.csv')
all_properties = pd.read_csv('../data/raw/rental_history/all_properties.csv')

# Domain Rental Data
domain_one_bed_flat = pd.read_csv('../data/curated/domain_one_bed_flat_rent.csv')
domain_two_bed_flat = pd.read_csv('../data/curated/domain_two_bed_flat_rent.csv')
domain_three_bed_flat = pd.read_csv('../data/curated/domain_three_bed_flat_rent.csv')
domain_two_bed_house = pd.read_csv('../data/curated/domain_two_bed_house_rent.csv')
domain_three_bed_house = pd.read_csv('../data/curated/domain_three_bed_house_rent.csv')
domain_four_bed_house = pd.read_csv('../data/curated/domain_four_bed_house.csv')
domain_all_properties = pd.read_csv('../data/curated/domain_all_properties_rent.csv')

# Other engineered feature sets 
crimes = pd.read_csv('../data/curated/crimes.csv')
population = pd.read_csv('../data/curated/final_population.csv')
education = pd.read_csv('../data/curated/education_df.csv')
urban_landmarks = pd.read_csv('../data/raw/urban_landmarks_features.csv')
pt_distances = pd.read_csv('../data/curated/suburb_transport_distances.csv')
income = pd.read_csv('../data/curated/income.csv')

### Formatting Rental Dataframes

In [1239]:
def clean_domain_data(df):
    # Drop columns that contain 'Unnamed:' in their name
    df = df.loc[:, ~df.columns.str.contains('^Unnamed:')]
    
    # Rename the 'median_rent' column to 'median_rent_sep'
    if 'median_rent' in df.columns:
        df = df.rename(columns={'median_rent': 'sep_median'})
    
    # Add a 'year' column with value 2024 for each row
    df['year'] = 2024

    # Reorder columns to make 'year' the second column
    cols = list(df.columns)
    cols.insert(1, cols.pop(cols.index('year')))
    df = df[cols]
    
    return df

# Example usage on your list of dataframes:
domain_one_bed_flat = clean_domain_data(domain_one_bed_flat)
domain_two_bed_flat = clean_domain_data(domain_two_bed_flat)
domain_three_bed_flat = clean_domain_data(domain_three_bed_flat)
domain_two_bed_house = clean_domain_data(domain_two_bed_house)
domain_three_bed_house = clean_domain_data(domain_three_bed_house)
domain_four_bed_house = clean_domain_data(domain_four_bed_house)
domain_all_properties = clean_domain_data(domain_all_properties)


Impute the Sep median price from scraped properties

In [1240]:
def impute_values(main_df, domain_df):
    # Merge main_df with domain_df on 'suburb' to keep all years from main_df
    merged_df = pd.merge(main_df, domain_df[['suburb', 'year', 'sep_median']],
                         on=['suburb'], how='left', suffixes=('', '_domain'))
    
    # Replace sep_median values with domain values only for rows where year == 2024
    condition = (merged_df['year'] == 2024) & merged_df['sep_median_domain'].notna()
    merged_df.loc[condition, 'sep_median'] = merged_df.loc[condition, 'sep_median_domain']
    
    # Drop the domain-specific columns used for imputation
    merged_df.drop(columns=['sep_median_domain', 'year_domain'], inplace=True)
    
    # Drop the sep_median column from the domain DataFrame
    domain_df = domain_df.drop(columns=['year', 'sep_median', 'num_properties'], errors='ignore')
    
    return merged_df, domain_df

# Impute for each dataset and drop the sep_median from domain DataFrames
one_bed_flat, domain_one_bed_flat = impute_values(one_bed_flat, domain_one_bed_flat)
two_bed_flat, domain_two_bed_flat = impute_values(two_bed_flat, domain_two_bed_flat)
three_bed_flat, domain_three_bed_flat = impute_values(three_bed_flat, domain_three_bed_flat)
two_bed_house, domain_two_bed_house = impute_values(two_bed_house, domain_two_bed_house)
three_bed_house, domain_three_bed_house = impute_values(three_bed_house, domain_three_bed_house)
four_bed_house, domain_four_bed_house = impute_values(four_bed_house, domain_four_bed_house)
all_properties, domain_all_properties = impute_values(all_properties, domain_all_properties)

# Example output to see the results
domain_one_bed_flat.isnull().sum()

suburb               0
median_bath          0
median_parkings      0
furnished_count      0
unfurnished_count    0
pets_allowed         0
pets_not_allowed     0
dtype: int64

### Combining All Feature Sets

In [1241]:
# Merge one_bed_flat
one_bed_flat_merged = pd.merge(one_bed_flat, crimes, on=['suburb', 'year'], how='outer')
one_bed_flat_merged = pd.merge(one_bed_flat_merged, income, on=['suburb', 'year'], how='outer')
one_bed_flat_merged = pd.merge(one_bed_flat_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
one_bed_flat_merged = pd.merge(one_bed_flat_merged, education, on='suburb', how='inner')
one_bed_flat_merged = pd.merge(one_bed_flat_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
one_bed_flat_merged = pd.merge(one_bed_flat_merged, pt_distances, on='suburb', how='inner')
one_bed_flat_merged = pd.merge(one_bed_flat_merged, domain_one_bed_flat, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in one_bed_flat and the population dataframe
one_bed_flat_merged = one_bed_flat_merged[one_bed_flat_merged['suburb'].isin(population['sa2_name'])]


# Merge two_bed_flat
two_bed_flat_merged = pd.merge(two_bed_flat, crimes, on=['suburb', 'year'], how='outer')
two_bed_flat_merged = pd.merge(two_bed_flat_merged, income, on=['suburb', 'year'], how='outer')
two_bed_flat_merged = pd.merge(two_bed_flat_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
two_bed_flat_merged = pd.merge(two_bed_flat_merged, education, on='suburb', how='inner')
two_bed_flat_merged = pd.merge(two_bed_flat_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
two_bed_flat_merged = pd.merge(two_bed_flat_merged, pt_distances, on='suburb', how='inner')
two_bed_flat_merged = pd.merge(two_bed_flat_merged, domain_two_bed_flat, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in two_bed_flat and the population dataframe
two_bed_flat_merged = two_bed_flat_merged[two_bed_flat_merged['suburb'].isin(population['sa2_name'])]


# Merge three_bed_flat
three_bed_flat_merged = pd.merge(three_bed_flat, crimes, on=['suburb', 'year'], how='outer')
three_bed_flat_merged = pd.merge(three_bed_flat_merged, income, on=['suburb', 'year'], how='outer')
three_bed_flat_merged = pd.merge(three_bed_flat_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
three_bed_flat_merged = pd.merge(three_bed_flat_merged, education, on='suburb', how='inner')
three_bed_flat_merged = pd.merge(three_bed_flat_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
three_bed_flat_merged = pd.merge(three_bed_flat_merged, pt_distances, on='suburb', how='inner')
three_bed_flat_merged = pd.merge(three_bed_flat_merged, domain_three_bed_flat, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in three_bed_flat and the population dataframe
three_bed_flat_merged = three_bed_flat_merged[three_bed_flat_merged['suburb'].isin(population['sa2_name'])]


# Merge two_bed_house
two_bed_house_merged = pd.merge(two_bed_house, crimes, on=['suburb', 'year'], how='outer')
two_bed_house_merged = pd.merge(two_bed_house_merged, income, on=['suburb', 'year'], how='outer')
two_bed_house_merged = pd.merge(two_bed_house_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
two_bed_house_merged = pd.merge(two_bed_house_merged, education, on='suburb', how='inner')
two_bed_house_merged = pd.merge(two_bed_house_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
two_bed_house_merged = pd.merge(two_bed_house_merged, pt_distances, on='suburb', how='inner')
two_bed_house_merged = pd.merge(two_bed_house_merged, domain_two_bed_house, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in two_bed_house and the population dataframe
two_bed_house_merged = two_bed_house_merged[two_bed_house_merged['suburb'].isin(population['sa2_name'])]


# Merge three_bed_house
three_bed_house_merged = pd.merge(three_bed_house, crimes, on=['suburb', 'year'], how='outer')
three_bed_house_merged = pd.merge(three_bed_house_merged, income, on=['suburb', 'year'], how='outer')
three_bed_house_merged = pd.merge(three_bed_house_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
three_bed_house_merged = pd.merge(three_bed_house_merged, education, on='suburb', how='inner')
three_bed_house_merged = pd.merge(three_bed_house_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
three_bed_house_merged = pd.merge(three_bed_house_merged, pt_distances, on='suburb', how='inner')
three_bed_house_merged = pd.merge(three_bed_house_merged, domain_three_bed_house, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in three_bed_house and the population dataframe
three_bed_house_merged = three_bed_house_merged[three_bed_house_merged['suburb'].isin(population['sa2_name'])]


# Merge four_bed_house
four_bed_house_merged = pd.merge(four_bed_house, crimes, on=['suburb', 'year'], how='outer')
four_bed_house_merged = pd.merge(four_bed_house_merged, income, on=['suburb', 'year'], how='outer')
four_bed_house_merged = pd.merge(four_bed_house_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
four_bed_house_merged = pd.merge(four_bed_house_merged, education, on='suburb', how='inner')
four_bed_house_merged = pd.merge(four_bed_house_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
four_bed_house_merged = pd.merge(four_bed_house_merged, pt_distances, on='suburb', how='inner')
four_bed_house_merged = pd.merge(four_bed_house_merged, domain_four_bed_house, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in four_bed_house and the population dataframe
four_bed_house_merged = four_bed_house_merged[four_bed_house_merged['suburb'].isin(population['sa2_name'])]


# Merge all_properties
all_properties_merged = pd.merge(all_properties, crimes, on=['suburb', 'year'], how='outer')
all_properties_merged = pd.merge(all_properties_merged, income, on=['suburb', 'year'], how='outer')
all_properties_merged = pd.merge(all_properties_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
all_properties_merged = pd.merge(all_properties_merged, education, on='suburb', how='inner')
all_properties_merged = pd.merge(all_properties_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
all_properties_merged = pd.merge(all_properties_merged, pt_distances, on='suburb', how='inner')
all_properties_merged = pd.merge(all_properties_merged, domain_all_properties, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in all_properties and the population dataframe
all_properties_merged = all_properties_merged[all_properties_merged['suburb'].isin(population['sa2_name'])]


#----- Check Null
missing_values = one_bed_flat_merged.isnull().sum()
columns_with_missing_values = missing_values[missing_values > 0]

columns_with_missing_values

dec_median    2442
jun_median    2442
mar_median    2442
sep_median    2442
dtype: int64

In [1242]:
# Drop all other suburb column names. Only keep the first suburb column 
def clean_merged_df(df):
    
    df = df.loc[:, ~df.columns.str.contains('Unnamed')]  # removes the duplicate 'suburb' column
    columns_to_drop = ['sa2_name', 'gazetted_locality']
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

    return df

# Clean the column names
one_bed_flat_merged = clean_merged_df(one_bed_flat_merged)
two_bed_flat_merged = clean_merged_df(two_bed_flat_merged)
three_bed_flat_merged = clean_merged_df(three_bed_flat_merged)
two_bed_house_merged = clean_merged_df(two_bed_house_merged)
three_bed_house_merged = clean_merged_df(three_bed_house_merged)
four_bed_house_merged = clean_merged_df(four_bed_house_merged)
all_properties_merged = clean_merged_df(all_properties_merged)

#----- Check Null
missing_values = one_bed_flat_merged.isnull().sum()
columns_with_missing_values = missing_values[missing_values > 0]

columns_with_missing_values

dec_median    2442
jun_median    2442
mar_median    2442
sep_median    2442
dtype: int64

### Create Train & Test Sets

In [1243]:
def train_val_test_sets(df):
    """
    Splits the dataframe into training, validation, and testing sets based on the 'year' column:
    - Training set includes data from the years 2016-2021.
    - Validation set includes data from the years 2022-2024.
    - Testing set includes data from the years 2025-2027.

    The function merges additional columns that are not part of the feature or target columns 
    back with the respective sets based on matching suburbs.

    Args:
    - df (pd.DataFrame): The input dataframe containing all required columns.

    Returns:
    - X_train (pd.DataFrame): Training feature set.
    - X_val (pd.DataFrame): Validation feature set.
    - X_test (pd.DataFrame): Testing feature set.
    - y_train (pd.DataFrame): Training target set.
    - y_val (pd.DataFrame): Validation target set.
    - y_test (pd.DataFrame): Testing target set.
    """
    # Define the year ranges for training, validation, and testing sets
    train_years = range(2016, 2022)
    val_years = range(2022, 2025)
    test_years = range(2025, 2028)

    # Columns to include in X and y splits
    feature_columns = ['suburb', 'year', 'offence_division', 'total_offence_count', 'erp']
    target_columns = ['dec_median', 'jun_median', 'mar_median', 'sep_median']

    # Split feature and target data
    X = df[feature_columns]
    y = df[target_columns]

    # Split the dataframe into training, validation, and testing sets based on the year
    X_train = X[X['year'].isin(train_years)]
    X_val = X[X['year'].isin(val_years)]
    X_test = X[X['year'].isin(test_years)]

    # Align target sets with the corresponding feature sets
    y_train = y.loc[X_train.index]
    y_val = y.loc[X_val.index]
    y_test = y.loc[X_test.index]

    # Extract other columns not in feature_columns or target_columns, including 'suburb'
    other_columns = df.drop(columns=feature_columns + target_columns).columns
    other_data = df[other_columns].copy()
    other_data['suburb'] = df['suburb']  # Ensure 'suburb' is included

    # Merge the 'other' data back with the matching suburbs, irrespective of the year
    X_train = X_train.merge(other_data, on='suburb', how='left')
    X_val = X_val.merge(other_data, on='suburb', how='left')
    X_test = X_test.merge(other_data, on='suburb', how='left')

    # Drop 'suburb' and 'year' from the feature sets
    X_train = X_train.drop(columns=['suburb', 'year'])
    X_val = X_val.drop(columns=['suburb', 'year'])
    X_test = X_test.drop(columns=['suburb', 'year'])

    # Remove rows with NaN values in y_train and y_val
    valid_train_indices = y_train.dropna().index
    X_train = X_train.loc[valid_train_indices]
    y_train = y_train.loc[valid_train_indices]

    valid_val_indices = y_val.dropna().index
    X_val = X_val.loc[valid_val_indices]
    y_val = y_val.loc[valid_val_indices]

    return X_train, X_val, X_test, y_train, y_val, y_test

# Create training, validation, and test sets for each property type
X_train_one_bed, X_val_one_bed, X_test_one_bed, y_train_one_bed, y_val_one_bed, y_test_one_bed = train_val_test_sets(one_bed_flat_merged)
X_train_two_bed, X_val_two_bed, X_test_two_bed, y_train_two_bed, y_val_two_bed, y_test_two_bed = train_val_test_sets(two_bed_flat_merged)
X_train_three_bed, X_val_three_bed, X_test_three_bed, y_train_three_bed, y_val_three_bed, y_test_three_bed = train_val_test_sets(three_bed_flat_merged)
X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house, y_train_two_bed_house, y_val_two_bed_house, y_test_two_bed_house = train_val_test_sets(two_bed_house_merged)
X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house, y_train_three_bed_house, y_val_three_bed_house, y_test_three_bed_house = train_val_test_sets(three_bed_house_merged)
X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house, y_train_four_bed_house, y_val_four_bed_house, y_test_four_bed_house = train_val_test_sets(four_bed_house_merged)
X_train_all_properties, X_val_all_properties, X_test_all_properties, y_train_all_properties, y_val_all_properties, y_test_all_properties = train_val_test_sets(all_properties_merged)

Check to see all the X columns are the same 

In [1244]:
def compare_columns(train, val, test):
    """
    Compares columns of the training, validation, and testing dataframes.
    
    Parameters:
    - train: DataFrame representing the training set.
    - val: DataFrame representing the validation set.
    - test: DataFrame representing the testing set.
    
    Returns:
    - A dictionary indicating columns missing in each set.
    """
    comparison_result = {}
    # Check if columns match between train, validation, and test sets
    train_val_match = train.columns.equals(val.columns)
    train_test_match = train.columns.equals(test.columns)
    val_test_match = val.columns.equals(test.columns)
    
    if not (train_val_match and train_test_match and val_test_match):
        missing_in_val = set(train.columns) - set(val.columns)
        missing_in_train_val = set(val.columns) - set(train.columns)
        missing_in_test = set(train.columns) - set(test.columns)
        missing_in_train_test = set(test.columns) - set(train.columns)
        missing_in_val_test = set(val.columns) - set(test.columns)
        missing_in_test_val = set(test.columns) - set(val.columns)

        comparison_result = {
            "Columns missing in validation set compared to train": list(missing_in_val),
            "Columns missing in train set compared to validation": list(missing_in_train_val),
            "Columns missing in test set compared to train": list(missing_in_test),
            "Columns missing in train set compared to test": list(missing_in_train_test),
            "Columns missing in test set compared to validation": list(missing_in_val_test),
            "Columns missing in validation set compared to test": list(missing_in_test_val),
        }
    else:
        comparison_result = "Columns are the same in all three sets (train, validation, test)."

    return comparison_result

# List of training, validation, and testing DataFrames to compare
dataframes_triplets = {
    "One Bed": (X_train_one_bed, X_val_one_bed, X_test_one_bed),
    "Two Bed": (X_train_two_bed, X_val_two_bed, X_test_two_bed),
    "Three Bed": (X_train_three_bed, X_val_three_bed, X_test_three_bed),
    "Two Bed House": (X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house),
    "Three Bed House": (X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house),
    "Four Bed House": (X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house),
    "All Properties": (X_train_all_properties, X_val_all_properties, X_test_all_properties)
}

# Compare columns for each triplet of training, validation, and testing sets
comparison_results = {name: compare_columns(train, val, test) for name, (train, val, test) in dataframes_triplets.items()}

comparison_results 

{'One Bed': 'Columns are the same in all three sets (train, validation, test).',
 'Two Bed': 'Columns are the same in all three sets (train, validation, test).',
 'Three Bed': 'Columns are the same in all three sets (train, validation, test).',
 'Two Bed House': 'Columns are the same in all three sets (train, validation, test).',
 'Three Bed House': 'Columns are the same in all three sets (train, validation, test).',
 'Four Bed House': 'Columns are the same in all three sets (train, validation, test).',
 'All Properties': 'Columns are the same in all three sets (train, validation, test).'}

### Check Null Values

In [1245]:
# List of dataframes to check for missing values
dataframes = {
    'X_train_one_bed': X_train_one_bed,
    'X_val_one_bed': X_val_one_bed,
    'X_test_one_bed': X_test_one_bed,
    'y_train_one_bed': y_train_one_bed,
    'y_val_one_bed': y_val_one_bed,
    'y_test_one_bed': y_test_one_bed,
    
    'X_train_two_bed': X_train_two_bed,
    'X_val_two_bed': X_val_two_bed,
    'X_test_two_bed': X_test_two_bed,
    'y_train_two_bed': y_train_two_bed,
    'y_val_two_bed': y_val_two_bed,
    'y_test_two_bed': y_test_two_bed,
    
    'X_train_three_bed': X_train_three_bed,
    'X_val_three_bed': X_val_three_bed,
    'X_test_three_bed': X_test_three_bed,
    'y_train_three_bed': y_train_three_bed,
    'y_val_three_bed': y_val_three_bed,
    'y_test_three_bed': y_test_three_bed,
    
    'X_train_two_bed_house': X_train_two_bed_house,
    'X_val_two_bed_house': X_val_two_bed_house,
    'X_test_two_bed_house': X_test_two_bed_house,
    'y_train_two_bed_house': y_train_two_bed_house,
    'y_val_two_bed_house': y_val_two_bed_house,
    'y_test_two_bed_house': y_test_two_bed_house,
    
    'X_train_three_bed_house': X_train_three_bed_house,
    'X_val_three_bed_house': X_val_three_bed_house,
    'X_test_three_bed_house': X_test_three_bed_house,
    'y_train_three_bed_house': y_train_three_bed_house,
    'y_val_three_bed_house': y_val_three_bed_house,
    'y_test_three_bed_house': y_test_three_bed_house,
    
    'X_train_four_bed_house': X_train_four_bed_house,
    'X_val_four_bed_house': X_val_four_bed_house,
    'X_test_four_bed_house': X_test_four_bed_house,
    'y_train_four_bed_house': y_train_four_bed_house,
    'y_val_four_bed_house': y_val_four_bed_house,
    'y_test_four_bed_house': y_test_four_bed_house,
    
    'X_train_all_properties': X_train_all_properties,
    'X_val_all_properties': X_val_all_properties,
    'X_test_all_properties': X_test_all_properties,
    'y_train_all_properties': y_train_all_properties,
    'y_val_all_properties': y_val_all_properties,
    'y_test_all_properties': y_test_all_properties,
}

In [1246]:
# Collecting rows with missing values for each dataframe
missing_rows_summary = {}

for name, df in dataframes.items():
    # Identify rows with missing values
    rows_with_missing = df[df.isnull().any(axis=1)]
    if not rows_with_missing.empty:
        missing_rows_summary[name] = rows_with_missing

print(missing_rows_summary)

# Check for missing values in each dataframe
missing_values_summary = {}
for name, df in dataframes.items():
    missing_values = df.isnull().sum()
    columns_with_missing = missing_values[missing_values > 0]
    if not columns_with_missing.empty:
        missing_values_summary[name] = columns_with_missing

# Display the missing values summary
print(missing_values_summary)

{'y_test_one_bed':       dec_median  jun_median  mar_median  sep_median
54           NaN         NaN         NaN         NaN
55           NaN         NaN         NaN         NaN
56           NaN         NaN         NaN         NaN
57           NaN         NaN         NaN         NaN
58           NaN         NaN         NaN         NaN
...          ...         ...         ...         ...
9715         NaN         NaN         NaN         NaN
9716         NaN         NaN         NaN         NaN
9717         NaN         NaN         NaN         NaN
9718         NaN         NaN         NaN         NaN
9719         NaN         NaN         NaN         NaN

[2376 rows x 4 columns], 'y_test_two_bed':        dec_median  jun_median  mar_median  sep_median
54            NaN         NaN         NaN         NaN
55            NaN         NaN         NaN         NaN
56            NaN         NaN         NaN         NaN
57            NaN         NaN         NaN         NaN
58            NaN         NaN  

### Feature Selection

In [1247]:
def pca_feature_selection(X_train, X_val, X_test, variance_threshold=0.7):
    """
    The function applies PCA for dimensionality reduction by fitting on the
    training set and transforming both the training and test sets. It keeps 
    either a specified number of components or selects them based on a variance 
    threshold. It returns the reduced training and test sets.
    """

    # Standardise the data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Initialise PCA, specifying the variance threshold
    pca_temp = PCA().fit(X_train_scaled)
    cumulative_variance = pca_temp.explained_variance_ratio_.cumsum()
    # Find the number of components to capture the specified variance
    n_components = next(i for i, total_variance in enumerate(cumulative_variance) if total_variance >= variance_threshold) + 1

    pca = PCA(n_components=n_components)

    # Fit PCA on the training set and transform both training and test sets
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_val_pca = pca.transform(X_val_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    print(f"Number of components selected: {n_components}")
    print(f"Explained variance by selected components: {sum(pca.explained_variance_ratio_):.2f}")

    return X_train_pca, X_val_pca, X_test_pca

# Perform feature selection with PCA on the X sets 
X_train_one_bed, X_val_one_bed, X_test_one_bed = pca_feature_selection(X_train_one_bed, X_val_one_bed, X_test_one_bed)
X_train_two_bed, X_val_two_bed, X_test_two_bed = pca_feature_selection(X_train_two_bed, X_val_two_bed, X_test_two_bed)
X_train_three_bed, X_val_three_bed, X_test_three_bed = pca_feature_selection(X_train_three_bed, X_val_three_bed, X_test_three_bed)
X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house = pca_feature_selection(X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house)
X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house = pca_feature_selection(X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house)
X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house = pca_feature_selection(X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house)
X_train_all_properties, X_val_all_properties, X_test_all_properties = pca_feature_selection(X_train_all_properties, X_val_all_properties, X_test_all_properties)

Number of components selected: 2
Explained variance by selected components: 0.74
Number of components selected: 3
Explained variance by selected components: 0.82
Number of components selected: 2
Explained variance by selected components: 0.74
Number of components selected: 3
Explained variance by selected components: 0.77
Number of components selected: 4
Explained variance by selected components: 0.83
Number of components selected: 3
Explained variance by selected components: 0.79
Number of components selected: 3
Explained variance by selected components: 0.71


### Train & Fit Model

In [1248]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Define a list of all training, validation, and test sets
datasets = [
    (X_train_one_bed, X_val_one_bed, X_test_one_bed, y_train_one_bed, y_val_one_bed, y_test_one_bed),
    (X_train_two_bed, X_val_two_bed, X_test_two_bed, y_train_two_bed, y_val_two_bed, y_test_two_bed),
    (X_train_three_bed, X_val_three_bed, X_test_three_bed, y_train_three_bed, y_val_three_bed, y_test_three_bed),
    (X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house, y_train_two_bed_house, y_val_two_bed_house, y_test_two_bed_house),
    (X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house, y_train_three_bed_house, y_val_three_bed_house, y_test_three_bed_house),
    (X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house, y_train_four_bed_house, y_val_four_bed_house, y_test_four_bed_house),
    (X_train_all_properties, X_val_all_properties, X_test_all_properties, y_train_all_properties, y_val_all_properties, y_test_all_properties),
]

# Define hyperparameters for Lasso
alpha_range = np.logspace(-4, 1, 10)  # Range of alpha values for tuning
param_grid = {'alpha': alpha_range}

# Loop through each set, tune the model on the validation set, and predict on the test set
for i, (X_train, X_val, X_test, y_train, y_val, y_test) in enumerate(datasets):
    # Initialize the Lasso model
    lasso = Lasso()
    
    # Set up the GridSearchCV to tune the 'alpha' hyperparameter
    grid_search = GridSearchCV(lasso, param_grid, scoring='neg_mean_squared_error', cv=5)
    
    # Train the model using the training set and validate on the validation set
    grid_search.fit(X_train, y_train)
    
    # Select the best model based on the validation set
    best_lasso = grid_search.best_estimator_
    
    # Evaluate on the validation set
    y_val_pred = best_lasso.predict(X_val)
    val_mse = mean_squared_error(y_val, y_val_pred)
    val_r2 = r2_score(y_val, y_val_pred)
    
    # Make predictions on the test set
    y_test_pred = best_lasso.predict(X_test)
    
    # Print the results
    print(f"Dataset {i+1}:")
    print(f"Best alpha: {best_lasso.alpha}")
    print(f"Validation MSE: {val_mse:.4f}, R^2: {val_r2:.4f}")
    print(f"Predictions for 2025-2027: {y_test_pred}\n")


Dataset 1:
Best alpha: 10.0
Validation MSE: 8053.8958, R^2: -0.4595
Predictions for 2025-2027: [[310.30774929 308.82236715 307.34697064 309.47767535]
 [310.30774929 308.82236715 307.34697064 309.47767535]
 [310.30774929 308.82236715 307.34697064 309.47767535]
 ...
 [300.0366232  298.71998189 297.91785723 299.18394444]
 [300.0366232  298.71998189 297.91785723 299.18394444]
 [300.0366232  298.71998189 297.91785723 299.18394444]]

Dataset 2:
Best alpha: 10.0
Validation MSE: 13337.8246, R^2: -0.5642
Predictions for 2025-2027: [[389.56693218 386.62324256 385.12589965 388.22316099]
 [389.56693218 386.62324256 385.12589965 388.22316099]
 [389.56693218 386.62324256 385.12589965 388.22316099]
 ...
 [380.90730241 378.23738011 376.87906051 379.62562685]
 [380.90730241 378.23738011 376.87906051 379.62562685]
 [380.90730241 378.23738011 376.87906051 379.62562685]]

Dataset 3:
Best alpha: 10.0
Validation MSE: 38599.2127, R^2: -0.2970
Predictions for 2025-2027: [[517.49843377 512.72444883 509.8301784