### Import Libraries and Data

In [1062]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [1063]:
# Rental History Data
one_bed_flat = pd.read_csv('../data/raw/rental_history/one_bed_flat.csv')
two_bed_flat = pd.read_csv('../data/raw/rental_history/two_bed_flat.csv')
three_bed_flat = pd.read_csv('../data/raw/rental_history/three_bed_flat.csv')
two_bed_house = pd.read_csv('../data/raw/rental_history/two_bed_house.csv')
three_bed_house = pd.read_csv('../data/raw/rental_history/three_bed_house.csv')
four_bed_house = pd.read_csv('../data/raw/rental_history/four_bed_house.csv')
all_properties = pd.read_csv('../data/raw/rental_history/all_properties.csv')

# Domain Rental Data
domain_one_bed_flat = pd.read_csv('../data/curated/domain_one_bed_flat_rent.csv')
domain_two_bed_flat = pd.read_csv('../data/curated/domain_two_bed_flat_rent.csv')
domain_three_bed_flat = pd.read_csv('../data/curated/domain_three_bed_flat_rent.csv')
domain_two_bed_house = pd.read_csv('../data/curated/domain_two_bed_house_rent.csv')
domain_three_bed_house = pd.read_csv('../data/curated/domain_three_bed_house_rent.csv')
domain_four_bed_house = pd.read_csv('../data/curated/domain_four_bed_house.csv')
domain_all_properties = pd.read_csv('../data/curated/domain_all_properties_rent.csv')

# Other engineered feature sets 
crimes = pd.read_csv('../data/curated/crimes.csv')
population = pd.read_csv('../data/curated/final_population.csv')
education = pd.read_csv('../data/curated/education_df.csv')
urban_landmarks = pd.read_csv('../data/raw/urban_landmarks_features.csv')
pt_distances = pd.read_csv('../data/curated/suburb_transport_distances.csv')
income_2016 = pd.read_csv('../data/curated/income_2016.csv')
income_2021 = pd.read_csv('../data/curated/income_2021.csv')

### Formatting Rental Dataframes

In [1064]:
def clean_domain_data(df):
    # Drop columns that contain 'Unnamed:' in their name
    df = df.loc[:, ~df.columns.str.contains('^Unnamed:')]
    
    # Rename the 'median_rent' column to 'median_rent_sep'
    if 'median_rent' in df.columns:
        df = df.rename(columns={'median_rent': 'sep_median'})
    
    # Add a 'year' column with value 2024 for each row
    df['year'] = 2024

    # Reorder columns to make 'year' the second column
    cols = list(df.columns)
    cols.insert(1, cols.pop(cols.index('year')))
    df = df[cols]
    
    return df

# Example usage on your list of dataframes:
domain_one_bed_flat = clean_domain_data(domain_one_bed_flat)
domain_two_bed_flat = clean_domain_data(domain_two_bed_flat)
domain_three_bed_flat = clean_domain_data(domain_three_bed_flat)
domain_two_bed_house = clean_domain_data(domain_two_bed_house)
domain_three_bed_house = clean_domain_data(domain_three_bed_house)
domain_four_bed_house = clean_domain_data(domain_four_bed_house)
domain_all_properties = clean_domain_data(domain_all_properties)


Impute the Sep median price from scraped properties

In [1065]:
def impute_values(main_df, domain_df):
    # Merge main_df with domain_df on 'suburb' to keep all years from main_df
    merged_df = pd.merge(main_df, domain_df[['suburb', 'year', 'sep_median']],
                         on=['suburb'], how='left', suffixes=('', '_domain'))
    
    # Replace sep_median values with domain values only for rows where year == 2024
    condition = (merged_df['year'] == 2024) & merged_df['sep_median_domain'].notna()
    merged_df.loc[condition, 'sep_median'] = merged_df.loc[condition, 'sep_median_domain']
    
    # Drop the domain-specific columns used for imputation
    merged_df.drop(columns=['sep_median_domain', 'year_domain'], inplace=True)
    
    # Drop the sep_median column from the domain DataFrame
    domain_df = domain_df.drop(columns=['year', 'sep_median', 'num_properties'], errors='ignore')
    
    return merged_df, domain_df

# Impute for each dataset and drop the sep_median from domain DataFrames
one_bed_flat, domain_one_bed_flat = impute_values(one_bed_flat, domain_one_bed_flat)
two_bed_flat, domain_two_bed_flat = impute_values(two_bed_flat, domain_two_bed_flat)
three_bed_flat, domain_three_bed_flat = impute_values(three_bed_flat, domain_three_bed_flat)
two_bed_house, domain_two_bed_house = impute_values(two_bed_house, domain_two_bed_house)
three_bed_house, domain_three_bed_house = impute_values(three_bed_house, domain_three_bed_house)
four_bed_house, domain_four_bed_house = impute_values(four_bed_house, domain_four_bed_house)
all_properties, domain_all_properties = impute_values(all_properties, domain_all_properties)

# Example output to see the results
domain_one_bed_flat.isnull().sum()

suburb               0
median_bath          0
median_parkings      0
furnished_count      0
unfurnished_count    0
pets_allowed         0
pets_not_allowed     0
dtype: int64

### Combining All Feature Sets

In [1066]:
# Merge one_bed_flat
one_bed_flat_merged = pd.merge(one_bed_flat, crimes, on=['suburb', 'year'], how='outer')
one_bed_flat_merged = pd.merge(one_bed_flat_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
one_bed_flat_merged = pd.merge(one_bed_flat_merged, education, on='suburb', how='inner')
one_bed_flat_merged = pd.merge(one_bed_flat_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
one_bed_flat_merged = pd.merge(one_bed_flat_merged, pt_distances, on='suburb', how='inner')
one_bed_flat_merged = pd.merge(one_bed_flat_merged, domain_one_bed_flat, on='suburb', how='inner')
print(one_bed_flat_merged['year'].unique())
# one_bed_flat_merged = pd.merge(one_bed_flat_merged, income_2016, on='suburb', how='inner')
# one_bed_flat_merged = pd.merge(one_bed_flat_merged, income_2021, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in one_bed_flat and the population dataframe
one_bed_flat_merged = one_bed_flat_merged[one_bed_flat_merged['suburb'].isin(population['sa2_name'])]


# Merge two_bed_flat
two_bed_flat_merged = pd.merge(two_bed_flat, crimes, on=['suburb', 'year'], how='outer')
two_bed_flat_merged = pd.merge(two_bed_flat_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
two_bed_flat_merged = pd.merge(two_bed_flat_merged, education, on='suburb', how='inner')
two_bed_flat_merged = pd.merge(two_bed_flat_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
two_bed_flat_merged = pd.merge(two_bed_flat_merged, pt_distances, on='suburb', how='inner')
two_bed_flat_merged = pd.merge(two_bed_flat_merged, domain_two_bed_flat, on='suburb', how='inner')
# two_bed_flat_merged = pd.merge(two_bed_flat_merged, income_2016, on='suburb', how='inner')
# two_bed_flat_merged = pd.merge(two_bed_flat_merged, income_2021, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in two_bed_flat and the population dataframe
two_bed_flat_merged = two_bed_flat_merged[two_bed_flat_merged['suburb'].isin(population['sa2_name'])]


# Merge three_bed_flat
three_bed_flat_merged = pd.merge(three_bed_flat, crimes, on=['suburb', 'year'], how='outer')
three_bed_flat_merged = pd.merge(three_bed_flat_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
three_bed_flat_merged = pd.merge(three_bed_flat_merged, education, on='suburb', how='inner')
three_bed_flat_merged = pd.merge(three_bed_flat_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
three_bed_flat_merged = pd.merge(three_bed_flat_merged, pt_distances, on='suburb', how='inner')
three_bed_flat_merged = pd.merge(three_bed_flat_merged, domain_three_bed_flat, on='suburb', how='inner')
# three_bed_flat_merged = pd.merge(three_bed_flat_merged, income_2016, on='suburb', how='inner')
# three_bed_flat_merged = pd.merge(three_bed_flat_merged, income_2021, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in three_bed_flat and the population dataframe
three_bed_flat_merged = three_bed_flat_merged[three_bed_flat_merged['suburb'].isin(population['sa2_name'])]


# Merge two_bed_house
two_bed_house_merged = pd.merge(two_bed_house, crimes, on=['suburb', 'year'], how='outer')
two_bed_house_merged = pd.merge(two_bed_house_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
two_bed_house_merged = pd.merge(two_bed_house_merged, education, on='suburb', how='inner')
two_bed_house_merged = pd.merge(two_bed_house_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
two_bed_house_merged = pd.merge(two_bed_house_merged, pt_distances, on='suburb', how='inner')
two_bed_house_merged = pd.merge(two_bed_house_merged, domain_two_bed_house, on='suburb', how='inner')
# two_bed_house_merged = pd.merge(two_bed_house_merged, income_2016, on='suburb', how='inner')
# two_bed_house_merged = pd.merge(two_bed_house_merged, income_2021, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in two_bed_house and the population dataframe
two_bed_house_merged = two_bed_house_merged[two_bed_house_merged['suburb'].isin(population['sa2_name'])]


# Merge three_bed_house
three_bed_house_merged = pd.merge(three_bed_house, crimes, on=['suburb', 'year'], how='outer')
three_bed_house_merged = pd.merge(three_bed_house_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
three_bed_house_merged = pd.merge(three_bed_house_merged, education, on='suburb', how='inner')
three_bed_house_merged = pd.merge(three_bed_house_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
three_bed_house_merged = pd.merge(three_bed_house_merged, pt_distances, on='suburb', how='inner')
three_bed_house_merged = pd.merge(three_bed_house_merged, domain_three_bed_house, on='suburb', how='inner')
# three_bed_house_merged = pd.merge(three_bed_house_merged, income_2016, on='suburb', how='inner')
# three_bed_house_merged = pd.merge(three_bed_house_merged, income_2021, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in three_bed_house and the population dataframe
three_bed_house_merged = three_bed_house_merged[three_bed_house_merged['suburb'].isin(population['sa2_name'])]


# Merge four_bed_house
four_bed_house_merged = pd.merge(four_bed_house, crimes, on=['suburb', 'year'], how='outer')
four_bed_house_merged = pd.merge(four_bed_house_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
four_bed_house_merged = pd.merge(four_bed_house_merged, education, on='suburb', how='inner')
four_bed_house_merged = pd.merge(four_bed_house_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
four_bed_house_merged = pd.merge(four_bed_house_merged, pt_distances, on='suburb', how='inner')
four_bed_house_merged = pd.merge(four_bed_house_merged, domain_four_bed_house, on='suburb', how='inner')
# four_bed_house_merged = pd.merge(four_bed_house_merged, income_2016, on='suburb', how='inner')
# four_bed_house_merged = pd.merge(four_bed_house_merged, income_2021, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in four_bed_house and the population dataframe
four_bed_house_merged = four_bed_house_merged[four_bed_house_merged['suburb'].isin(population['sa2_name'])]


# Merge all_properties
all_properties_merged = pd.merge(all_properties, crimes, on=['suburb', 'year'], how='outer')
all_properties_merged = pd.merge(all_properties_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
all_properties_merged = pd.merge(all_properties_merged, education, on='suburb', how='inner')
all_properties_merged = pd.merge(all_properties_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
all_properties_merged = pd.merge(all_properties_merged, pt_distances, on='suburb', how='inner')
all_properties_merged = pd.merge(all_properties_merged, domain_all_properties, on='suburb', how='inner')
# all_properties_merged = pd.merge(all_properties_merged, income_2016, on='suburb', how='inner')
# all_properties_merged = pd.merge(all_properties_merged, income_2021, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in all_properties and the population dataframe
all_properties_merged = all_properties_merged[all_properties_merged['suburb'].isin(population['sa2_name'])]


#----- Check Null
missing_values = two_bed_flat_merged.isnull().sum()
columns_with_missing_values = missing_values[missing_values > 0]

columns_with_missing_values

[2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027]


dec_median    3024
jun_median    3024
mar_median    3024
sep_median    3024
dtype: int64

In [1067]:
# Drop all other suburb column names. Only keep the first suburb column 
def clean_merged_df(df):
    
    df = df.loc[:, ~df.columns.str.contains('Unnamed')]  # removes the duplicate 'suburb' column
    columns_to_drop = ['sa2_name', 'gazetted_locality']
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

    return df

# Clean the column names
one_bed_flat_merged = clean_merged_df(one_bed_flat_merged)
two_bed_flat_merged = clean_merged_df(two_bed_flat_merged)
three_bed_flat_merged = clean_merged_df(three_bed_flat_merged)
two_bed_house_merged = clean_merged_df(two_bed_house_merged)
three_bed_house_merged = clean_merged_df(three_bed_house_merged)
four_bed_house_merged = clean_merged_df(four_bed_house_merged)
all_properties_merged = clean_merged_df(all_properties_merged)

#----- Check Null
missing_values = one_bed_flat_merged.isnull().sum()
columns_with_missing_values = missing_values[missing_values > 0]

columns_with_missing_values

dec_median    2442
jun_median    2442
mar_median    2442
sep_median    2442
dtype: int64

### Create Train & Test Sets

In [1068]:
def train_test_sets(df):
    """
    Splits the dataframe into training and testing sets based on the 'year' column:
    - Training set includes data from the years 2016-2024.
    - Testing set includes data from the years 2025-2027.

    The function merges additional columns that are not part of the feature or target columns 
    back with the respective training and testing sets based on matching suburbs.

    Args:
    - df (pd.DataFrame): The input dataframe containing all required columns.

    Returns:
    - X_train (pd.DataFrame): Training feature set.
    - X_test (pd.DataFrame): Testing feature set.
    - y_train (pd.DataFrame): Training target set.
    - y_test (pd.DataFrame): Testing target set.
    """
    # Define the year ranges for training and testing sets
    train_years = range(2016, 2025)
    test_years = range(2025, 2028)

    # Columns to include in X and y splits
    feature_columns = ['suburb', 'year', 'offence_division', 'total_offence_count', 'erp']
    target_columns = ['dec_median', 'jun_median', 'mar_median', 'sep_median']

    # Split feature and target data
    X = df[feature_columns]
    y = df[target_columns]

    # Split the dataframe into training and testing sets based on the year
    X_train = X[X['year'].isin(train_years)]
    X_test = X[X['year'].isin(test_years)]

    # Align target sets with the corresponding feature sets
    y_train = y.loc[X_train.index]
    y_test = y.loc[X_test.index]

    # Extract other columns not in feature_columns or target_columns, including 'suburb'
    other_columns = df.drop(columns=feature_columns + target_columns).columns
    other_data = df[other_columns].copy()
    other_data['suburb'] = df['suburb']  # Ensure 'suburb' is included

    # Merge the 'other' data back with the matching suburbs, irrespective of the year
    X_train = X_train.merge(other_data, on='suburb', how='left')
    X_test = X_test.merge(other_data, on='suburb', how='left')

    # Drop 'suburb' and 'year' from the feature sets
    X_train = X_train.drop(columns=['suburb', 'year'])
    X_test = X_test.drop(columns=['suburb', 'year'])

    # Remove rows with NaN values in y_train
    valid_indices = y_train.dropna().index
    X_train = X_train.loc[valid_indices]
    y_train = y_train.loc[valid_indices]

    return X_train, X_test, y_train, y_test

# Create training and test sets for each property type
X_train_one_bed, X_test_one_bed, y_train_one_bed, y_test_one_bed = train_test_sets(one_bed_flat_merged)
X_train_two_bed, X_test_two_bed, y_train_two_bed, y_test_two_bed = train_test_sets(two_bed_flat_merged)
X_train_three_bed, X_test_three_bed, y_train_three_bed, y_test_three_bed = train_test_sets(three_bed_flat_merged)
X_train_two_bed_house, X_test_two_bed_house, y_train_two_bed_house, y_test_two_bed_house = train_test_sets(two_bed_house_merged)
X_train_three_bed_house, X_test_three_bed_house, y_train_three_bed_house, y_test_three_bed_house = train_test_sets(three_bed_house_merged)
X_train_four_bed_house, X_test_four_bed_house, y_train_four_bed_house, y_test_four_bed_house = train_test_sets(four_bed_house_merged)
X_train_all_properties, X_test_all_properties, y_train_all_properties, y_test_all_properties = train_test_sets(all_properties_merged)

X_train_one_bed

Unnamed: 0,offence_division,total_offence_count,erp,primary_school_count,secondary_school_count,tertiary_institutions_count,avg_primary_school_rank,avg_secondary_school_rank,has_primary_school,has_secondary_school,...,distance_to_restaurant,distance_to_supermarket,nearest_transport_avg_distance,distance_to_cbd,median_bath,median_parkings,furnished_count,unfurnished_count,pets_allowed,pets_not_allowed
0,1.0,107.0,8770.0,2,0,0,964.5,0.0,1,0,...,1.141601,1.083238,1.110000,5.55,1,0,2,10,4,8
1,1.0,107.0,8770.0,2,0,0,964.5,0.0,1,0,...,1.141601,1.083238,1.110000,5.55,1,0,2,10,4,8
2,1.0,107.0,8770.0,2,0,0,964.5,0.0,1,0,...,1.141601,1.083238,1.110000,5.55,1,0,2,10,4,8
3,1.0,107.0,8770.0,2,0,0,964.5,0.0,1,0,...,1.141601,1.083238,1.110000,5.55,1,0,2,10,4,8
4,1.0,107.0,8770.0,2,0,0,964.5,0.0,1,0,...,1.141601,1.083238,1.110000,5.55,1,0,2,10,4,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9697,6.0,14.0,15507.0,2,0,0,583.5,0.0,1,0,...,1.601167,1.664871,2.836667,125.11,1,1,0,1,0,1
9698,6.0,14.0,15507.0,2,0,0,583.5,0.0,1,0,...,1.601167,1.664871,2.836667,125.11,1,1,0,1,0,1
9699,6.0,14.0,15507.0,2,0,0,583.5,0.0,1,0,...,1.601167,1.664871,2.836667,125.11,1,1,0,1,0,1
9700,6.0,14.0,15507.0,2,0,0,583.5,0.0,1,0,...,1.601167,1.664871,2.836667,125.11,1,1,0,1,0,1


Check to see all the X columns are the same 

In [1069]:
def compare_columns(train, test):
    """
    Compares columns of the training and testing dataframes.
    
    Parameters:
    - train: DataFrame representing the training set.
    - test: DataFrame representing the testing set.
    
    Returns:
    - A dictionary indicating columns missing in each set.
    """
    comparison_result = {}
    columns_match = train.columns.equals(test.columns)
    
    if not columns_match:
        missing_in_test = set(train.columns) - set(test.columns)
        missing_in_train = set(test.columns) - set(train.columns)
        comparison_result = {
            "Columns missing in test set": list(missing_in_test),
            "Columns missing in train set": list(missing_in_train)
        }
    else:
        comparison_result = "Columns are the same in both train and test sets."
    
    return comparison_result

# List of training and testing DataFrames to compare
dataframes_pairs = {
    "One Bed": (X_train_one_bed, X_test_one_bed),
    "Two Bed": (X_train_two_bed, X_test_two_bed),
    "Three Bed": (X_train_three_bed, X_test_three_bed),
    "Two Bed House": (X_train_two_bed_house, X_test_two_bed_house),
    "Three Bed House": (X_train_three_bed_house, X_test_three_bed_house),
    "Four Bed House": (X_train_four_bed_house, X_test_four_bed_house),
    "All Properties": (X_train_all_properties, X_test_all_properties)
}

# Compare columns for each pair of training and testing sets
comparison_results = {name: compare_columns(train, test) for name, (train, test) in dataframes_pairs.items()}

# Display comparison results
comparison_results


{'One Bed': 'Columns are the same in both train and test sets.',
 'Two Bed': 'Columns are the same in both train and test sets.',
 'Three Bed': 'Columns are the same in both train and test sets.',
 'Two Bed House': 'Columns are the same in both train and test sets.',
 'Three Bed House': 'Columns are the same in both train and test sets.',
 'Four Bed House': 'Columns are the same in both train and test sets.',
 'All Properties': 'Columns are the same in both train and test sets.'}

### Check Null Values

In [1070]:
# List of dataframes to check for missing values
dataframes = {
    'X_train_one_bed': X_train_one_bed,
    'X_test_one_bed': X_test_one_bed,
    'y_train_one_bed': y_train_one_bed,
    'y_test_one_bed': y_test_one_bed,
    
    'X_train_two_bed': X_train_two_bed,
    'X_test_two_bed': X_test_two_bed,
    'y_train_two_bed': y_train_two_bed,
    'y_test_two_bed': y_test_two_bed,
    
    'X_train_three_bed': X_train_three_bed,
    'X_test_three_bed': X_test_three_bed,
    'y_train_three_bed': y_train_three_bed,
    'y_test_three_bed': y_test_three_bed,
    
    'X_train_two_bed_house': X_train_two_bed_house,
    'X_test_two_bed_house': X_test_two_bed_house,
    'y_train_two_bed_house': y_train_two_bed_house,
    'y_test_two_bed_house': y_test_two_bed_house,
    
    'X_train_three_bed_house': X_train_three_bed_house,
    'X_test_three_bed_house': X_test_three_bed_house,
    'y_train_three_bed_house': y_train_three_bed_house,
    'y_test_three_bed_house': y_test_three_bed_house,
    
    'X_train_four_bed_house': X_train_four_bed_house,
    'X_test_four_bed_house': X_test_four_bed_house,
    'y_train_four_bed_house': y_train_four_bed_house,
    'y_test_four_bed_house': y_test_four_bed_house,
    
    'X_train_all_properties': X_train_all_properties,
    'X_test_all_properties': X_test_all_properties,
    'y_train_all_properties': y_train_all_properties,
    'y_test_all_properties': y_test_all_properties,
}

In [1071]:
# Collecting rows with missing values for each dataframe
missing_rows_summary = {}

for name, df in dataframes.items():
    # Identify rows with missing values
    rows_with_missing = df[df.isnull().any(axis=1)]
    if not rows_with_missing.empty:
        missing_rows_summary[name] = rows_with_missing

print(missing_rows_summary)

# Check for missing values in each dataframe
missing_values_summary = {}
for name, df in dataframes.items():
    missing_values = df.isnull().sum()
    columns_with_missing = missing_values[missing_values > 0]
    if not columns_with_missing.empty:
        missing_values_summary[name] = columns_with_missing

# Display the missing values summary
print(missing_values_summary)

{'y_test_one_bed':       dec_median  jun_median  mar_median  sep_median
54           NaN         NaN         NaN         NaN
55           NaN         NaN         NaN         NaN
56           NaN         NaN         NaN         NaN
57           NaN         NaN         NaN         NaN
58           NaN         NaN         NaN         NaN
...          ...         ...         ...         ...
9715         NaN         NaN         NaN         NaN
9716         NaN         NaN         NaN         NaN
9717         NaN         NaN         NaN         NaN
9718         NaN         NaN         NaN         NaN
9719         NaN         NaN         NaN         NaN

[2376 rows x 4 columns], 'y_test_two_bed':        dec_median  jun_median  mar_median  sep_median
54            NaN         NaN         NaN         NaN
55            NaN         NaN         NaN         NaN
56            NaN         NaN         NaN         NaN
57            NaN         NaN         NaN         NaN
58            NaN         NaN  

### Feature Selection

In [1072]:
def pca_feature_selection(X_train, X_test, variance_threshold=0.7):
    """
    The function applies PCA for dimensionality reduction by fitting on the
    training set and transforming both the training and test sets. It keeps 
    either a specified number of components or selects them based on a variance 
    threshold. It returns the reduced training and test sets.
    """

    # Standardise the data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initialise PCA, specifying the variance threshold
    pca_temp = PCA().fit(X_train_scaled)
    cumulative_variance = pca_temp.explained_variance_ratio_.cumsum()
    # Find the number of components to capture the specified variance
    n_components = next(i for i, total_variance in enumerate(cumulative_variance) if total_variance >= variance_threshold) + 1

    pca = PCA(n_components=n_components)

    # Fit PCA on the training set and transform both training and test sets
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    print(f"Number of components selected: {n_components}")
    print(f"Explained variance by selected components: {sum(pca.explained_variance_ratio_):.2f}")

    return X_train_pca, X_test_pca

# Perform feature selection with PCA on the X sets 
X_train_one_bed, X_test_one_bed = pca_feature_selection(X_train_one_bed, X_test_one_bed )
X_train_two_bed, X_test_two_bed = pca_feature_selection(X_train_two_bed, X_test_two_bed)
X_train_three_bed, X_test_three_bed = pca_feature_selection(X_train_three_bed, X_test_three_bed)
X_train_two_bed_house, X_test_two_bed_house = pca_feature_selection(X_train_two_bed_house, X_test_two_bed_house)
X_train_three_bed_house, X_test_three_bed_house = pca_feature_selection(X_train_three_bed_house, X_test_three_bed_house)
X_train_four_bed_house, X_test_four_bed_house = pca_feature_selection(X_train_four_bed_house, X_test_four_bed_house)
X_train_all_properties, X_test_all_properties = pca_feature_selection(X_train_all_properties, X_test_all_properties)

Number of components selected: 2
Explained variance by selected components: 0.96
Number of components selected: 2
Explained variance by selected components: 0.81
Number of components selected: 2
Explained variance by selected components: 0.96
Number of components selected: 2
Explained variance by selected components: 0.96
Number of components selected: 2
Explained variance by selected components: 0.75
Number of components selected: 2
Explained variance by selected components: 0.83
Number of components selected: 2
Explained variance by selected components: 0.78


### Train & Fit Model

In [1075]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Define a list of all training and test sets, including sets for 2025-2027 with no y_test values
datasets = [
    (X_train_one_bed, X_test_one_bed, y_train_one_bed, y_test_one_bed),
    (X_train_two_bed, X_test_two_bed, y_train_two_bed, y_test_two_bed),
    (X_train_three_bed, X_test_three_bed, y_train_three_bed, y_test_three_bed),
    (X_train_two_bed_house, X_test_two_bed_house, y_train_two_bed_house, y_test_two_bed_house),
    (X_train_three_bed_house, X_test_three_bed_house, y_train_three_bed_house, y_test_three_bed_house),
    (X_train_four_bed_house, X_test_four_bed_house, y_train_four_bed_house, y_test_four_bed_house),
    (X_train_all_properties, X_test_all_properties, y_train_all_properties, y_test_all_properties),
]

# Define hyperparameters for Lasso
alpha = 1.0  # Adjust this parameter based on your needs

# Loop through each set and train the Lasso model
for i, (X_train, X_test, y_train, y_test) in enumerate(datasets):
    # Initialize the Lasso model
    lasso = Lasso(alpha=alpha)
    
    # Train the model
    lasso.fit(X_train, y_train)
    
    # Make predictions
    y_pred = lasso.predict(X_test)
    
    # Print the results
    print(f"Dataset {i+1}:")
    
    print(f"Predictions for 2025-2027: {y_pred}\n")


Dataset 1:
Predictions for 2025-2027: [[328.87932938 323.657938   321.03655138 328.59659305]
 [328.87932938 323.657938   321.03655138 328.59659305]
 [328.87932938 323.657938   321.03655138 328.59659305]
 ...
 [313.39225394 308.18721504 305.97285802 313.88934021]
 [313.39225394 308.18721504 305.97285802 313.88934021]
 [313.39225394 308.18721504 305.97285802 313.88934021]]

Dataset 2:
Predictions for 2025-2027: [[415.58430435 408.32233606 404.80747621 417.05636782]
 [415.58430435 408.32233606 404.80747621 417.05636782]
 [415.58430435 408.32233606 404.80747621 417.05636782]
 ...
 [401.92908586 394.54487049 391.19214922 402.11053931]
 [401.92908586 394.54487049 391.19214922 402.11053931]
 [401.92908586 394.54487049 391.19214922 402.11053931]]

Dataset 3:
Predictions for 2025-2027: [[555.64479225 545.81355905 541.60611239 558.8871566 ]
 [555.64479225 545.81355905 541.60611239 558.8871566 ]
 [555.64479225 545.81355905 541.60611239 558.8871566 ]
 ...
 [543.64606343 534.36644158 530.08386343 5