### Import Libraries and Data

In [16]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [17]:
# Rental History Data
one_bed_flat = pd.read_csv('../data/raw/rental_history/one_bed_flat.csv')
two_bed_flat = pd.read_csv('../data/raw/rental_history/two_bed_flat.csv')
three_bed_flat = pd.read_csv('../data/raw/rental_history/three_bed_flat.csv')
two_bed_house = pd.read_csv('../data/raw/rental_history/two_bed_house.csv')
three_bed_house = pd.read_csv('../data/raw/rental_history/three_bed_house.csv')
four_bed_house = pd.read_csv('../data/raw/rental_history/four_bed_house.csv')
all_properties = pd.read_csv('../data/raw/rental_history/all_properties.csv')

# Domain Rental Data
domain_one_bed_flat = pd.read_csv('../data/curated/domain_one_bed_flat_rent.csv')
domain_two_bed_flat = pd.read_csv('../data/curated/domain_two_bed_flat_rent.csv')
domain_three_bed_flat = pd.read_csv('../data/curated/domain_three_bed_flat_rent.csv')
domain_two_bed_house = pd.read_csv('../data/curated/domain_two_bed_house_rent.csv')
domain_three_bed_house = pd.read_csv('../data/curated/domain_three_bed_house_rent.csv')
domain_four_bed_house = pd.read_csv('../data/curated/domain_four_bed_house.csv')
domain_all_properties = pd.read_csv('../data/curated/domain_all_properties_rent.csv')

# Other engineered feature sets 
crimes = pd.read_csv('../data/curated/crimes.csv')
population = pd.read_csv('../data/curated/final_population.csv')
education = pd.read_csv('../data/curated/education_df.csv')
urban_landmarks = pd.read_csv('../data/raw/urban_landmarks_features.csv')
pt_distances = pd.read_csv('../data/curated/suburb_transport_distances.csv')
income = pd.read_csv('../data/curated/income.csv')

### Formatting Rental Dataframes

In [18]:
def clean_domain_df(df):
    """
    This function cleans the domain dataframes by removing
    the 'Unnamed:' column, renaming median_rent to 'sep_median'
    (for a standardised column name as in rental history dfs) and
    also creates a year column and inputs the relevant year that
    the data is from - 2024. 
    """

    # Drop columns that contain 'Unnamed:' in their name
    df = df.loc[:, ~df.columns.str.contains('^Unnamed:')]
    
    # Rename the 'median_rent' column to 'sep_median'
    if 'median_rent' in df.columns:
        df = df.rename(columns={'median_rent': 'sep_median'})
    
    # Add a 'year' column with value 2024 for each row
    df['year'] = 2024

    # Reorder columns to make 'year' the second column
    cols = list(df.columns)
    cols.insert(1, cols.pop(cols.index('year')))
    df = df[cols]
    
    return df

# Apply the clean_domain_df function to all the domain dataframes
domain_one_bed_flat = clean_domain_df(domain_one_bed_flat)
domain_two_bed_flat = clean_domain_df(domain_two_bed_flat)
domain_three_bed_flat = clean_domain_df(domain_three_bed_flat)
domain_two_bed_house = clean_domain_df(domain_two_bed_house)
domain_three_bed_house = clean_domain_df(domain_three_bed_house)
domain_four_bed_house = clean_domain_df(domain_four_bed_house)
domain_all_properties = clean_domain_df(domain_all_properties)


Impute the Sep median price from scraped properties into the rental history dataframes

In [19]:
def impute_sep_2024_rental_data(rental_history_df, domain_df):
    """
    This function retrieves all the median rental prices in 
    September from the domain dataframes and then imputes
    them into the rental history dataframes where the year
    is 2024 and month is September. 
    """

    # Merge rental_history_df with domain_df on 'suburb' to keep all years from rental_history_df
    merged_df = pd.merge(rental_history_df, domain_df[['suburb', 'year', 'sep_median']],
                         on=['suburb'], how='left', suffixes=('', '_domain'))
    
    # Replace sep_median values with domain values only for rows where year == 2024
    condition = (merged_df['year'] == 2024) & merged_df['sep_median_domain'].notna()
    merged_df.loc[condition, 'sep_median'] = merged_df.loc[condition, 'sep_median_domain']
    
    # Drop the domain-specific columns used for imputation
    merged_df.drop(columns=['sep_median_domain', 'year_domain'], inplace=True)
    
    # Drop the sep_median column from the domain DataFrame
    domain_df = domain_df.drop(columns=['year', 'sep_median', 'num_properties'], errors='ignore')
    
    return merged_df, domain_df

# Apply the function to each dataset 
one_bed_flat, domain_one_bed_flat = impute_sep_2024_rental_data(one_bed_flat, domain_one_bed_flat)
two_bed_flat, domain_two_bed_flat = impute_sep_2024_rental_data(two_bed_flat, domain_two_bed_flat)
three_bed_flat, domain_three_bed_flat = impute_sep_2024_rental_data(three_bed_flat, domain_three_bed_flat)
two_bed_house, domain_two_bed_house = impute_sep_2024_rental_data(two_bed_house, domain_two_bed_house)
three_bed_house, domain_three_bed_house = impute_sep_2024_rental_data(three_bed_house, domain_three_bed_house)
four_bed_house, domain_four_bed_house = impute_sep_2024_rental_data(four_bed_house, domain_four_bed_house)
all_properties, domain_all_properties = impute_sep_2024_rental_data(all_properties, domain_all_properties)

domain_one_bed_flat.isnull().sum()

suburb               0
median_bath          0
median_parkings      0
furnished_count      0
unfurnished_count    0
pets_allowed         0
pets_not_allowed     0
dtype: int64

### Combining All Feature Sets

In [20]:
# Merge one_bed_flat
one_bed_flat_merged = pd.merge(one_bed_flat, crimes, on=['suburb', 'year'], how='outer')
one_bed_flat_merged = pd.merge(one_bed_flat_merged, income, on=['suburb', 'year'], how='outer')
one_bed_flat_merged = pd.merge(one_bed_flat_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
one_bed_flat_merged = pd.merge(one_bed_flat_merged, education, on='suburb', how='inner')
one_bed_flat_merged = pd.merge(one_bed_flat_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
one_bed_flat_merged = pd.merge(one_bed_flat_merged, pt_distances, on='suburb', how='inner')
one_bed_flat_merged = pd.merge(one_bed_flat_merged, domain_one_bed_flat, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in one_bed_flat and the population dataframe
one_bed_flat_merged = one_bed_flat_merged[one_bed_flat_merged['suburb'].isin(population['sa2_name'])]


# Merge two_bed_flat
two_bed_flat_merged = pd.merge(two_bed_flat, crimes, on=['suburb', 'year'], how='outer')
two_bed_flat_merged = pd.merge(two_bed_flat_merged, income, on=['suburb', 'year'], how='outer')
two_bed_flat_merged = pd.merge(two_bed_flat_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
two_bed_flat_merged = pd.merge(two_bed_flat_merged, education, on='suburb', how='inner')
two_bed_flat_merged = pd.merge(two_bed_flat_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
two_bed_flat_merged = pd.merge(two_bed_flat_merged, pt_distances, on='suburb', how='inner')
two_bed_flat_merged = pd.merge(two_bed_flat_merged, domain_two_bed_flat, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in two_bed_flat and the population dataframe
two_bed_flat_merged = two_bed_flat_merged[two_bed_flat_merged['suburb'].isin(population['sa2_name'])]


# Merge three_bed_flat
three_bed_flat_merged = pd.merge(three_bed_flat, crimes, on=['suburb', 'year'], how='outer')
three_bed_flat_merged = pd.merge(three_bed_flat_merged, income, on=['suburb', 'year'], how='outer')
three_bed_flat_merged = pd.merge(three_bed_flat_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
three_bed_flat_merged = pd.merge(three_bed_flat_merged, education, on='suburb', how='inner')
three_bed_flat_merged = pd.merge(three_bed_flat_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
three_bed_flat_merged = pd.merge(three_bed_flat_merged, pt_distances, on='suburb', how='inner')
three_bed_flat_merged = pd.merge(three_bed_flat_merged, domain_three_bed_flat, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in three_bed_flat and the population dataframe
three_bed_flat_merged = three_bed_flat_merged[three_bed_flat_merged['suburb'].isin(population['sa2_name'])]


# Merge two_bed_house
two_bed_house_merged = pd.merge(two_bed_house, crimes, on=['suburb', 'year'], how='outer')
two_bed_house_merged = pd.merge(two_bed_house_merged, income, on=['suburb', 'year'], how='outer')
two_bed_house_merged = pd.merge(two_bed_house_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
two_bed_house_merged = pd.merge(two_bed_house_merged, education, on='suburb', how='inner')
two_bed_house_merged = pd.merge(two_bed_house_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
two_bed_house_merged = pd.merge(two_bed_house_merged, pt_distances, on='suburb', how='inner')
two_bed_house_merged = pd.merge(two_bed_house_merged, domain_two_bed_house, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in two_bed_house and the population dataframe
two_bed_house_merged = two_bed_house_merged[two_bed_house_merged['suburb'].isin(population['sa2_name'])]


# Merge three_bed_house
three_bed_house_merged = pd.merge(three_bed_house, crimes, on=['suburb', 'year'], how='outer')
three_bed_house_merged = pd.merge(three_bed_house_merged, income, on=['suburb', 'year'], how='outer')
three_bed_house_merged = pd.merge(three_bed_house_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
three_bed_house_merged = pd.merge(three_bed_house_merged, education, on='suburb', how='inner')
three_bed_house_merged = pd.merge(three_bed_house_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
three_bed_house_merged = pd.merge(three_bed_house_merged, pt_distances, on='suburb', how='inner')
three_bed_house_merged = pd.merge(three_bed_house_merged, domain_three_bed_house, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in three_bed_house and the population dataframe
three_bed_house_merged = three_bed_house_merged[three_bed_house_merged['suburb'].isin(population['sa2_name'])]


# Merge four_bed_house
four_bed_house_merged = pd.merge(four_bed_house, crimes, on=['suburb', 'year'], how='outer')
four_bed_house_merged = pd.merge(four_bed_house_merged, income, on=['suburb', 'year'], how='outer')
four_bed_house_merged = pd.merge(four_bed_house_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
four_bed_house_merged = pd.merge(four_bed_house_merged, education, on='suburb', how='inner')
four_bed_house_merged = pd.merge(four_bed_house_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
four_bed_house_merged = pd.merge(four_bed_house_merged, pt_distances, on='suburb', how='inner')
four_bed_house_merged = pd.merge(four_bed_house_merged, domain_four_bed_house, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in four_bed_house and the population dataframe
four_bed_house_merged = four_bed_house_merged[four_bed_house_merged['suburb'].isin(population['sa2_name'])]


# Merge all_properties
all_properties_merged = pd.merge(all_properties, crimes, on=['suburb', 'year'], how='outer')
all_properties_merged = pd.merge(all_properties_merged, income, on=['suburb', 'year'], how='outer')
all_properties_merged = pd.merge(all_properties_merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')
all_properties_merged = pd.merge(all_properties_merged, education, on='suburb', how='inner')
all_properties_merged = pd.merge(all_properties_merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
all_properties_merged = pd.merge(all_properties_merged, pt_distances, on='suburb', how='inner')
all_properties_merged = pd.merge(all_properties_merged, domain_all_properties, on='suburb', how='inner')

# Filter rows to retain only matching suburbs in all_properties and the population dataframe
all_properties_merged = all_properties_merged[all_properties_merged['suburb'].isin(population['sa2_name'])]


#----- Manually Check Null
missing_values = one_bed_flat_merged.isnull().sum()
columns_with_missing_values = missing_values[missing_values > 0]

columns_with_missing_values

dec_median    2442
jun_median    2442
mar_median    2442
sep_median    2442
dtype: int64

In [21]:
# Drop all other suburb column names. Only keep the first suburb column 
def clean_suburb_cols(df):
    """
    This function removes all duplicated of the suburb column name 
    from the merged dataframes. The duplicate suburb column name 
    could be 'Unnamed', 'sa2_name' or 'gazetted_locality'.
    """
    df = df.loc[:, ~df.columns.str.contains('Unnamed')]  # removes the duplicate 'suburb' column
    columns_to_drop = ['sa2_name', 'gazetted_locality']
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

    return df

# Clean the column names
one_bed_flat_merged = clean_suburb_cols(one_bed_flat_merged)
two_bed_flat_merged = clean_suburb_cols(two_bed_flat_merged)
three_bed_flat_merged = clean_suburb_cols(three_bed_flat_merged)
two_bed_house_merged = clean_suburb_cols(two_bed_house_merged)
three_bed_house_merged = clean_suburb_cols(three_bed_house_merged)
four_bed_house_merged = clean_suburb_cols(four_bed_house_merged)
all_properties_merged = clean_suburb_cols(all_properties_merged)

#----- Manually Check Null
missing_values = one_bed_flat_merged.isnull().sum()
columns_with_missing_values = missing_values[missing_values > 0]

columns_with_missing_values

dec_median    2442
jun_median    2442
mar_median    2442
sep_median    2442
dtype: int64

### Save All Properties Dataframe for Visualisation Purposes

In [22]:
# Save df to a CSV file
all_properties_merged.to_csv('../data/curated/all_properties_merged.csv', index=False)

### Create Train & Test Sets

In [41]:
def train_val_test_sets(df):
    """
    This function first splits the dataframe into training, validation, 
    and testing sets based on the 'year' column:
    - Training set includes data from the years 2016-2021.
    - Validation set includes data from the years 2022-2024.
    - Testing set includes data from the years 2025-2027.

    It then merges additional columns that are not part of the 
    features specific to years or target columns back with the 
    respective sets based on matching suburbs.

    The function then returns the follow dataframes:
    - X_train: Training feature set.
    - X_val: Validation feature set.
    - X_test: Testing feature set.
    - y_train: Training target set.
    - y_val: Validation target set.
    - y_test: Testing target set.
    """
    # Define the year ranges for training, validation, and testing sets
    train_years = range(2016, 2022)
    val_years = range(2022, 2025)
    test_years = range(2025, 2028)

    # Columns to include in X (specific to the years) and y splits
    feature_year_cols = ['suburb', 'year', 'offence_division', 'total_offence_count', 'erp']
    target_columns = ['dec_median', 'jun_median', 'mar_median', 'sep_median']

    # Split featurex (specific to the years) and target data
    X = df[feature_year_cols]
    y = df[target_columns]

    # Split the dataframe into training, validation, and testing sets based on the year
    X_train = X[X['year'].isin(train_years)]
    X_val = X[X['year'].isin(val_years)]
    X_test = X[X['year'].isin(test_years)]

    # Align target sets with the corresponding feature sets
    y_train = y.loc[X_train.index]
    y_val = y.loc[X_val.index]
    y_test = y.loc[X_test.index]

    # Extract other columns not in feature_year_cols or target_columns, including 'suburb'
    other_columns = df.drop(columns=feature_year_cols + target_columns).columns
    other_data = df[other_columns].copy()
    other_data['suburb'] = df['suburb']  # Ensure 'suburb' is included

    # Merge the 'other' data back with the matching suburbs, irrespective of the year
    X_train = X_train.merge(other_data, on='suburb', how='left')
    X_val = X_val.merge(other_data, on='suburb', how='left')
    X_test = X_test.merge(other_data, on='suburb', how='left')

    # Drop 'suburb' and 'year' from the feature sets
    X_train = X_train.drop(columns=['suburb', 'year'])
    X_val = X_val.drop(columns=['suburb', 'year'])
    X_test = X_test.drop(columns=['suburb', 'year'])

    # Remove rows with NaN values in y_train and y_val
    valid_train_indices = y_train.dropna().index
    X_train = X_train.loc[valid_train_indices]
    y_train = y_train.loc[valid_train_indices]

    valid_val_indices = y_val.dropna().index
    X_val = X_val.loc[valid_val_indices]
    y_val = y_val.loc[valid_val_indices]

    return X_train, X_val, X_test, y_train, y_val, y_test

# Create training, validation, and test sets for each property type
X_train_one_bed, X_val_one_bed, X_test_one_bed, y_train_one_bed, y_val_one_bed, y_test_one_bed = train_val_test_sets(one_bed_flat_merged)
X_train_two_bed, X_val_two_bed, X_test_two_bed, y_train_two_bed, y_val_two_bed, y_test_two_bed = train_val_test_sets(two_bed_flat_merged)
X_train_three_bed, X_val_three_bed, X_test_three_bed, y_train_three_bed, y_val_three_bed, y_test_three_bed = train_val_test_sets(three_bed_flat_merged)
X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house, y_train_two_bed_house, y_val_two_bed_house, y_test_two_bed_house = train_val_test_sets(two_bed_house_merged)
X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house, y_train_three_bed_house, y_val_three_bed_house, y_test_three_bed_house = train_val_test_sets(three_bed_house_merged)
X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house, y_train_four_bed_house, y_val_four_bed_house, y_test_four_bed_house = train_val_test_sets(four_bed_house_merged)
X_train_all_properties, X_val_all_properties, X_test_all_properties, y_train_all_properties, y_val_all_properties, y_test_all_properties = train_val_test_sets(all_properties_merged)

X_train_all_properties

Unnamed: 0,suburb,offence_division,total_offence_count,erp,hi_1_149_tot,hi_150_299_tot,hi_300_399_tot,hi_400_499_tot,hi_500_649_tot,hi_650_799_tot,...,distance_to_restaurant,distance_to_supermarket,nearest_transport_avg_distance,distance_to_cbd,median_bath,median_parkings,furnished_count,unfurnished_count,pets_allowed,pets_not_allowed
0,abbotsford,1.0,107.0,8770.0,27.0,72.0,68.0,114.0,95.0,120.0,...,1.141601,1.083238,1.11,5.55,1,1,5,54,11,48
1,abbotsford,1.0,107.0,8770.0,27.0,72.0,68.0,114.0,95.0,120.0,...,1.141601,1.083238,1.11,5.55,1,1,5,54,11,48
2,abbotsford,1.0,107.0,8770.0,27.0,72.0,68.0,114.0,95.0,120.0,...,1.141601,1.083238,1.11,5.55,1,1,5,54,11,48
3,abbotsford,1.0,107.0,8770.0,27.0,72.0,68.0,114.0,95.0,120.0,...,1.141601,1.083238,1.11,5.55,1,1,5,54,11,48
4,abbotsford,1.0,107.0,8770.0,27.0,72.0,68.0,114.0,95.0,120.0,...,1.141601,1.083238,1.11,5.55,1,1,5,54,11,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14431,armadale,5.0,37.0,10044.0,42.0,50.0,74.0,148.0,153.0,144.0,...,0.722461,0.911558,0.31,8.17,1,1,1,45,2,44
14432,armadale,5.0,37.0,10044.0,42.0,50.0,74.0,148.0,153.0,144.0,...,0.722461,0.911558,0.31,8.17,1,1,1,45,2,44
14433,armadale,5.0,37.0,10044.0,42.0,50.0,74.0,148.0,153.0,144.0,...,0.722461,0.911558,0.31,8.17,1,1,1,45,2,44
14434,armadale,5.0,37.0,10044.0,42.0,50.0,74.0,148.0,153.0,144.0,...,0.722461,0.911558,0.31,8.17,1,1,1,45,2,44


Check to see all the X columns are the same 

In [24]:
def compare_feature_columns(train, val, test):
    """
    This function compares columns of the training, validation, and 
    testing feature dataframes (X). A dictionary is then returned
    indicating if any columns are missing in each set or if all
    the colums are the same. 
    """
    comparison_result = {}
    # Check if columns match between train, validation, and test sets
    train_val_match = train.columns.equals(val.columns)
    train_test_match = train.columns.equals(test.columns)
    val_test_match = val.columns.equals(test.columns)
    
    if not (train_val_match and train_test_match and val_test_match):
        missing_in_val = set(train.columns) - set(val.columns)
        missing_in_train_val = set(val.columns) - set(train.columns)
        missing_in_test = set(train.columns) - set(test.columns)
        missing_in_train_test = set(test.columns) - set(train.columns)
        missing_in_val_test = set(val.columns) - set(test.columns)
        missing_in_test_val = set(test.columns) - set(val.columns)

        comparison_result = {
            "Columns missing in validation set compared to train": list(missing_in_val),
            "Columns missing in train set compared to validation": list(missing_in_train_val),
            "Columns missing in test set compared to train": list(missing_in_test),
            "Columns missing in train set compared to test": list(missing_in_train_test),
            "Columns missing in test set compared to validation": list(missing_in_val_test),
            "Columns missing in validation set compared to test": list(missing_in_test_val),
        }
    else:
        comparison_result = "Columns are the same in all three sets (train, validation, test)."

    return comparison_result

# List of training, validation, and testing DataFrames to compare
feature_dfs = {
    "One Bed": (X_train_one_bed, X_val_one_bed, X_test_one_bed),
    "Two Bed": (X_train_two_bed, X_val_two_bed, X_test_two_bed),
    "Three Bed": (X_train_three_bed, X_val_three_bed, X_test_three_bed),
    "Two Bed House": (X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house),
    "Three Bed House": (X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house),
    "Four Bed House": (X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house),
    "All Properties": (X_train_all_properties, X_val_all_properties, X_test_all_properties)
}

# Compare columns for each triplet of training, validation, and testing sets
comparison_results = {name: compare_feature_columns(train, val, test) for name, (train, val, test) in feature_dfs.items()}

comparison_results 

{'One Bed': 'Columns are the same in all three sets (train, validation, test).',
 'Two Bed': 'Columns are the same in all three sets (train, validation, test).',
 'Three Bed': 'Columns are the same in all three sets (train, validation, test).',
 'Two Bed House': 'Columns are the same in all three sets (train, validation, test).',
 'Three Bed House': 'Columns are the same in all three sets (train, validation, test).',
 'Four Bed House': 'Columns are the same in all three sets (train, validation, test).',
 'All Properties': 'Columns are the same in all three sets (train, validation, test).'}

### Check Null Values

In [25]:
# List of dataframes to check for missing values
dataframes = {
    'X_train_one_bed': X_train_one_bed,
    'X_val_one_bed': X_val_one_bed,
    'X_test_one_bed': X_test_one_bed,
    'y_train_one_bed': y_train_one_bed,
    'y_val_one_bed': y_val_one_bed,
    'y_test_one_bed': y_test_one_bed,
    
    'X_train_two_bed': X_train_two_bed,
    'X_val_two_bed': X_val_two_bed,
    'X_test_two_bed': X_test_two_bed,
    'y_train_two_bed': y_train_two_bed,
    'y_val_two_bed': y_val_two_bed,
    'y_test_two_bed': y_test_two_bed,
    
    'X_train_three_bed': X_train_three_bed,
    'X_val_three_bed': X_val_three_bed,
    'X_test_three_bed': X_test_three_bed,
    'y_train_three_bed': y_train_three_bed,
    'y_val_three_bed': y_val_three_bed,
    'y_test_three_bed': y_test_three_bed,
    
    'X_train_two_bed_house': X_train_two_bed_house,
    'X_val_two_bed_house': X_val_two_bed_house,
    'X_test_two_bed_house': X_test_two_bed_house,
    'y_train_two_bed_house': y_train_two_bed_house,
    'y_val_two_bed_house': y_val_two_bed_house,
    'y_test_two_bed_house': y_test_two_bed_house,
    
    'X_train_three_bed_house': X_train_three_bed_house,
    'X_val_three_bed_house': X_val_three_bed_house,
    'X_test_three_bed_house': X_test_three_bed_house,
    'y_train_three_bed_house': y_train_three_bed_house,
    'y_val_three_bed_house': y_val_three_bed_house,
    'y_test_three_bed_house': y_test_three_bed_house,
    
    'X_train_four_bed_house': X_train_four_bed_house,
    'X_val_four_bed_house': X_val_four_bed_house,
    'X_test_four_bed_house': X_test_four_bed_house,
    'y_train_four_bed_house': y_train_four_bed_house,
    'y_val_four_bed_house': y_val_four_bed_house,
    'y_test_four_bed_house': y_test_four_bed_house,
    
    'X_train_all_properties': X_train_all_properties,
    'X_val_all_properties': X_val_all_properties,
    'X_test_all_properties': X_test_all_properties,
    'y_train_all_properties': y_train_all_properties,
    'y_val_all_properties': y_val_all_properties,
    'y_test_all_properties': y_test_all_properties,
}

In [26]:
# Collecting rows with missing values for each dataframe
missing_rows_summary = {}

for name, df in dataframes.items():
    rows_with_missing = df[df.isnull().any(axis=1)]
    if not rows_with_missing.empty:
        missing_rows_summary[name] = rows_with_missing

print(missing_rows_summary)

# Check for missing values in each dataframe by columns 
missing_values_summary = {}
for name, df in dataframes.items():
    missing_values = df.isnull().sum()
    columns_with_missing = missing_values[missing_values > 0]
    if not columns_with_missing.empty:
        missing_values_summary[name] = columns_with_missing

print(missing_values_summary)

{'y_test_one_bed':       dec_median  jun_median  mar_median  sep_median
54           NaN         NaN         NaN         NaN
55           NaN         NaN         NaN         NaN
56           NaN         NaN         NaN         NaN
57           NaN         NaN         NaN         NaN
58           NaN         NaN         NaN         NaN
...          ...         ...         ...         ...
9715         NaN         NaN         NaN         NaN
9716         NaN         NaN         NaN         NaN
9717         NaN         NaN         NaN         NaN
9718         NaN         NaN         NaN         NaN
9719         NaN         NaN         NaN         NaN

[2376 rows x 4 columns], 'y_test_two_bed':        dec_median  jun_median  mar_median  sep_median
54            NaN         NaN         NaN         NaN
55            NaN         NaN         NaN         NaN
56            NaN         NaN         NaN         NaN
57            NaN         NaN         NaN         NaN
58            NaN         NaN  

### Feature Selection

In [32]:
def pca_feature_selection(X_train, X_val, X_test, variance_threshold=0.9):
    """
    The function applies PCA for dimensionality reduction by fitting on the
    training set and transforming both the training and test sets. It keeps 
    a select number of components based on the defined variance 
    threshold. It returns the reduced training and test sets.
    """

    # Standardise the data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Initialise PCA, specifying the variance threshold
    pca_temp = PCA().fit(X_train_scaled)
    cumulative_variance = pca_temp.explained_variance_ratio_.cumsum()
    # Find the number of components to capture the specified variance
    n_components = next(i for i, total_variance in enumerate(cumulative_variance) if total_variance >= variance_threshold) + 1

    pca = PCA(n_components=n_components)

    # Fit PCA on the training set and transform both training and test sets
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_val_pca = pca.transform(X_val_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    print(f"Number of components selected: {n_components}")
    print(f"Explained variance by selected components: {sum(pca.explained_variance_ratio_):.2f}")

    return X_train_pca, X_val_pca, X_test_pca

# Perform feature selection with PCA on the X sets 
X_train_one_bed, X_val_one_bed, X_test_one_bed = pca_feature_selection(X_train_one_bed, X_val_one_bed, X_test_one_bed)
X_train_two_bed, X_val_two_bed, X_test_two_bed = pca_feature_selection(X_train_two_bed, X_val_two_bed, X_test_two_bed)
X_train_three_bed, X_val_three_bed, X_test_three_bed = pca_feature_selection(X_train_three_bed, X_val_three_bed, X_test_three_bed)
X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house = pca_feature_selection(X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house)
X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house = pca_feature_selection(X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house)
X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house = pca_feature_selection(X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house)
X_train_all_properties, X_val_all_properties, X_test_all_properties = pca_feature_selection(X_train_all_properties, X_val_all_properties, X_test_all_properties)

Number of components selected: 2
Explained variance by selected components: 1.00
Number of components selected: 3
Explained variance by selected components: 1.00
Number of components selected: 2
Explained variance by selected components: 1.00
Number of components selected: 3
Explained variance by selected components: 1.00
Number of components selected: 4
Explained variance by selected components: 1.00
Number of components selected: 3
Explained variance by selected components: 1.00
Number of components selected: 3
Explained variance by selected components: 1.00


### Train & Fit Model

In [33]:
# Define a list of all training, validation, and test sets
ML_dfs = [
    (X_train_one_bed, X_val_one_bed, X_test_one_bed, y_train_one_bed, y_val_one_bed, y_test_one_bed),
    (X_train_two_bed, X_val_two_bed, X_test_two_bed, y_train_two_bed, y_val_two_bed, y_test_two_bed),
    (X_train_three_bed, X_val_three_bed, X_test_three_bed, y_train_three_bed, y_val_three_bed, y_test_three_bed),
    (X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house, y_train_two_bed_house, y_val_two_bed_house, y_test_two_bed_house),
    (X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house, y_train_three_bed_house, y_val_three_bed_house, y_test_three_bed_house),
    (X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house, y_train_four_bed_house, y_val_four_bed_house, y_test_four_bed_house),
    (X_train_all_properties, X_val_all_properties, X_test_all_properties, y_train_all_properties, y_val_all_properties, y_test_all_properties),
]

Lasso Regression

In [34]:
# Define hyperparameters for Lasso
alpha_range = np.logspace(-6, 2, 20) 
param_grid = {'alpha': alpha_range}

# Loop through each set, tune the model on the validation set, and predict on the test set
for i, (X_train, X_val, X_test, y_train, y_val, y_test) in enumerate(ML_dfs):
    # Initialise the Lasso model
    lasso = Lasso()
    
    # Set up the GridSearchCV to tune the 'alpha' hyperparameter
    grid_search = GridSearchCV(lasso, param_grid, scoring='neg_mean_squared_error', cv=5)
    
    # Train the model using the training set and validate on the validation set
    grid_search.fit(X_train, y_train)
    
    # Select the best model based on the validation set
    best_lasso = grid_search.best_estimator_
    
    # Evaluate on the validation set
    y_val_pred = best_lasso.predict(X_val)
    val_mse = mean_squared_error(y_val, y_val_pred)
    val_r2 = r2_score(y_val, y_val_pred)
    
    # Make predictions on the test set
    y_test_pred = best_lasso.predict(X_test)
    
    # Print the results
    print(f"Dataset {i+1}:")
    print(f"Best alpha: {best_lasso.alpha}")
    print(f"Validation MSE: {val_mse:.4f}, R^2: {val_r2:.4f}")
    print(f"Predictions for 2025-2027: {y_test_pred}\n")


Dataset 1:
Best alpha: 0.7847599703514607
Validation MSE: 8126.1469, R^2: -0.4729
Predictions for 2025-2027: [[312.04767107 310.47028019 308.96041024 311.19624436]
 [312.04767107 310.47028019 308.96041024 311.19624436]
 [312.04767107 310.47028019 308.96041024 311.19624436]
 ...
 [300.39550945 298.99939746 298.16749706 299.52438767]
 [300.39550945 298.99939746 298.16749706 299.52438767]
 [300.39550945 298.99939746 298.16749706 299.52438767]]

Dataset 2:
Best alpha: 5.455594781168514
Validation MSE: 13186.7588, R^2: -0.5464
Predictions for 2025-2027: [[380.74154534 377.93885804 376.57314837 379.4845291 ]
 [380.74154534 377.93885804 376.57314837 379.4845291 ]
 [380.74154534 377.93885804 376.57314837 379.4845291 ]
 ...
 [379.64435191 376.98671195 375.63985858 378.37023335]
 [379.64435191 376.98671195 375.63985858 378.37023335]
 [379.64435191 376.98671195 375.63985858 378.37023335]]

Dataset 3:
Best alpha: 37.92690190732246
Validation MSE: 38661.7270, R^2: -0.2995
Predictions for 2025-2027:

Random Forest

In [30]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import mean_squared_error, r2_score
# import numpy as np

# # Define the hyperparameters for Random Forest
# param_grid = {
#     'n_estimators': [50, 100, 200],  # Number of trees
#     'max_depth': [5, 10, 15, None],  # Maximum depth of each tree
# }

# # Loop through each set, tune the model on the validation set, and predict on the test set
# for i, (X_train, X_val, X_test, y_train, y_val, y_test) in enumerate(ML_dfs):
#     # Initialise the Random Forest model
#     rf = RandomForestRegressor(random_state=42)
    
#     # Set up the GridSearchCV to tune 'n_estimators' and 'max_depth'
#     grid_search = GridSearchCV(rf, param_grid, scoring='neg_mean_squared_error', cv=5)
    
#     # Train the model using the training set and validate on the validation set
#     grid_search.fit(X_train, y_train)
    
#     # Select the best model based on the validation set
#     best_rf = grid_search.best_estimator_
    
#     # Evaluate on the validation set
#     y_val_pred = best_rf.predict(X_val)
#     val_mse = mean_squared_error(y_val, y_val_pred)
#     val_r2 = r2_score(y_val, y_val_pred)
    
#     # Make predictions on the test set
#     y_test_pred = best_rf.predict(X_test)
    
#     # Print the results
#     print(f"Dataset {i+1}:")
#     print(f"Best n_estimators: {best_rf.n_estimators}, Best max_depth: {best_rf.max_depth}")
#     print(f"Validation MSE: {val_mse:.4f}, R^2: {val_r2:.4f}")
#     print(f"Predictions for 2025-2027: {y_test_pred}\n")


Dataset 1:
Best n_estimators: 50, Best max_depth: 5
Validation MSE: 15541.6637, R^2: -1.8621
Predictions for 2025-2027: [[306.67878984 301.27463767 299.11212296 303.12977877]
 [306.67878984 301.27463767 299.11212296 303.12977877]
 [306.67878984 301.27463767 299.11212296 303.12977877]
 ...
 [207.5489577  206.71093996 199.83019282 206.8679707 ]
 [207.5489577  206.71093996 199.83019282 206.8679707 ]
 [207.5489577  206.71093996 199.83019282 206.8679707 ]]

Dataset 2:
Best n_estimators: 200, Best max_depth: 5
Validation MSE: 15918.4452, R^2: -0.8685
Predictions for 2025-2027: [[339.8062063  335.2454895  332.79726584 337.46072295]
 [339.8062063  335.2454895  332.79726584 337.46072295]
 [339.8062063  335.2454895  332.79726584 337.46072295]
 ...
 [356.92532146 350.10240446 348.54769827 353.40528999]
 [356.92532146 350.10240446 348.54769827 353.40528999]
 [356.92532146 350.10240446 348.54769827 353.40528999]]

Dataset 3:
Best n_estimators: 200, Best max_depth: 5
Validation MSE: 40027.1386, R^2:

KeyboardInterrupt: 

Arima Modelling

In [39]:
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, r2_score

# Assuming 'y_train', 'y_val', and 'y_test' are DataFrames with multiple target columns
target_columns = ['dec_median', 'jun_median', 'mar_median', 'sep_median']  # Adjust according to your target columns

# Loop through each dataset
for i, (X_train, X_val, X_test, y_train, y_val, y_test) in enumerate(ML_dfs):
    print(f"Dataset {i+1} results:")
    
    # Apply ARIMA for each target variable
    for target in target_columns:
        print(f"Fitting ARIMA for target: {target}...")

        # Extract the target series for ARIMA
        y_train_series = y_train[target]
        y_val_series = y_val[target]
        y_test_series = y_test[target]
        
        # Define the ARIMA order (p, d, q)
        p = 5  # AR term
        d = 1  # Differencing term
        q = 0  # MA term
        
        # Fit ARIMA model to the training data
        model = ARIMA(y_train_series, order=(p, d, q))
        model_fit = model.fit()
        
        # Predict on the validation set
        y_val_pred = model_fit.forecast(steps=len(y_val_series))
        
        # Calculate validation MSE and R^2
        val_mse = mean_squared_error(y_val_series, y_val_pred)
        val_r2 = r2_score(y_val_series, y_val_pred)
        
        # Predict on the test set
        y_test_pred = model_fit.forecast(steps=len(y_test_series))
        
        # Print the results
        print(f"Target: {target}")
        print(f"Validation MSE: {val_mse:.4f}, Validation R^2: {val_r2:.4f}")
        print(f"Test predictions for 2025-2027: {y_test_pred}\n")


Dataset 1 results:
Fitting ARIMA for target: dec_median...


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Target: dec_median
Validation MSE: 11924.0906, Validation R^2: -1.3932
Test predictions for 2025-2027: 4710    270.0
4711    270.0
4712    270.0
4713    270.0
4714    270.0
        ...  
7081    270.0
7082    270.0
7083    270.0
7084    270.0
7085    270.0
Name: predicted_mean, Length: 2376, dtype: float64

Fitting ARIMA for target: jun_median...
Target: jun_median
Validation MSE: 8514.9311, Validation R^2: -0.8206
Test predictions for 2025-2027: 4710    278.0
4711    278.0
4712    278.0
4713    278.0
4714    278.0
        ...  
7081    278.0
7082    278.0
7083    278.0
7084    278.0
7085    278.0
Name: predicted_mean, Length: 2376, dtype: float64

Fitting ARIMA for target: mar_median...


  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(


Target: mar_median
Validation MSE: 6585.3367, Validation R^2: -0.4327
Test predictions for 2025-2027: 4710    290.0
4711    290.0
4712    290.0
4713    290.0
4714    290.0
        ...  
7081    290.0
7082    290.0
7083    290.0
7084    290.0
7085    290.0
Name: predicted_mean, Length: 2376, dtype: float64

Fitting ARIMA for target: sep_median...
Target: sep_median
Validation MSE: 15270.8297, Validation R^2: -0.9631
Test predictions for 2025-2027: 4710    270.0
4711    270.0
4712    270.0
4713    270.0
4714    270.0
        ...  
7081    270.0
7082    270.0
7083    270.0
7084    270.0
7085    270.0
Name: predicted_mean, Length: 2376, dtype: float64

Dataset 2 results:
Fitting ARIMA for target: dec_median...


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Target: dec_median
Validation MSE: 13019.1806, Validation R^2: -0.5289
Test predictions for 2025-2027: 6048    385.0
6049    385.0
6050    385.0
6051    385.0
6052    385.0
        ...  
9067    385.0
9068    385.0
9069    385.0
9070    385.0
9071    385.0
Name: predicted_mean, Length: 3024, dtype: float64

Fitting ARIMA for target: jun_median...


  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Target: jun_median
Validation MSE: 10965.9206, Validation R^2: -0.3980
Test predictions for 2025-2027: 6048    380.0
6049    380.0
6050    380.0
6051    380.0
6052    380.0
        ...  
9067    380.0
9068    380.0
9069    380.0
9070    380.0
9071    380.0
Name: predicted_mean, Length: 3024, dtype: float64

Fitting ARIMA for target: mar_median...


  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Target: mar_median
Validation MSE: 9613.7837, Validation R^2: -0.2451
Test predictions for 2025-2027: 6048    385.0
6049    385.0
6050    385.0
6051    385.0
6052    385.0
        ...  
9067    385.0
9068    385.0
9069    385.0
9070    385.0
9071    385.0
Name: predicted_mean, Length: 3024, dtype: float64

Fitting ARIMA for target: sep_median...


  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Target: sep_median
Validation MSE: 14825.6285, Validation R^2: -0.5062
Test predictions for 2025-2027: 6048    385.0
6049    385.0
6050    385.0
6051    385.0
6052    385.0
        ...  
9067    385.0
9068    385.0
9069    385.0
9070    385.0
9071    385.0
Name: predicted_mean, Length: 3024, dtype: float64

Dataset 3 results:
Fitting ARIMA for target: dec_median...
Target: dec_median
Validation MSE: 41279.0921, Validation R^2: -0.5562
Test predictions for 2025-2027: 4416    730.0
4417    730.0
4418    730.0
4419    730.0
4420    730.0
        ...  
6643    730.0
6644    730.0
6645    730.0
6646    730.0
6647    730.0
Name: predicted_mean, Length: 2232, dtype: float64

Fitting ARIMA for target: jun_median...


  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Target: jun_median
Validation MSE: 37539.0678, Validation R^2: -0.4914
Test predictions for 2025-2027: 4416    700.0
4417    700.0
4418    700.0
4419    700.0
4420    700.0
        ...  
6643    700.0
6644    700.0
6645    700.0
6646    700.0
6647    700.0
Name: predicted_mean, Length: 2232, dtype: float64

Fitting ARIMA for target: mar_median...
Target: mar_median
Validation MSE: 29547.7696, Validation R^2: -0.1925
Test predictions for 2025-2027: 4416    650.0
4417    650.0
4418    650.0
4419    650.0
4420    650.0
        ...  
6643    650.0
6644    650.0
6645    650.0
6646    650.0
6647    650.0
Name: predicted_mean, Length: 2232, dtype: float64

Fitting ARIMA for target: sep_median...


  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Target: sep_median
Validation MSE: 53528.5332, Validation R^2: -0.2675
Test predictions for 2025-2027: 4416    730.0
4417    730.0
4418    730.0
4419    730.0
4420    730.0
        ...  
6643    730.0
6644    730.0
6645    730.0
6646    730.0
6647    730.0
Name: predicted_mean, Length: 2232, dtype: float64

Dataset 4 results:
Fitting ARIMA for target: dec_median...
Target: dec_median
Validation MSE: 13090.2198, Validation R^2: -0.0237
Test predictions for 2025-2027: 5586    500.0
5587    500.0
5588    500.0
5589    500.0
5590    500.0
        ...  
8443    500.0
8444    500.0
8445    500.0
8446    500.0
8447    500.0
Name: predicted_mean, Length: 2862, dtype: float64

Fitting ARIMA for target: jun_median...


  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Target: jun_median
Validation MSE: 12960.4310, Validation R^2: -0.0462
Test predictions for 2025-2027: 5586    495.0
5587    495.0
5588    495.0
5589    495.0
5590    495.0
        ...  
8443    495.0
8444    495.0
8445    495.0
8446    495.0
8447    495.0
Name: predicted_mean, Length: 2862, dtype: float64

Fitting ARIMA for target: mar_median...
Target: mar_median
Validation MSE: 13093.4224, Validation R^2: -0.0722
Test predictions for 2025-2027: 5586    495.0
5587    495.0
5588    495.0
5589    495.0
5590    495.0
        ...  
8443    495.0
8444    495.0
8445    495.0
8446    495.0
8447    495.0
Name: predicted_mean, Length: 2862, dtype: float64

Fitting ARIMA for target: sep_median...


  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Target: sep_median
Validation MSE: 17430.3394, Validation R^2: -0.0022
Test predictions for 2025-2027: 5586    500.0
5587    500.0
5588    500.0
5589    500.0
5590    500.0
        ...  
8443    500.0
8444    500.0
8445    500.0
8446    500.0
8447    500.0
Name: predicted_mean, Length: 2862, dtype: float64

Dataset 5 results:
Fitting ARIMA for target: dec_median...


  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(


Target: dec_median
Validation MSE: 30735.6154, Validation R^2: -0.0003
Test predictions for 2025-2027: 7014     575.0
7015     575.0
7016     575.0
7017     575.0
7018     575.0
         ...  
10537    575.0
10538    575.0
10539    575.0
10540    575.0
10541    575.0
Name: predicted_mean, Length: 3528, dtype: float64

Fitting ARIMA for target: jun_median...
Target: jun_median
Validation MSE: 29628.7162, Validation R^2: -0.0018
Test predictions for 2025-2027: 7014     570.0
7015     570.0
7016     570.0
7017     570.0
7018     570.0
         ...  
10537    570.0
10538    570.0
10539    570.0
10540    570.0
10541    570.0
Name: predicted_mean, Length: 3528, dtype: float64

Fitting ARIMA for target: mar_median...


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Target: mar_median
Validation MSE: 29221.1949, Validation R^2: -0.0005
Test predictions for 2025-2027: 7014     560.0
7015     560.0
7016     560.0
7017     560.0
7018     560.0
         ...  
10537    560.0
10538    560.0
10539    560.0
10540    560.0
10541    560.0
Name: predicted_mean, Length: 3528, dtype: float64

Fitting ARIMA for target: sep_median...


  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Target: sep_median
Validation MSE: 35814.3684, Validation R^2: -0.0078
Test predictions for 2025-2027: 7014     575.0
7015     575.0
7016     575.0
7017     575.0
7018     575.0
         ...  
10537    575.0
10538    575.0
10539    575.0
10540    575.0
10541    575.0
Name: predicted_mean, Length: 3528, dtype: float64

Dataset 6 results:
Fitting ARIMA for target: dec_median...


  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Target: dec_median
Validation MSE: 72555.9805, Validation R^2: -0.0687
Test predictions for 2025-2027: 6156    660.0
6157    660.0
6158    660.0
6159    660.0
6160    660.0
        ...  
9229    660.0
9230    660.0
9231    660.0
9232    660.0
9233    660.0
Name: predicted_mean, Length: 3078, dtype: float64

Fitting ARIMA for target: jun_median...
Target: jun_median
Validation MSE: 67724.9336, Validation R^2: -0.0509
Test predictions for 2025-2027: 6156    650.0
6157    650.0
6158    650.0
6159    650.0
6160    650.0
        ...  
9229    650.0
9230    650.0
9231    650.0
9232    650.0
9233    650.0
Name: predicted_mean, Length: 3078, dtype: float64

Fitting ARIMA for target: mar_median...


  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(


Target: mar_median
Validation MSE: 63925.0352, Validation R^2: -0.0051
Test predictions for 2025-2027: 6156    680.0
6157    680.0
6158    680.0
6159    680.0
6160    680.0
        ...  
9229    680.0
9230    680.0
9231    680.0
9232    680.0
9233    680.0
Name: predicted_mean, Length: 3078, dtype: float64

Fitting ARIMA for target: sep_median...
Target: sep_median
Validation MSE: 82105.9248, Validation R^2: -0.0747
Test predictions for 2025-2027: 6156    668.0
6157    668.0
6158    668.0
6159    668.0
6160    668.0
        ...  
9229    668.0
9230    668.0
9231    668.0
9232    668.0
9233    668.0
Name: predicted_mean, Length: 3078, dtype: float64

Dataset 7 results:
Fitting ARIMA for target: dec_median...


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Target: dec_median
Validation MSE: 7621.5219, Validation R^2: -0.0508
Test predictions for 2025-2027: 7128     475.0
7129     475.0
7130     475.0
7131     475.0
7132     475.0
         ...  
10687    475.0
10688    475.0
10689    475.0
10690    475.0
10691    475.0
Name: predicted_mean, Length: 3564, dtype: float64

Fitting ARIMA for target: jun_median...
Target: jun_median
Validation MSE: 7293.6448, Validation R^2: -0.0013
Test predictions for 2025-2027: 7128     475.0
7129     475.0
7130     475.0
7131     475.0
7132     475.0
         ...  
10687    475.0
10688    475.0
10689    475.0
10690    475.0
10691    475.0
Name: predicted_mean, Length: 3564, dtype: float64

Fitting ARIMA for target: mar_median...


  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(


Target: mar_median
Validation MSE: 7521.1616, Validation R^2: -0.0107
Test predictions for 2025-2027: 7128     480.0
7129     480.0
7130     480.0
7131     480.0
7132     480.0
         ...  
10687    480.0
10688    480.0
10689    480.0
10690    480.0
10691    480.0
Name: predicted_mean, Length: 3564, dtype: float64

Fitting ARIMA for target: sep_median...
Target: sep_median
Validation MSE: 13800.4104, Validation R^2: -0.0584
Test predictions for 2025-2027: 7128     480.0
7129     480.0
7130     480.0
7131     480.0
7132     480.0
         ...  
10687    480.0
10688    480.0
10689    480.0
10690    480.0
10691    480.0
Name: predicted_mean, Length: 3564, dtype: float64



  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
