In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

PROJECT_ROOT = Path(os.environ["PROJECT_ROOT"])
from auxiliary.static_data import *
from auxiliary.auxiliary_funcs import *
from auxiliary.impute_missing_values import df_new_house_transactions

Added 999 missing rows. Final shape: (6432, 13)


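First, create dataframes for each year holding the true and the predicted values for each sector. Each year covers a 12-month window running from August of that year through July of the following year. The predicted values are taken to be the observed values of the preceding 12-month window.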

In [2]:
# Numeric sector id parsed from the digits in the `sector` label; used below to
# put every window into one canonical row order.
# - raw string: "\d" in a plain literal is an invalid escape sequence
# - expand=False: return a Series instead of a one-column DataFrame
df_new_house_transactions["sector_num"] = (
    df_new_house_transactions["sector"].str.extract(r"(\d+)", expand=False).astype(int)
)

# (label, first calendar year of the prediction window, first calendar year of the true window)
time_periods = [
    ("2020", 2019, 2020),
    ("2021", 2020, 2021),
    ("2022", 2021, 2022),
    ("2023", 2022, 2023),
]


def _aug_to_jul_window(df, start_year):
    """Return the rows from August of `start_year` through July of `start_year + 1`.

    Only the columns needed for scoring are kept. Rows are sorted by
    (yr, m, sector_num) and the index is reset, so two windows taken from the
    same frame can be compared row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `yr`, `m`, `sector_num` and `new_house_transaction_amount` columns.
    start_year : int
        Calendar year in which the window starts (in August).

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index.
    """
    columns = ["new_house_transaction_amount", "m", "yr", "sector_num"]
    in_window = ((df["yr"] == start_year) & (df["m"] >= 8)) | (
        (df["yr"] == start_year + 1) & (df["m"] <= 7)
    )
    return (
        df.loc[in_window, columns]
        .sort_values(["yr", "m", "sector_num"])
        .reset_index(drop=True)
    )


# Per-period frames keyed by the period label.
df_true_by_period = {}
df_pred_by_period = {}

for period_name, pred_yr, true_yr in time_periods:
    df_true = _aug_to_jul_window(df_new_house_transactions, true_yr)
    df_pred = _aug_to_jul_window(df_new_house_transactions, pred_yr)

    # The next cell pairs predictions and true values by position, so row i of
    # both frames must describe the same (month, sector).
    assert df_true[["m", "sector_num"]].equals(df_pred[["m", "sector_num"]]), (
        f"true/pred rows for period {period_name} do not line up by (m, sector_num)"
    )

    df_true_by_period[period_name] = df_true
    df_pred_by_period[period_name] = df_pred

# Explicit names for the frames used by the following cells (previously created via exec).
df_true_2020 = df_true_by_period["2020"]
df_true_2021 = df_true_by_period["2021"]
df_true_2022 = df_true_by_period["2022"]
df_true_2023 = df_true_by_period["2023"]

df_pred_2020 = df_pred_by_period["2020"]
df_pred_2021 = df_pred_by_period["2021"]
df_pred_2022 = df_pred_by_period["2022"]
df_pred_2023 = df_pred_by_period["2023"]

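Now scale these baseline predictions by a multiplicative factor (applied to every month except January) and score each factor against the true values of every year.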

In [3]:
# Sweep a multiplicative factor over the previous-year baseline and print the
# score of every (factor, year) combination.

AMOUNT_COL = "new_house_transaction_amount"
LAST_SECTOR_NUM = 96       # rows with sector_num in 1..96 are scaled
UNSCALED_MONTH = 1         # January rows (m == 1) are left unscaled
N_SCORE_REPEATS = 1152 * 4  # passed to random_half_score as n_repeats

# Factors 0.50, 0.55, ..., 1.45. The grid is built from integers and divided
# afterwards: np.arange with a 0.05 step accumulates rounding error and yields
# values such as 0.6000000000000001.
multiplicative_factors = np.arange(50, 150, 5) / 100

years_of_prediction = [2020, 2021, 2022, 2023]
df_pred_base = [df_pred_2020, df_pred_2021, df_pred_2022, df_pred_2023]

# Neither the true values nor the row selection depend on the factor, so both
# are computed once outside the sweep.
y_trues = [
    df_true[AMOUNT_COL].to_numpy()
    for df_true in (df_true_2020, df_true_2021, df_true_2022, df_true_2023)
]
# One mask per year replaces the former per-sector loop: it selects the same
# rows (sector_num in 1..LAST_SECTOR_NUM and month != January) in one pass.
scale_masks = [
    (df_base["sector_num"] >= 1)
    & (df_base["sector_num"] <= LAST_SECTOR_NUM)
    & (df_base["m"] != UNSCALED_MONTH)
    for df_base in df_pred_base
]

for multiplicative_factor in multiplicative_factors:
    # Work on copies so the baseline frames stay untouched between factors.
    df_pred_list = [df_base.copy() for df_base in df_pred_base]
    for df_scaled, scale_mask in zip(df_pred_list, scale_masks):
        df_scaled.loc[scale_mask, AMOUNT_COL] *= multiplicative_factor

    y_preds = [df_scaled[AMOUNT_COL].to_numpy() for df_scaled in df_pred_list]

    print(f"Multiplicative factor: {multiplicative_factor}")
    for y_pred, y_true, year in zip(y_preds, y_trues, years_of_prediction):
        print(f"Year: {year}")
        # random_half_score comes from the auxiliary star import; its exact
        # semantics and seeding are not visible in this notebook - TODO confirm.
        print(f"Score: {random_half_score(y_pred, y_true, n_repeats=N_SCORE_REPEATS)}")

Multiplicative factor: 0.5
Year: 2020
Score: 0.30409610787308106
Year: 2021
Score: 0.37816691226135385
Year: 2022
Score: 0.38845526859567925
Year: 2023
Score: 0.4584562284450122
Multiplicative factor: 0.55
Year: 2020
Score: 0.31604852386841004
Year: 2021
Score: 0.3796406498518915
Year: 2022
Score: 0.3930576510308753
Year: 2023
Score: 0.4741210956781635
Multiplicative factor: 0.6000000000000001
Year: 2020
Score: 0.32554785698409966
Year: 2021
Score: 0.37823122929022623
Year: 2022
Score: 0.397670904539418
Year: 2023
Score: 0.48618429815886777
Multiplicative factor: 0.6500000000000001
Year: 2020
Score: 0.33376867449359593
Year: 2021
Score: 0.37586663048899055
Year: 2022
Score: 0.39929476797083896
Year: 2023
Score: 0.4931583284939077
Multiplicative factor: 0.7000000000000002
Year: 2020
Score: 0.34174652861332677
Year: 2021
Score: 0.34861597367182195
Year: 2022
Score: 0.39753398061245443
Year: 2023
Score: 0.4973113502983634
Multiplicative factor: 0.7500000000000002
Year: 2020
Score: 0.34889