In [3]:
import pandas as pd
import os
from pathlib import Path

PROJECT_ROOT = Path(os.environ["PROJECT_ROOT"])
from auxiliary.static_data import *
from auxiliary.auxiliary_funcs import *
from auxiliary.impute_missing_values import df_new_house_transactions

df_test = pd.read_csv("test.csv")
df_sample_submission = pd.read_csv("sample_submission.csv")

In [4]:
# Add sector_num column for sorting
df_new_house_transactions["sector_num"] = (
    df_new_house_transactions["sector"].str.extract("(\d+)").astype(int)
)

# Create y_pred and y_true arrays for each time period
time_periods = [
    ("2020", 2019, 2020),
    ("2021", 2020, 2021),
    ("2022", 2021, 2022),
    ("2023", 2022, 2023),
]

for period_name, pred_yr, true_yr in time_periods:
    # Get true values (current year: Aug to July)
    mask_true = (
        (df_new_house_transactions["yr"] == true_yr)
        & (df_new_house_transactions["m"] >= 8)
    ) | (
        (df_new_house_transactions["yr"] == true_yr + 1)
        & (df_new_house_transactions["m"] <= 7)
    )

    # Get prediction values (previous year: Aug to July)
    mask_pred = (
        (df_new_house_transactions["yr"] == pred_yr)
        & (df_new_house_transactions["m"] >= 8)
    ) | (
        (df_new_house_transactions["yr"] == pred_yr + 1)
        & (df_new_house_transactions["m"] <= 7)
    )

    # Create arrays sorted by sector
    df_true = df_new_house_transactions[mask_true].sort_values("sector_num")
    df_pred = df_new_house_transactions[mask_pred].sort_values("sector_num")

    # Extract values
    exec(f"y_true_{period_name} = df_true['new_house_transaction_amount'].values")
    exec(f"y_pred_{period_name} = df_pred['new_house_transaction_amount'].values")

    print(
        f"{period_name}: y_true shape = {eval(f'y_true_{period_name}').shape}, y_pred shape = {eval(f'y_pred_{period_name}').shape}"
    )

2020: y_true shape = (1152,), y_pred shape = (1152,)
2021: y_true shape = (1152,), y_pred shape = (1152,)
2022: y_true shape = (1152,), y_pred shape = (1152,)
2023: y_true shape = (1152,), y_pred shape = (1152,)


In [5]:
# 1152 simulations

y_preds = [y_pred_2020, y_pred_2021, y_pred_2022, y_pred_2023]
y_trues = [y_true_2020, y_true_2021, y_true_2022, y_true_2023]
years_of_prediction = [2020, 2021, 2022, 2023]

for y_pred, y_true, year in zip(y_preds, y_trues, years_of_prediction):
    print(f"Year: {year}")
    print(f"Score: {random_half_score(y_pred, y_true,  n_repeats= 1152)}")

Year: 2020


Score: 0.3652385278500956
Year: 2021
Score: 0.00016448990475432867
Year: 2022
Score: 0.3394445787876501
Year: 2023
Score: 0.4669963456680643


In [6]:
# 1152*10 simulations

y_preds = [y_pred_2020, y_pred_2021, y_pred_2022, y_pred_2023]
y_trues = [y_true_2020, y_true_2021, y_true_2022, y_true_2023]
years_of_prediction = [2020, 2021, 2022, 2023]

for y_pred, y_true, year in zip(y_preds, y_trues, years_of_prediction):
    print(f"Year: {year}")
    print(f"Score: {random_half_score(y_pred, y_true,  n_repeats= 1152*10)}")

Year: 2020
Score: 0.3652561221081446
Year: 2021
Score: 1.621946705041634e-05
Year: 2022
Score: 0.34040245279978504
Year: 2023
Score: 0.46700770077718834


So, it looks like it converges well.