In [1]:
import pandas as pd
import os
from pathlib import Path

PROJECT_ROOT = Path(os.environ["PROJECT_ROOT"])
from auxiliary.static_data import *
from auxiliary.auxiliary_funcs import *
from auxiliary.impute_missing_values import df_new_house_transactions

# Read test.csv and sample_submission.csv into DataFrames.
# NOTE(review): both paths are relative to the current working directory, not
# to PROJECT_ROOT — confirm the notebook is always run from the directory that
# holds these files.
df_test = pd.read_csv("test.csv")
df_sample_submission = pd.read_csv("sample_submission.csv")

Added 999 missing rows. Final shape: (6432, 13)


In [2]:
# Numeric sector id parsed from the digits in the "sector" label, so that rows
# can be ordered numerically rather than lexicographically.
# (Raw string: "\d" in a normal string literal is an invalid escape sequence.)
df_new_house_transactions["sector_num"] = (
    df_new_house_transactions["sector"]
    .str.extract(r"(\d+)", expand=False)
    .astype(int)
)

# (period name, first year of the "prediction" window, first year of the "true" window).
# Every window runs from August of its first year through July of the next
# year; the "prediction" window is simply the window one year earlier.
time_periods = [
    ("2020", 2019, 2020),
    ("2021", 2020, 2021),
    ("2022", 2021, 2022),
    ("2023", 2022, 2023),
]


def _aug_to_jul_window(frame, start_year):
    """Return the rows from Aug of ``start_year`` through Jul of ``start_year + 1``.

    The result is explicitly ordered by (yr, m, sector_num). Within such a
    window that is chronological order (Aug..Dec of the first year, then
    Jan..Jul of the next) with sectors in ascending numeric order inside each
    month. Without this sort the row order would depend on however ``frame``
    happens to be laid out, so two windows would not be guaranteed to line up
    element by element.
    """
    in_window = (
        (frame["yr"] == start_year) & (frame["m"] >= 8)
    ) | (
        (frame["yr"] == start_year + 1) & (frame["m"] <= 7)
    )
    return frame[in_window].sort_values(["yr", "m", "sector_num"], kind="stable")


# Arrays keyed by period name (replaces exec/eval on generated variable names).
y_true_by_period = {}
y_pred_by_period = {}

for period_name, pred_yr, true_yr in time_periods:
    # True values: the window starting in true_yr.
    df_true = _aug_to_jul_window(df_new_house_transactions, true_yr)
    # "Predicted" values: the window starting in pred_yr (one year earlier).
    df_pred = _aug_to_jul_window(df_new_house_transactions, pred_yr)

    y_true_by_period[period_name] = df_true["new_house_transaction_amount"].values
    y_pred_by_period[period_name] = df_pred["new_house_transaction_amount"].values

    print(
        f"{period_name}: y_true shape = {y_true_by_period[period_name].shape}, y_pred shape = {y_pred_by_period[period_name].shape}"
    )

# Explicit module-level names, kept because later cells refer to them directly.
y_true_2020 = y_true_by_period["2020"]
y_true_2021 = y_true_by_period["2021"]
y_true_2022 = y_true_by_period["2022"]
y_true_2023 = y_true_by_period["2023"]
y_pred_2020 = y_pred_by_period["2020"]
y_pred_2021 = y_pred_by_period["2021"]
y_pred_2022 = y_pred_by_period["2022"]
y_pred_2023 = y_pred_by_period["2023"]

2020: y_true shape = (1152,), y_pred shape = (1152,)
2021: y_true shape = (1152,), y_pred shape = (1152,)
2022: y_true shape = (1152,), y_pred shape = (1152,)
2023: y_true shape = (1152,), y_pred shape = (1152,)


In [3]:
# 1152 simulations

# Parallel lists: position k of each list belongs to the same prediction year.
years_of_prediction = [2020, 2021, 2022, 2023]
y_trues = [y_true_2020, y_true_2021, y_true_2022, y_true_2023]
y_preds = [y_pred_2020, y_pred_2021, y_pred_2022, y_pred_2023]

# Report the score of each year's (prediction, truth) pair.
# NOTE(review): random_half_score is a project helper not visible here; going
# by its name and n_repeats it presumably averages a score over random
# half-samples — confirm, and confirm whether it is seeded.
for idx, year in enumerate(years_of_prediction):
    y_pred, y_true = y_preds[idx], y_trues[idx]
    print(f"Year: {year}")
    score = random_half_score(y_pred, y_true, n_repeats=1152)
    print(f"Score: {score}")

Year: 2020
Score: 0.3652749796340707
Year: 2021
Score: 0.0
Year: 2022
Score: 0.33986293417331725
Year: 2023
Score: 0.4669895027138628


In [4]:
# 1152*10 simulations

# Same inputs as the 1152-repeat run; only n_repeats differs.
years_of_prediction = [2020, 2021, 2022, 2023]
y_preds = [y_pred_2020, y_pred_2021, y_pred_2022, y_pred_2023]
y_trues = [y_true_2020, y_true_2021, y_true_2022, y_true_2023]

# NOTE(review): random_half_score comes from outside this notebook; its exact
# semantics (and whether it is seeded) should be confirmed there.
for year, y_pred, y_true in zip(years_of_prediction, y_preds, y_trues):
    print(f"Year: {year}")
    score = random_half_score(y_pred, y_true, n_repeats=1152 * 10)
    print(f"Score: {score}")

Year: 2020
Score: 0.3652546672355771
Year: 2021
Score: 4.9909963405189446e-05
Year: 2022
Score: 0.3403803000726862
Year: 2023
Score: 0.46700382139609


The scores barely change when the number of simulations is increased tenfold (1152 → 11520), so the estimate appears to have converged.

Let us save the true values for the 2020, 2021, 2022 and 2023 prediction periods to CSV files, one file per period.

In [5]:
# Write the true values of each prediction period to
# <PROJECT_ROOT>/train/new_house_transaction_amount_in_<year>.csv, with one
# row per "<year> <month>_sector <n>" id.
#
# NOTE(review): ids are generated month by month (Aug..Dec of `year`, then
# Jan..Jul of `year + 1`) with sectors 1..96 inside each month. This assumes
# every y_true array is laid out in exactly that order — confirm against the
# cell that builds the arrays.
first_year_months = ["Aug", "Sep", "Oct", "Nov", "Dec"]
second_year_months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul"]
sector_numbers = range(1, 97)  # There are 96 sectors, sector numbers 1 to 96

out_dir = Path(PROJECT_ROOT) / "train"
# Create the output folder if it is missing; parents are deliberately not
# created, so a wrong PROJECT_ROOT still fails loudly.
out_dir.mkdir(exist_ok=True)

for year, y_true in zip(years_of_prediction, y_trues):
    next_year = year + 1
    month_labels = [f"{year} {month}" for month in first_year_months] + [
        f"{next_year} {month}" for month in second_year_months
    ]
    sector_ids = [
        f"{label}_sector {i}" for label in month_labels for i in sector_numbers
    ]

    # Fail with a clear message rather than pandas' generic length error.
    if len(y_true) != len(sector_ids):
        raise ValueError(
            f"Period {year}: expected {len(sector_ids)} values "
            f"(12 months x 96 sectors), got {len(y_true)}"
        )

    df = pd.DataFrame({
        "id": sector_ids,
        "new_house_transaction_amount": y_true
    })
    out_path = out_dir / f"new_house_transaction_amount_in_{year}.csv"
    df.to_csv(out_path, index=False)
