# 5.- Future Forecasting

> Important source: https://www.kaggle.com/code/ahmedabdulhamid/recursive-multistep-time-series-forecasting

## Sequence Length 6 and Prediction Length 3

| Model | MAE | RMSE | sMAPE | rRMSE |
|-------|-----|------|-------|-------|
| Transformer | 0.6057 | 1.6401 | 57.1570 | 15.8035 |
| Autoformer | 0.7611 | 2.1966 | 58.9506 | 21.1662 |
| Reformer | 0.6895 | 2.1401 | 57.6583 | 20.6216 |


## Sequence Length 4 and Prediction Length 3

| Model | MAE | RMSE | sMAPE | rRMSE |
|-------|-----|------|-------|-------|
| Reformer | 0.5098 | 1.4642 | 56.0617 | 14.1087 |
| Transformer | 0.4832 | 1.2848 | 56.0317 | 12.3799 |
| Autoformer | 0.6769 | 1.9190 | 58.3377 | 18.4909 |

## Prediction Analysis

In [41]:
import torch 
import os 
import torch
import numpy as np
import pandas as pd
from io import StringIO

In [42]:
normalized_data = pd.read_csv("../data/green_skill_classification/data_for_timeseries_normalized.csv")
raw_data = pd.read_csv("../data/green_skill_classification/data_for_timeseries.csv")

real_months = [
    "2024-07", "2024-08", "2024-09", "2024-10", "2024-11", "2024-12",
    "2025-01", "2025-03", "2025-04", "2025-05", "2025-06", "2025-07"
]

forecast_months = [
    "2025-08", "2025-09", "2025-10", "2025-11", "2025-12", "2026-01"
]

total_months = real_months + forecast_months


In [43]:
def load_predictions(folder : str) -> dict:
    predictions = {}
    for file in os.listdir(folder):
        if file.endswith(".pt"):
            filepath = os.path.join(folder, file)
            data = torch.load(filepath)
            data_np = data.numpy()
            predictions[file] = data_np
            print(f"File: {file}, Shape: {data_np.shape}")
    return predictions

FOLDERS = [
    "../models/predictions/absolute_predictions",
    "../models/predictions/normalized_predictions"
]

predictions = {}

for folder in FOLDERS:
    preds = load_predictions(folder)
    predictions.update(preds)

predictions

File: future_predictions_seq6_pred3_Transformer.pt, Shape: (1, 6, 274)
File: future_predictions_seq4_pred3_Transformer.pt, Shape: (1, 6, 274)
File: future_predictions_seq6_pred3_Autoformer.pt, Shape: (1, 6, 274)
File: future_predictions_seq4_pred3_Autoformer.pt, Shape: (1, 6, 274)
File: future_predictions_seq6_pred3_Reformer.pt, Shape: (1, 6, 274)
File: future_predictions_seq4_pred3_Reformer.pt, Shape: (1, 6, 274)
File: n_future_predictions_seq4_pred3_Transformer.pt, Shape: (1, 6, 274)
File: n_future_predictions_seq6_pred3_Transformer.pt, Shape: (1, 6, 274)
File: n_future_predictions_seq6_pred3_Informer.pt, Shape: (1, 6, 274)
File: n_future_predictions_seq4_pred3_Informer.pt, Shape: (1, 6, 274)
File: n_future_predictions_seq4_pred3_FEDformer.pt, Shape: (1, 6, 274)
File: n_future_predictions_seq6_pred3_FEDformer.pt, Shape: (1, 6, 274)


{'future_predictions_seq6_pred3_Transformer.pt': array([[[0.62443626, 0.77204406, 0.        , ..., 0.43774697,
          0.        , 0.        ],
         [0.16925685, 0.2686366 , 0.        , ..., 0.935793  ,
          2.0052707 , 1.5548707 ],
         [0.        , 0.4417882 , 0.00557862, ..., 0.39964044,
          1.147005  , 0.41140926],
         [0.68071425, 0.5803419 , 0.        , ..., 0.48165452,
          0.15721042, 0.        ],
         [0.15612577, 0.25465474, 0.        , ..., 0.9210227 ,
          2.1698883 , 1.3665144 ],
         [0.        , 0.5010111 , 0.02302299, ..., 0.59912235,
          1.4363145 , 0.7879764 ]]], shape=(1, 6, 274), dtype=float32),
 'future_predictions_seq4_pred3_Transformer.pt': array([[[0.58020693, 0.5535207 , 0.        , ..., 0.22261715,
          0.897269  , 0.        ],
         [0.49364397, 0.24448545, 0.        , ..., 1.1946288 ,
          2.8610911 , 1.9264534 ],
         [0.        , 0.61960006, 0.        , ..., 0.36615238,
          1.1885024 

At this point, we have generated **6-month** predictions for the three best performing models (**Transformer**, **Autoformer**, and **Reformer**) trained with `seq_len` = 6 and `pred_len` = 3, as well as with `seq_len` = 4 and `pred_len` = 3. We have six **tensors** where each tensor has the shape `(batches, pred_len, num_features)`.


### Join dataset with predictions

In [44]:
dataset = pd.read_csv("../data/green_skill_classification/data_for_timeseries.csv")
dataset.shape

(274, 14)

In [45]:
map_predictions = {
    "absolute": {},
    "normalized": {}
}
for prediction in predictions:
    if prediction.startswith("n_"):
        map_predictions["normalized"][prediction] = predictions[prediction]
    else:
        map_predictions["absolute"][prediction] = predictions[prediction]

map_predictions.keys()

for key in map_predictions:
    for model in map_predictions[key]:
        print(f"Type: {key}, Model: {model}, Shape: {map_predictions[key][model].shape}")

Type: absolute, Model: future_predictions_seq6_pred3_Transformer.pt, Shape: (1, 6, 274)
Type: absolute, Model: future_predictions_seq4_pred3_Transformer.pt, Shape: (1, 6, 274)
Type: absolute, Model: future_predictions_seq6_pred3_Autoformer.pt, Shape: (1, 6, 274)
Type: absolute, Model: future_predictions_seq4_pred3_Autoformer.pt, Shape: (1, 6, 274)
Type: absolute, Model: future_predictions_seq6_pred3_Reformer.pt, Shape: (1, 6, 274)
Type: absolute, Model: future_predictions_seq4_pred3_Reformer.pt, Shape: (1, 6, 274)
Type: normalized, Model: n_future_predictions_seq4_pred3_Transformer.pt, Shape: (1, 6, 274)
Type: normalized, Model: n_future_predictions_seq6_pred3_Transformer.pt, Shape: (1, 6, 274)
Type: normalized, Model: n_future_predictions_seq6_pred3_Informer.pt, Shape: (1, 6, 274)
Type: normalized, Model: n_future_predictions_seq4_pred3_Informer.pt, Shape: (1, 6, 274)
Type: normalized, Model: n_future_predictions_seq4_pred3_FEDformer.pt, Shape: (1, 6, 274)
Type: normalized, Model: n_f

In [46]:
print(map_predictions["absolute"]['future_predictions_seq6_pred3_Transformer.pt'])

[[[0.62443626 0.77204406 0.         ... 0.43774697 0.         0.        ]
  [0.16925685 0.2686366  0.         ... 0.935793   2.0052707  1.5548707 ]
  [0.         0.4417882  0.00557862 ... 0.39964044 1.147005   0.41140926]
  [0.68071425 0.5803419  0.         ... 0.48165452 0.15721042 0.        ]
  [0.15612577 0.25465474 0.         ... 0.9210227  2.1698883  1.3665144 ]
  [0.         0.5010111  0.02302299 ... 0.59912235 1.4363145  0.7879764 ]]]


In [47]:
# region_id,skill_id,2024-07,2024-08,2024-09,2024-10,2024-11,2024-12,2025-01,2025-03,2025-04,2025-05,2025-06,2025-07
SAVE_ON = "../data/predictions/"

map_dataframes = {
    "absolute": pd.read_csv("../data/green_skill_classification/data_for_timeseries.csv"),
    "normalized": pd.read_csv("../data/green_skill_classification/data_for_timeseries_normalized.csv")
}

def create_future_dataframe() -> pd.DataFrame:
    new_dataframe = pd.DataFrame(columns=["region_id", "skill_id", "2024-07", "2024-08", "2024-09", "2024-10", "2024-11", "2024-12",
                                      "2025-01", "2025-03", "2025-04", "2025-05", "2025-06", "2025-07",
                                      "2025-08", "2025-09", "2025-10", "2025-11", "2025-12", "2026-01"])
    return new_dataframe

for key in map_predictions:
    for model in map_predictions[key]:
        data_array = map_predictions[key][model]
        future_df = create_future_dataframe()
        future_df[["region_id", "skill_id"]] = map_dataframes[key][["region_id", "skill_id"]]

        future_df[["2024-07", "2024-08", "2024-09", "2024-10", "2024-11", "2024-12",
                   "2025-01", "2025-03", "2025-04", "2025-05", "2025-06", "2025-07"]] = map_dataframes[key][["2024-07", "2024-08", "2024-09", "2024-10", "2024-11", "2024-12",
                                                                                                      "2025-01", "2025-03", "2025-04", "2025-05", "2025-06", "2025-07"]]
        for i in range(data_array.shape[1]):
            if i + 8 == 12:
                month = 2025
                month_str = f"{month}-12"
                future_df[month_str] = data_array[0, i, :]
                continue

            month = 2025 + (i + 8) // 12
            month_str = f"{month}-{(i + 8) % 12:02d}"
            future_df[month_str] = data_array[0, i, :]
            if key != "normalized":
                future_df[month_str] = future_df[month_str].round(2)

        filename = model.replace(".pt", ".csv")
        if key == "normalized":
            filepath = os.path.join(SAVE_ON, filename)
        else:
            filepath = os.path.join(SAVE_ON, filename)
        future_df.to_csv(filepath, index=False)

# Growth rate calculation

In [48]:
import pandas as pd
import numpy as np

def compute_and_merge_skill_growth(df: pd.DataFrame):
    real_months = [
        "2024-07", "2024-08", "2024-09", "2024-10", "2024-11", "2024-12",
        "2025-01", "2025-03", "2025-04", "2025-05", "2025-06", "2025-07"
    ]

    forecast_months = [
        "2025-08", "2025-09", "2025-10", "2025-11", "2025-12", "2026-01"
    ]

    df["R_avg"] = df[real_months].mean(axis=1)
    df["F_avg"] = df[forecast_months].mean(axis=1)

    df["G_abs"] = df["F_avg"] - df["R_avg"]
    epsilon = 1e-6
    df["G_rel"] = df["G_abs"] / df["R_avg"].replace(0, epsilon)

    tau_abs = df["G_abs"].quantile(0.75)
    tau_rel = df["G_rel"].quantile(0.75)

    def classify(row):
        abs_high = row["G_abs"] >= tau_abs
        rel_high = row["G_rel"] >= tau_rel

        if abs_high and rel_high:
            return "Star"
        elif (not abs_high) and rel_high:
            return "Emerging"
        elif abs_high and (not rel_high):
            return "Stable"
        else:
            return "Declining"

    df["quadrant"] = df.apply(classify, axis=1)

    ordered_cols = (
        ["region_id", "skill_id"] +
        real_months +
        forecast_months +
        ["R_avg", "F_avg", "G_abs", "G_rel", "quadrant"]
    )

    print(df[ordered_cols].head())
    return df[ordered_cols]


In [49]:
FOLDER = "../data/predictions/"
for file in os.listdir(FOLDER):
    if file.endswith(".csv"):
        filepath = os.path.join(FOLDER, file)
        df = pd.read_csv(filepath)
        
        result_df = compute_and_merge_skill_growth(df)
        print(result_df.head())
        output_filepath = os.path.join(FOLDER, f"skill_growth_{file}")
        result_df.to_csv(output_filepath, index=False)


   region_id  skill_id  2024-07  2024-08  2024-09  2024-10  2024-11  2024-12  \
0          1         1        1        0        3        1        2        2   
1          1         2        2        1        1        1        1        1   
2          1         3        0        0        0        0        1        1   
3          1         4        0        2        1        0        1        5   
4          1         5        2        0        0        0        1        1   

   2025-01  2025-03  ...  2025-09  2025-10  2025-11   2025-12  2026-01  \
0        1        1  ...     0.28     0.00     0.00  0.000000     0.00   
1        0        0  ...     0.30     0.74     0.34  0.178059     0.52   
2        0        0  ...     0.00     0.38     0.00  0.000000     0.16   
3        2        7  ...     2.09     3.52     0.00  2.036543     4.05   
4        0        2  ...     1.70     1.42     1.05  2.165881     1.30   

      R_avg     F_avg     G_abs     G_rel   quadrant  
0  0.916667  0.0816

In [50]:
FOLDER = "../data/predictions/"

MD_FILE = {
    "Normalized_Data": "../doc/future_predictions_count_normalized.md",
    "Raw_Data": "../doc/future_predictions_count_raw.md"
}

for file in os.listdir(FOLDER):
    for key in MD_FILE:
        if file.startswith("skill_growth_") and file.endswith(".csv"):
            if (key == "Normalized_Data" and file.startswith("skill_growth_n_")) or (key == "Raw_Data" and not file.startswith("skill_growth_n_")):
                filepath = os.path.join(FOLDER, file)
                df = pd.read_csv(filepath)

                quadrant_counts = df['quadrant'].value_counts().to_dict()

                with open(MD_FILE[key], 'a') as md_file:
                    md_file.write(f"## {file}\n\n")
                    md_file.write("| Quadrant   | Count |\n")
                    md_file.write("|------------|-------|\n")
                    for quadrant in ["Star", "Emerging", "Stable", "Declining"]:
                        count = quadrant_counts.get(quadrant, 0)
                        md_file.write(f"| {quadrant} | {count} |\n")
                    md_file.write("\n")