In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score
)
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
def set_random_seeds(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and
    TensorFlow's global RNG with ``seed``, and exports the
    ``TF_DETERMINISTIC_OPS`` and ``PYTHONHASHSEED`` environment variables.

    Parameters
    ----------
    seed : int, default 42
        Value passed to each generator and written to ``PYTHONHASHSEED``.
    """
    # Imported locally so the helper is self-contained: the notebook's import
    # cell does not bind `os`, `random` or `tf`, which made this function fail
    # with NameError on a fresh kernel.
    import os
    import random

    import tensorflow as tf

    # Follow the requested seed instead of a hard-coded "42".
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it
    # here only influences processes spawned afterwards, not this kernel.
    os.environ["PYTHONHASHSEED"] = str(seed)
    # NOTE(review): newer TensorFlow releases also offer
    # tf.config.experimental.enable_op_determinism() — confirm which mechanism
    # the installed version honours.
    os.environ["TF_DETERMINISTIC_OPS"] = "1"

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

In [None]:
# Seed the Python, NumPy and TensorFlow generators with 42 for this session.
set_random_seeds(42)

In [None]:
# Columns that are dropped from the model inputs: features rejected by the
# earlier feature-selection step, plus technical columns.
excluded_features = [
    "x", "sq_flag",
    # *_lag_* columns
    "drct_v_lag_6h", "drct_v_lag_12h", "drct_v_lag_24h", "drct_v_lag_48h",
    "drct_u_lag_6h", "drct_u_lag_24h", "drct_u_lag_48h",
    "tmpf_lag_12h", "tmpf_lag_24h", "tmpf_lag_48h",
    # *_rolling_* columns
    "hr_flag_rolling_sum_6h", "hr_flag_rolling_sum_12h",
    "hr_flag_rolling_sum_24h", "hr_flag_rolling_sum_48h",
    "sq_flag_rolling_sum_6h", "sq_flag_rolling_sum_12h",
    "sq_flag_rolling_sum_24h", "sq_flag_rolling_sum_48h",
    "relh_grad_rolling_max_6h", "relh_grad_rolling_max_12h",
    "relh_grad_rolling_max_24h", "relh_grad_rolling_max_48h",
    "ts_flag_rolling_sum_6h", "ts_flag_rolling_sum_12h",
    "ts_flag_rolling_sum_24h", "ts_flag_rolling_sum_48h",
    "mslp_rolling_mean_6h", "mslp_rolling_mean_12h",
    "mslp_rolling_mean_24h", "mslp_rolling_mean_48h",
    "gust_rolling_max_12h", "gust_rolling_max_24h", "gust_rolling_max_48h",
    "alti_rolling_mean_6h", "alti_rolling_mean_12h",
    "alti_rolling_mean_24h", "alti_rolling_mean_48h",
    "sknt_rolling_max_6h", "sknt_rolling_max_12h",
    "sknt_rolling_max_24h", "sknt_rolling_max_48h",
    "relh_rolling_mean_6h", "relh_rolling_mean_12h",
    "relh_rolling_mean_24h", "relh_rolling_mean_48h",
    "p0li_rolling_sum_6h", "p0li_rolling_sum_12h",
    "p0li_rolling_sum_24h", "p0li_rolling_sum_48h",
    # remaining columns
    "time_interval", "anomaly_threshold", "year",
    "IDW_ts_flag_rolling_sum_12h", "rolling_max_12h", "rolling_mean_12h",
    "tmpf_lag_6h", "drct_u_lag_12h",
]


One-Step LSTM (Baseline)

In [None]:

# One-Step LSTM: 3-fold time-series cross-validation
df_baseline = df.copy()
df_baseline["log_sum_48"] = np.log1p(df_baseline["sum_48"])

# Target: log1p of "sum_48".
y_full = df_baseline["log_sum_48"].copy()

# Features: everything except the excluded columns AND the target itself.
# Neither "sum_48" nor the derived "log_sum_48" is listed in
# `excluded_features`; both are numeric, so without this explicit drop they
# would end up in X and hand the model the value it is asked to predict.
target_cols = ["sum_48", "log_sum_48"]
X_full = df_baseline.drop(columns=[*excluded_features, *target_cols],
                          errors="ignore")

# numeric columns only
X_full = X_full.select_dtypes(include=["int64", "float64"])

# +/-inf -> NaN so the median imputer fitted in each fold can fill them
# (the train and test cells apply the same replacement).
X_full = X_full.replace([np.inf, -np.inf], np.nan)
num_cols = X_full.columns

# 3-fold time-series split.
# NOTE(review): TimeSeriesSplit splits by row position, so this assumes the
# rows of `df` are already in chronological order — confirm upstream.
tscv = TimeSeriesSplit(n_splits=3)

def build_model(n_feat: int) -> keras.Model:
    """Create and compile the one-step LSTM regressor.

    The network is a 16-unit LSTM layer (dropout 0.2) reading inputs of shape
    ``(1, n_feat)``, followed by a single Dense unit with ReLU activation.
    It is compiled with the Adam optimizer and mean-squared-error loss.

    Parameters
    ----------
    n_feat : int
        Number of features per time step.

    Returns
    -------
    keras.Model
        The compiled, untrained model.
    """
    net = keras.Sequential()
    net.add(layers.LSTM(16, dropout=0.2, input_shape=(1, n_feat)))
    net.add(layers.Dense(1, activation="relu"))
    net.compile(optimizer="adam", loss="mse")
    return net

def _regression_scores(y_true, y_pred):
    """Return (MSE, RMSE, MAE, R²) for one set of predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return (mse, np.sqrt(mse),
            mean_absolute_error(y_true, y_pred),
            r2_score(y_true, y_pred))


cv_results = []
n_features = len(num_cols)

for k, (tr_idx, val_idx) in enumerate(tscv.split(X_full), start=1):
    print(f"\n── Fold {k} ")
    X_tr, y_tr = X_full.iloc[tr_idx].copy(), y_full.iloc[tr_idx].copy()
    X_val, y_val = X_full.iloc[val_idx].copy(), y_full.iloc[val_idx].copy()

    # The imputer and the scaler are fitted on this fold's training rows only
    # and then applied unchanged to the validation rows.
    imp = SimpleImputer(strategy="median").fit(X_tr[num_cols])
    X_tr_imputed = imp.transform(X_tr[num_cols])
    scaler = MinMaxScaler().fit(X_tr_imputed)

    X_tr_prep = scaler.transform(X_tr_imputed)
    X_val_prep = scaler.transform(imp.transform(X_val[num_cols]))

    # LSTM input layout: (samples, 1 time step, n_features)
    X_tr_3d = X_tr_prep.reshape((X_tr_prep.shape[0], 1, n_features))
    X_val_3d = X_val_prep.reshape((X_val_prep.shape[0], 1, n_features))

    # A fresh model per fold; early stopping watches the training loss
    # (no validation data is passed to fit).
    model = build_model(n_features)
    es = EarlyStopping(monitor="loss", patience=2, restore_best_weights=True)
    model.fit(X_tr_3d, y_tr, epochs=30, batch_size=32,
              callbacks=[es], verbose=0)

    y_val_pred_log = model.predict(X_val_3d, verbose=0).ravel()

    # Score in log space ...
    mse_log, rmse_log, mae_log, r2_log = _regression_scores(y_val, y_val_pred_log)

    # ... and after undoing log1p with expm1.
    y_true_org = np.expm1(y_val)
    y_pred_org = np.expm1(y_val_pred_log)
    mse_org, rmse_org, mae_org, r2_org = _regression_scores(y_true_org, y_pred_org)

    print(f"Log  : MSE={mse_log:.4f} RMSE={rmse_log:.4f} "
          f"MAE={mae_log:.4f} R²={r2_log:.4f}")
    print(f"Orig : MSE={mse_org:,.0f} RMSE={rmse_org:,.0f} "
          f"MAE={mae_org:,.0f} R²={r2_org:.4f}")

    cv_results.append([k,
                       mse_log, rmse_log, mae_log, r2_log,
                       mse_org, rmse_org, mae_org, r2_org])

# One row per fold, one column per metric
cv_columns = ["fold",
              "mse_log", "rmse_log", "mae_log", "r2_log",
              "mse_org", "rmse_org", "mae_org", "r2_org"]
cv_df = pd.DataFrame(cv_results, columns=cv_columns).set_index("fold")

print("\n3-Fold CV summary")
display(cv_df.round(4))

print("\nAverages across folds:")
display(cv_df.mean().round(4))

In [None]:
# One-Step LSTM on full Train (2021)
df_baseline = df_train_real.copy()
df_baseline['log_sum_48'] = np.log1p(df_baseline['sum_48'])

# --------------------------------------------------
# 1) X / y  (drop the columns that must not be model inputs)
# --------------------------------------------------
y_base = df_baseline['log_sum_48'].copy()

# Besides the excluded features, drop the target itself: neither 'sum_48' nor
# the derived 'log_sum_48' is listed in `excluded_features`, and both are
# numeric, so they would otherwise be fed to the model as inputs.
target_cols = ['sum_48', 'log_sum_48']
X_base = df_baseline.drop(columns=[*excluded_features, *target_cols],
                          errors='ignore')

# keep only numeric columns
non_num = X_base.select_dtypes(exclude=['int64', 'float64']).columns
X_base = X_base.drop(columns=non_num)

num_cols = X_base.columns               # saved features order


# inf to NaN; SimpleImputer; Min-Max scaler
X_base = X_base.replace([np.inf, -np.inf], np.nan)

imp_base = SimpleImputer(strategy='median').fit(X_base[num_cols])
X_base[num_cols] = imp_base.transform(X_base[num_cols])

scaler_base = MinMaxScaler().fit(X_base[num_cols])
X_base_scaled = scaler_base.transform(X_base[num_cols])

# reshape to (samples, 1, n_features)
X_base_3d = X_base_scaled.reshape(
    (X_base_scaled.shape[0], 1, len(num_cols))
)

# Baseline network: a 16-unit LSTM layer feeding one ReLU output unit,
# trained with Adam on mean-squared error.
# NOTE(review): build_model() used for cross-validation sets dropout=0.2 on
# the LSTM while this model has none, so the CV scores describe a slightly
# different network — confirm which variant is intended.
model_base = keras.Sequential()
model_base.add(layers.LSTM(16, input_shape=(1, len(num_cols))))
model_base.add(layers.Dense(1, activation='relu'))
model_base.compile(optimizer='adam', loss='mse')

# Stop when the training loss has not improved for 2 consecutive epochs and
# restore the best weights seen.
early_stopping = EarlyStopping(
    monitor='loss', patience=2, restore_best_weights=True
)

model_base.fit(X_base_3d, y_base,
               epochs=30, batch_size=32,
               callbacks=[early_stopping], verbose=1)

# Predictions on the training rows, in log space and mapped back with expm1
y_pred_log = model_base.predict(X_base_3d, verbose=0).ravel()
y_true_orig = np.expm1(y_base)
y_pred_orig = np.expm1(y_pred_log)

# log-scale metrics (MSE computed once and reused for the RMSE)
train_mse_log = mean_squared_error(y_base, y_pred_log)
train_mae_log = mean_absolute_error(y_base, y_pred_log)
train_r2_log = r2_score(y_base, y_pred_log)

print("\n=== Baseline LSTM — TRAIN (log-scale) ===")
print(f"MSE : {train_mse_log:.4f}")
print(f"RMSE: {np.sqrt(train_mse_log):.4f}")
print(f"MAE : {train_mae_log:.4f}")
print(f"R²  : {train_r2_log:.4f}")

# original-scale metrics
train_mse_orig = mean_squared_error(y_true_orig, y_pred_orig)
train_mae_orig = mean_absolute_error(y_true_orig, y_pred_orig)
train_r2_orig = r2_score(y_true_orig, y_pred_orig)

print("\n=== Baseline LSTM — TRAIN (original scale) ===")
print(f"MSE : {train_mse_orig:,.2f}")
print(f"RMSE: {np.sqrt(train_mse_orig):,.2f}")
print(f"MAE : {train_mae_orig:,.2f}")
print(f"R²  : {train_r2_orig:.4f}")

In [None]:
# One-Step LSTM inference on Test (2022)
df_baseline_test = df_test_real.copy()
df_baseline_test['log_sum_48'] = np.log1p(df_baseline_test['sum_48'])

y_base_test = df_baseline_test['log_sum_48'].copy()

# `num_cols` is the ordered feature list the baseline was fitted on.
test_features = df_baseline_test.drop(columns=excluded_features, errors='ignore')

# reindex() creates an all-NaN column for every requested name that the frame
# lacks, and the median imputer then fills it. Make that visible rather than
# letting a schema mismatch pass silently.
missing_cols = [col for col in num_cols if col not in test_features.columns]
if missing_cols:
    print(f"WARNING: {len(missing_cols)} training feature(s) absent from the "
          f"test frame will be filled by the imputer: {missing_cols}")

X_base_test = test_features.reindex(columns=num_cols)     # fixed order

# imputer, scaler (both fitted on the training data; only applied here)
X_base_test = X_base_test.replace([np.inf, -np.inf], np.nan)
X_base_test[num_cols] = imp_base.transform(X_base_test[num_cols])
X_base_test_scaled = scaler_base.transform(X_base_test[num_cols])

# reshape to (samples, 1, n_features) and predict
X_base_test_3d = X_base_test_scaled.reshape(
        (X_base_test_scaled.shape[0], 1, len(num_cols))
)

y_test_pred_log_base = model_base.predict(X_base_test_3d).ravel()

# metrics in log-scale
mse_log_base  = mean_squared_error(y_base_test, y_test_pred_log_base)
rmse_log_base = np.sqrt(mse_log_base)
mae_log_base  = mean_absolute_error(y_base_test, y_test_pred_log_base)
r2_log_base   = r2_score(y_base_test, y_test_pred_log_base)

print("\nBaseline LSTM on TEST-2022 (log-scale) ")
print(f"MSE ={mse_log_base:.4f} | RMSE={rmse_log_base:.4f} | "
      f"MAE ={mae_log_base:.4f} | R² ={r2_log_base:.4f}")

# metrics in original scale (undo log1p with expm1)
y_true_orig = np.expm1(y_base_test)
y_pred_orig = np.expm1(y_test_pred_log_base)

mse_orig  = mean_squared_error(y_true_orig, y_pred_orig)
rmse_orig = np.sqrt(mse_orig)
mae_orig  = mean_absolute_error(y_true_orig, y_pred_orig)
r2_orig   = r2_score(y_true_orig, y_pred_orig)

# The original-scale metrics are reported once, under their own header; the
# earlier duplicate line that appeared before the header has been removed.
print("\n Baseline LSTM on TEST-2022 (original scale) ")
print(f"MSE ={mse_orig:.2f} | RMSE={rmse_orig:.2f} | "
      f"MAE ={mae_orig:.2f} | R² ={r2_orig:.4f}")