In [1]:
import warnings

import numpy as np
import pandas as pd
from scipy.fft import fft, ifft
from scipy.ndimage import gaussian_filter1d
from scipy.signal import find_peaks
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [2]:
def ewt_boundaries_equal_energy(spectrum, N):
    """Place N-1 band boundaries so each band holds a similar share of the spectrum's sum.

    Parameters
    ----------
    spectrum : array-like
        Non-negative magnitudes of the half spectrum.
    N : int
        Number of bands wanted; N-1 boundaries are returned.

    Returns
    -------
    list of float
        Boundaries in radians. For each k in 1..N-1 the bin whose normalised
        cumulative sum is closest to k/N is chosen and scaled by
        ``pi / len(spectrum)``. If the spectrum has no usable energy (sum is
        zero, negative or non-finite) the boundaries are spread uniformly at
        ``k / N * pi`` instead of dividing by zero.

    Raises
    ------
    ValueError
        If ``spectrum`` is empty and at least one boundary is requested.
    """
    if N <= 1:
        # range(1, N) is empty: nothing to place.
        return []

    spectrum = np.asarray(spectrum, dtype=float)
    n_bins = len(spectrum)
    if n_bins == 0:
        raise ValueError("spectrum must contain at least one bin.")

    total = np.sum(spectrum)
    if not np.isfinite(total) or total <= 0:
        # A silent / degenerate spectrum would give a NaN energy curve and
        # collapse every boundary onto bin 0; fall back to a uniform split.
        return [k / N * np.pi for k in range(1, N)]

    energy = np.cumsum(spectrum) / total
    # NOTE(review): bins are scaled by len(spectrum), whereas make_filter_bank
    # builds its grid with linspace over L//2 + 1 points — confirm intended.
    return [
        np.argmin(np.abs(energy - k / N)) / n_bins * np.pi
        for k in range(1, N)
    ]

def ewt_boundaries(spectrum, N, smooth_sigma=2):
    """Pick N-1 boundaries at the strongest local maxima of the smoothed spectrum.

    The spectrum is Gaussian-smoothed with ``smooth_sigma``; its local maxima
    are ranked by smoothed amplitude and the top N-1 are returned in ascending
    frequency order, scaled to radians by ``pi / len(spectrum)``.

    When the smoothed spectrum is flat, or has fewer than N-1 local maxima,
    the result of ``ewt_boundaries_equal_energy(spectrum, N)`` is returned.
    """
    smoothed = gaussian_filter1d(spectrum, sigma=smooth_sigma)

    is_flat = np.allclose(smoothed, smoothed[0])
    peak_idx = None if is_flat else find_peaks(smoothed)[0]
    if peak_idx is None or len(peak_idx) < N - 1:
        return ewt_boundaries_equal_energy(spectrum, N)

    # Rank (amplitude, index) pairs from strongest to weakest, keep N-1 of
    # them, then put the surviving indices back in frequency order.
    ranked = sorted(zip(smoothed[peak_idx], peak_idx), reverse=True)
    strongest = sorted(idx for _, idx in ranked[:N - 1])

    n_bins = len(spectrum)
    return [idx / n_bins * np.pi for idx in strongest]

def make_filter_bank(boundaries, L):
    """Build ideal (0/1) band filters on the half-spectrum frequency grid.

    Parameters
    ----------
    boundaries : sequence of float
        Band edges in radians, expected in ascending order.
    L : int
        Length of the time-domain signal; the grid is
        ``np.linspace(0, pi, L // 2 + 1)``.

    Returns
    -------
    list of np.ndarray
        ``len(boundaries) + 1`` masks: a low-pass (``freq <= boundaries[0]``),
        one band per consecutive pair of edges (``lower < freq <= upper``) and
        a final high-pass (``freq > boundaries[-1]``). With no boundaries a
        single all-pass filter is returned, so a one-mode decomposition no
        longer fails with IndexError.
    """
    freqs = np.linspace(0, np.pi, L // 2 + 1)
    edges = list(boundaries)

    if not edges:
        # Single band: pass everything through unchanged.
        return [np.ones_like(freqs)]

    # scaling lowpass
    lowpass = np.zeros_like(freqs)
    lowpass[freqs <= edges[0]] = 1
    bank = [lowpass]

    # wavelet bands; the last band has no upper edge
    upper_edges = edges[1:] + [None]
    for lower, upper in zip(edges, upper_edges):
        mask = freqs > lower
        if upper is not None:
            mask &= freqs <= upper
        band = np.zeros_like(freqs)
        band[mask] = 1
        bank.append(band)
    return bank

def EWT1D(signal, N=3, smooth_sigma=2):
    """Decompose a 1-D signal into frequency-band modes.

    The signal is transformed with ``fft``; band boundaries are detected on
    the magnitude of the first ``L // 2 + 1`` bins via ``ewt_boundaries``;
    each filter from ``make_filter_bank`` is mirrored onto the remaining
    (negative-frequency) bins and applied to the full spectrum, and the real
    part of the inverse transform is kept as that mode.

    Parameters
    ----------
    signal : array-like
        Input series of length L.
    N : int
        Number of modes requested.
    smooth_sigma : float
        Gaussian smoothing width passed to ``ewt_boundaries``.

    Returns
    -------
    (modes, mfb, boundaries)
        ``modes`` is an array with one row per filter, ``mfb`` the list of
        half-spectrum filters and ``boundaries`` the detected edges in radians.
    """
    L = len(signal)
    S_full = fft(signal)  # computed once and reused for detection and filtering
    n_half = L // 2 + 1
    spectrum_half = np.abs(S_full)[:n_half]

    boundaries = ewt_boundaries(spectrum_half, N, smooth_sigma=smooth_sigma)
    mfb = make_filter_bank(boundaries, L)

    # Number of bins after the half spectrum. For even L this excludes the
    # last half-spectrum bin from the mirror; for odd L that bin must be
    # mirrored too, otherwise the filter is one element short of the FFT.
    n_mirror = L - n_half

    modes = []
    for filt in mfb:
        filt_full = np.concatenate([filt, filt[1:n_mirror + 1][::-1]])
        modes.append(np.real(ifft(S_full * filt_full)))
    return np.array(modes), mfb, boundaries

def iEWT1D(modes, mfb=None):
    """Reconstruct a signal by adding its modes together along axis 0.

    ``mfb`` is accepted for call-site symmetry with ``EWT1D`` but is not
    consulted here.
    """
    reconstructed = np.sum(modes, axis=0)
    return reconstructed

In [3]:
def create_multivariate_lagged_dataset(df, target_col, feature_cols, lag=3):
    """Turn a frame into (lag-window, next-target) supervised pairs.

    For every row position ``end`` from ``lag`` to ``len(df) - 1`` the input
    is the flattened block of the ``lag`` preceding rows of ``feature_cols``
    and the output is the value of ``target_col`` at row ``end``.
    ``target_col`` must be an element of ``feature_cols``.

    Returns
    -------
    (X, y) : tuple of np.ndarray
        One row of ``X`` and one entry of ``y`` per window.
    """
    values = df[feature_cols].values
    target_pos = feature_cols.index(target_col)

    row_ids = range(lag, len(df))
    windows = [values[end - lag:end].flatten() for end in row_ids]
    targets = [values[end, target_pos] for end in row_ids]
    return np.array(windows), np.array(targets)

def safe_mape(y_true, y_pred, min_denom=1.0):
    """Mean absolute percentage error over points with a usable denominator.

    Only entries where ``|y_true| >= min_denom`` contribute. Returns
    ``np.nan`` when no entry qualifies, otherwise the error in percent as a
    Python float.
    """
    actual = np.asarray(y_true)
    forecast = np.asarray(y_pred)

    usable = np.abs(actual) >= min_denom
    if not usable.any():
        return np.nan

    relative_error = np.abs((actual[usable] - forecast[usable]) / actual[usable])
    return float(relative_error.mean() * 100)

def sde(y_true, y_pred):
    """Standard deviation of the forecast errors ``y_true - y_pred`` as a float."""
    residuals = np.asarray(y_true) - np.asarray(y_pred)
    return float(residuals.std())

In [4]:
class GWO:
    """Grey Wolf Optimizer that minimises ``obj_func`` inside the box [lb, ub].

    Parameters
    ----------
    obj_func : callable
        Maps a position vector of length ``dim`` to a scalar to minimise.
    lb, ub : array-like
        Lower / upper bounds; positions are clipped to them after each move.
    dim : int
        Number of decision variables.
    n_agents : int
        Pack size. Must be at least 3 because three leaders are tracked.
    n_iter : int
        Number of position-update iterations.
    seed : int
        Seed for the internal ``np.random.default_rng`` generator.

    Raises
    ------
    ValueError
        If ``n_agents`` is smaller than 3.
    """

    def __init__(self, obj_func, lb, ub, dim, n_agents=12, n_iter=25, seed=42):
        if n_agents < 3:
            # optimize() ranks three leaders; fail here with a clear message
            # rather than with an IndexError inside the search.
            raise ValueError("GWO requires n_agents >= 3 (alpha, beta and delta wolves).")
        self.obj_func = obj_func
        self.lb = np.array(lb, dtype=float)
        self.ub = np.array(ub, dtype=float)
        self.dim = dim
        self.n_agents = n_agents
        self.n_iter = n_iter
        self.rng = np.random.default_rng(seed)

    @staticmethod
    def _select_leaders(positions, scores):
        """Return copies of the three lowest-scoring positions and their scores."""
        # Stable sort: on ties, earlier entries (the incumbent leaders) win.
        order = np.argsort(scores, kind="stable")[:3]
        return positions[order].copy(), scores[order].copy()

    def optimize(self):
        """Run the search and return ``(best_position, best_fitness)``.

        After every iteration the three leaders are re-selected from the
        previous leaders together with the freshly evaluated pack, so they are
        always the three best solutions evaluated so far.
        """
        wolves = self.rng.uniform(self.lb, self.ub, size=(self.n_agents, self.dim))
        fitness = np.array([self.obj_func(w) for w in wolves], dtype=float)
        leaders, leader_fit = self._select_leaders(wolves, fitness)

        for t in range(self.n_iter):
            # Control parameter decreasing from 2 towards 0 over the run; the
            # small epsilon keeps the division defined when n_iter == 1.
            a = 2 - 2 * (t / (self.n_iter - 1 + 1e-9))
            alpha, beta, delta = leaders

            for i in range(self.n_agents):
                X = wolves[i].copy()
                for j in range(self.dim):
                    pulls = []
                    # Fresh random pair per leader, drawn in alpha/beta/delta order.
                    for leader in (alpha, beta, delta):
                        r1, r2 = self.rng.random(), self.rng.random()
                        A = 2 * a * r1 - a
                        C = 2 * r2
                        D = abs(C * leader[j] - X[j])
                        pulls.append(leader[j] - A * D)
                    X[j] = sum(pulls) / 3.0
                wolves[i] = np.clip(X, self.lb, self.ub)

            fitness = np.array([self.obj_func(w) for w in wolves], dtype=float)

            # Pool incumbents with the new pack so a displaced alpha is demoted
            # to beta/delta instead of being discarded, and a wolf that beats
            # beta but not alpha is not lost.
            pool_positions = np.vstack([leaders, wolves])
            pool_fitness = np.concatenate([leader_fit, fitness])
            leaders, leader_fit = self._select_leaders(pool_positions, pool_fitness)

        return leaders[0].copy(), float(leader_fit[0])

In [5]:
def decode_svr_params(position, logC_min=-3, logC_max=3, logg_min=-6, logg_max=0, eps_min=1e-4, eps_max=1.0):
    """Translate an optimiser position into SVR hyper-parameters.

    ``position[0]`` and ``position[1]`` are base-10 exponents for C and gamma,
    each clipped to its ``[min, max]`` range. ``position[2]`` is clipped to
    [0, 1] and mapped linearly onto ``[eps_min, eps_max]``.

    Returns
    -------
    (C, gamma, eps)
    """
    exponent_C = float(np.clip(position[0], logC_min, logC_max))
    exponent_gamma = float(np.clip(position[1], logg_min, logg_max))

    # Fraction of the epsilon range, limited to the unit interval.
    unit_eps = np.clip(float(position[2]), 0.0, 1.0)
    eps_span = eps_max - eps_min

    return 10.0 ** exponent_C, 10.0 ** exponent_gamma, eps_min + eps_span * unit_eps

def make_svr_objective(X_train, y_train, X_val, y_val, random_state=42,
                       logC_min=-3, logC_max=3, logg_min=-6, logg_max=0, eps_min=1e-4, eps_max=1.0):
    """Build a fitness function scoring SVR hyper-parameters by validation RMSE.

    Features are standardised once, with statistics fitted on ``X_train``
    only, and the scaled arrays are reused by every evaluation.

    Parameters
    ----------
    X_train, y_train : array-like
        Data each candidate SVR is fitted on.
    X_val, y_val : array-like
        Data the candidate is scored on.
    random_state : int
        Kept in the signature for callers; not used inside this function.
    logC_min, logC_max, logg_min, logg_max, eps_min, eps_max : float
        Ranges forwarded to ``decode_svr_params``.

    Returns
    -------
    callable
        ``objective(position) -> float``: the validation RMSE of an RBF SVR
        built from ``position``. If fitting or scoring raises, a warning is
        emitted and the penalty value ``1e6`` is returned; a non-finite RMSE is
        also mapped to the penalty.
    """
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_val_s = scaler.transform(X_val)
    penalty = 1e6

    def objective(position):
        C, gamma, eps = decode_svr_params(position, logC_min, logC_max, logg_min, logg_max, eps_min, eps_max)
        try:
            model = SVR(kernel='rbf', C=C, gamma=gamma, epsilon=eps)
            model.fit(X_train_s, y_train)
            y_pred = model.predict(X_val_s)
            rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
        except Exception as exc:
            # Best-effort: keep the optimiser running, but make the failure
            # visible instead of silently scoring the candidate as 1e6.
            warnings.warn(
                f"SVR evaluation failed ({type(exc).__name__}: {exc}); "
                f"returning penalty fitness {penalty:g}.",
                RuntimeWarning,
            )
            return penalty
        # A NaN/inf score cannot be ranked meaningfully; treat it as a failure.
        return rmse if np.isfinite(rmse) else penalty
    return objective

In [6]:
def ewt_gwo_svm_iewt_pipeline(
    df,
    target_column,
    feature_columns,
    lag_steps=12,
    n_modes=4,
    gwo_agents=12,
    gwo_iters=25,
    random_state=42,
    logC_min=-3, logC_max=3,
    logg_min=-6, logg_max=0,
    eps_min=1e-4, eps_max=1.0,
    smooth_sigma=2,
    max_step_eval=7
):
    """
    Forecast ``target_column`` with EWT decomposition, one GWO-tuned RBF SVR
    per mode, and reconstruction by summing the per-mode predictions.

    Workflow:
      1. Decompose the target series into ``n_modes`` modes with ``EWT1D``.
      2. For each mode, replace the target column by the mode, build lagged
         samples, and split them chronologically 70% / 15% / 15%.
      3. Tune (C, gamma, epsilon) with ``GWO`` on validation RMSE, then refit
         on train+val and predict the test span.
      4. Sum the mode predictions and score them against the original target.
      5. For each ``step`` in 1..``max_step_eval``, fit a separate model per
         mode on the training span with targets shifted ``step`` rows later,
         predict the test span, and score against the truth shifted the same
         way. Step h therefore targets the value h rows after the one-step
         target of each lag window.

    Args:
      df: frame holding ``feature_columns``; rows are used in their given order.
      target_column: column to forecast; must be one of ``feature_columns``.
      feature_columns: list of columns used to build the lag windows.
      lag_steps: number of past rows per input window.
      n_modes: number of EWT modes.
      gwo_agents, gwo_iters: optimiser pack size and iteration count.
      random_state: base seed; mode i uses ``random_state + i`` for GWO.
      logC_min..eps_max: hyper-parameter search ranges.
      smooth_sigma: spectrum smoothing used when detecting EWT boundaries.
      max_step_eval: largest step evaluated in the multi-step section.

    Returns:
      dict with per-mode best params, one-step reconstructed metrics,
      multi-step reconstructed metrics, boundaries, and predictions.

    Raises:
      ValueError: if ``target_column`` is not in ``feature_columns`` or
        ``lag_steps`` leaves no samples.
      RuntimeError: if no mode has any test sample.
    """
    if target_column not in feature_columns:
        # The lag builder looks the target up inside feature_columns; fail
        # before the decomposition with a message that says so.
        raise ValueError(
            f"target_column {target_column!r} must be listed in feature_columns."
        )

    # 1) EWT decomposition on full target
    # NOTE(review): the decomposition uses the whole series, including the rows
    # later held out for validation and testing — possible look-ahead; confirm
    # this is acceptable for the reported metrics.
    signal = df[target_column].values
    modes, mfb, boundaries = EWT1D(signal, N=n_modes, smooth_sigma=smooth_sigma)

    # 2) time-aware splits after creating lagged X (same across modes)
    n_samples = len(signal) - lag_steps
    if n_samples <= 0:
        raise ValueError("lag_steps too large relative to series length.")
    train_end = int(0.7 * n_samples)
    val_end = int(0.85 * n_samples)

    per_mode_info = []
    test_mode_preds = []
    test_mode_lengths = []
    mode_datasets = []  # (X_mode, y_mode) per mode, reused by the multi-step section

    # Process each mode independently
    for mode_idx in range(n_modes):
        df_mode = df.copy()
        df_mode[target_column] = modes[mode_idx]

        X_mode, y_mode = create_multivariate_lagged_dataset(df_mode, target_column, feature_columns, lag=lag_steps)
        mode_datasets.append((X_mode, y_mode))

        X_train, y_train = X_mode[:train_end], y_mode[:train_end]
        X_val, y_val     = X_mode[train_end:val_end], y_mode[train_end:val_end]
        X_test, y_test   = X_mode[val_end:], y_mode[val_end:]

        # check sizes
        if len(X_train) < 5 or len(X_val) < 1 or len(X_test) < 1:
            print(f"[Mode {mode_idx+1}] insufficient data – skipping mode.")
            per_mode_info.append({"mode": mode_idx+1, "skipped": True})
            # A skipped mode contributes zeros to the reconstruction.
            test_mode_preds.append(np.zeros_like(X_test[:,0]) if X_test.size else np.array([]))
            test_mode_lengths.append(len(X_test))
            continue

        # 3) Build objective for GWO
        obj = make_svr_objective(
            X_train, y_train, X_val, y_val,
            random_state=random_state,
            logC_min=logC_min, logC_max=logC_max,
            logg_min=logg_min, logg_max=logg_max,
            eps_min=eps_min, eps_max=eps_max
        )

        # GWO bounds: [log10C, log10gamma, eps_raw (0..1)]
        lb = np.array([logC_min, logg_min, 0.0], dtype=float)
        ub = np.array([logC_max, logg_max, 1.0], dtype=float)

        gwo = GWO(obj, lb, ub, dim=3, n_agents=gwo_agents, n_iter=gwo_iters, seed=random_state + mode_idx)
        best_pos, best_fit = gwo.optimize()

        # decode params
        C_best, gamma_best, eps_best = decode_svr_params(best_pos, logC_min, logC_max, logg_min, logg_max, eps_min, eps_max)

        # 4) Train final SVR on train+val
        scaler = StandardScaler()
        X_trval = np.vstack([X_train, X_val])
        y_trval = np.concatenate([y_train, y_val])
        X_trval_s = scaler.fit_transform(X_trval)
        X_test_s = scaler.transform(X_test)

        final_model = SVR(kernel='rbf', C=C_best, gamma=gamma_best, epsilon=eps_best)
        final_model.fit(X_trval_s, y_trval)
        y_pred_test_mode = final_model.predict(X_test_s)

        per_mode_info.append({
            "mode": mode_idx+1,
            "best_val_rmse": float(best_fit),
            "log10C": float(np.clip(best_pos[0], logC_min, logC_max)),
            "C": float(C_best),
            "log10gamma": float(np.clip(best_pos[1], logg_min, logg_max)),
            "gamma": float(gamma_best),
            "epsilon": float(eps_best),
            "test_len": int(len(y_test))
        })

        test_mode_preds.append(y_pred_test_mode)
        test_mode_lengths.append(len(y_test))

    # 5) Align test predictions and IEWT reconstruct
    n_test = None
    for ln in test_mode_lengths:
        if ln:
            n_test = ln
            break
    if n_test is None:
        raise RuntimeError("No valid test data across modes.")

    stacked = []
    for arr in test_mode_preds:
        a = np.asarray(arr)
        if len(a) == n_test:
            stacked.append(a)
        elif len(a) == 0:
            stacked.append(np.zeros(n_test))
        elif len(a) < n_test:
            stacked.append(np.concatenate([a, np.zeros(n_test - len(a))]))
        else:
            stacked.append(a[:n_test])
    stacked = np.array(stacked)  # shape (n_modes, n_test)

    y_pred_reconstructed = iEWT1D(stacked, mfb)
    # Sample k of the lagged dataset targets row k + lag_steps of the series.
    y_true_test = signal[lag_steps + val_end : lag_steps + val_end + n_test]

    # 6) One-step reconstructed metrics
    one_mae = mean_absolute_error(y_true_test, y_pred_reconstructed)
    one_rmse = np.sqrt(mean_squared_error(y_true_test, y_pred_reconstructed))
    one_mape = safe_mape(y_true_test, y_pred_reconstructed)
    one_sde = sde(y_true_test, y_pred_reconstructed)

    one_step_metrics = {
        "MAE": float(one_mae),
        "RMSE": float(one_rmse),
        "MAPE (%)": float(one_mape),  # stays NaN when no point qualifies
        "SDE": float(one_sde),
    }

    # 7) Multi-step direct reconstructed metrics
    multistep_rows = []
    for step in range(1, max_step_eval + 1):
        mode_step_preds = []
        valid = True
        for mode_idx in range(n_modes):
            info = per_mode_info[mode_idx]
            if info.get("skipped", False):
                mode_step_preds.append(np.zeros(max(0, n_test - step)))
                continue

            X_mode, y_mode = mode_datasets[mode_idx]
            X_train, y_train = X_mode[:train_end], y_mode[:train_end]
            X_test_all = X_mode[val_end:]

            # Need at least one sample left after dropping `step` rows.
            if X_test_all.shape[0] <= step or X_train.shape[0] <= step:
                valid = False
                break

            X_test_step = X_test_all[:-step]   # inputs to predict t+step

            # Direct strategy: pair each training window with the target
            # `step` rows later, matching how the predictions are scored
            # below. Targets stay inside the training span.
            X_train_step = X_train[:-step]
            y_train_step = y_train[step:]

            # train model on TRAIN ONLY with tuned params
            C_best = info.get("C", 1.0)
            gamma_best = info.get("gamma", 1.0)
            eps_best = info.get("epsilon", 0.1)

            scaler_train = StandardScaler()
            X_train_s = scaler_train.fit_transform(X_train_step)
            X_test_step_s = scaler_train.transform(X_test_step)

            model_step = SVR(kernel='rbf', C=C_best, gamma=gamma_best, epsilon=eps_best)
            model_step.fit(X_train_s, y_train_step)
            y_pred_step_mode = model_step.predict(X_test_step_s)  # length = n_test - step
            mode_step_preds.append(y_pred_step_mode)

        if not valid:
            break

        mode_step_preds = np.array(mode_step_preds)  # shape (n_modes, n_test-step)
        y_pred_step_recon = iEWT1D(mode_step_preds, mfb=None)
        step_start = lag_steps + val_end + step
        y_true_step = signal[step_start : step_start + y_pred_step_recon.shape[0]]

        # align lengths
        m = min(len(y_true_step), len(y_pred_step_recon))
        if m == 0:
            break
        y_true_step = y_true_step[:m]
        y_pred_step_recon = y_pred_step_recon[:m]

        multistep_rows.append({
            "Step": step,
            "MAE": float(mean_absolute_error(y_true_step, y_pred_step_recon)),
            "RMSE": float(np.sqrt(mean_squared_error(y_true_step, y_pred_step_recon))),
            "MAPE (%)": float(safe_mape(y_true_step, y_pred_step_recon)),
            "SDE": float(sde(y_true_step, y_pred_step_recon))
        })

    multistep_df = pd.DataFrame(multistep_rows)

    return {
        "per_mode_info": per_mode_info,
        "one_step_metrics": one_step_metrics,
        "multistep_df": multistep_df,
        "boundaries_rad": boundaries,
        "y_true_test": y_true_test,
        "y_pred_reconstructed": y_pred_reconstructed
    }


In [7]:
# Columns used to build the lag windows.
feature_columns = [
    'AirTemp', 'Azimuth', 'CloudOpacity', 'DewpointTemp', 'Dhi', 'Dni', 'Ebh',
    'WindDirection10m', 'Ghi', 'RelativeHumidity', 'SurfacePressure', 'WindSpeed10m',
]

# Read the raw CSV, parse both period columns as datetimes, and order the rows
# by the end of each period.
df = (
    pd.read_csv('/Users/hrishityelchuri/Documents/windPred/raw/8.52 hrishit data.csv')
    .assign(
        PeriodEnd=lambda frame: pd.to_datetime(frame['PeriodEnd']),
        PeriodStart=lambda frame: pd.to_datetime(frame['PeriodStart']),
    )
    .sort_values('PeriodEnd')
)

In [None]:
# Run the full pipeline on the wind-speed column. eps_max is set to 0.5 here
# rather than the function's default of 1.0.
res = ewt_gwo_svm_iewt_pipeline(
    df,
    target_column='WindSpeed10m',
    feature_columns=feature_columns,
    lag_steps=12, n_modes=4,
    gwo_agents=12, gwo_iters=25, random_state=42,
    logC_min=-3, logC_max=3,
    logg_min=-6, logg_max=0,
    eps_min=1e-4, eps_max=0.5,
    smooth_sigma=2,
    max_step_eval=7,
)

# Report: tuned parameters per mode, then one-step and multi-step metrics.
print("Per-mode best params:", pd.DataFrame(res['per_mode_info']), sep="\n")
print("\nOne-step reconstructed metrics:", res['one_step_metrics'], sep="\n")
print("\nMulti-step reconstructed metrics:", res['multistep_df'].to_string(index=False), sep="\n")