In [107]:
!pip install torch



In [108]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [109]:
import pandas as pd
import numpy as np

price_df = pd.read_csv("/content/training_data.csv", parse_dates=['date'])
sent_df = pd.read_csv("/content/sentiment_data.csv", parse_dates=['date'])

# Convert to datetime
price_df["date"] = pd.to_datetime(price_df["date"])
sent_df["date"] = pd.to_datetime(sent_df["date"])

# Sort both by date
price_df = price_df.sort_values("date")
sent_df = sent_df.sort_values("date")

# Shift sentiment *by one day backward* to avoid lookahead bias
sent_df["sentiment_score"] = sent_df["sentiment_score"].shift(1)

# Merge properly on date
merged_df = pd.merge(price_df, sent_df, on="date", how="left")
merged_df = merged_df.dropna(subset=["sentiment_score"])

# Sort again for safety
merged_df = merged_df.sort_values("date").reset_index(drop=True)

# Inspect sample
print(merged_df.head(15))
print(merged_df.tail(15))

         date  price_usd_per_mmbtu        source  sentiment_score
0  2012-12-10                 3.35  FRED:DHHNGSP         0.045655
1  2012-12-11                 3.39  FRED:DHHNGSP         0.584642
2  2012-12-12                 3.33  FRED:DHHNGSP         0.403724
3  2012-12-13                 3.27  FRED:DHHNGSP         0.553235
4  2012-12-14                 3.15  FRED:DHHNGSP         0.086875
5  2012-12-17                 3.20  FRED:DHHNGSP         0.425910
6  2012-12-18                 3.29  FRED:DHHNGSP         0.267380
7  2012-12-19                 3.25  FRED:DHHNGSP         0.344278
8  2012-12-20                 3.35  FRED:DHHNGSP         0.574574
9  2012-12-21                 3.42  FRED:DHHNGSP         0.205881
10 2012-12-24                 3.30  FRED:DHHNGSP        -0.191383
11 2012-12-27                 3.31  FRED:DHHNGSP         0.058812
12 2012-12-28                 3.40  FRED:DHHNGSP        -0.017793
13 2012-12-31                 3.43  FRED:DHHNGSP        -0.012983
14 2013-01

In [110]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
import joblib
from collections import deque

# --- A. DATA PREPARATION & FEATURE ENGINEERING (Confirmed & Cleaned) ---

# Assuming price_df and sent_df are loaded and merged into merged_df as shown above
# merged_df now contains: date, price_usd_per_mmbtu, sentiment_score (shifted)
# Data cleaning:
df = merged_df.sort_values("date").reset_index(drop=True)
clean_df = df.dropna(subset=["price_usd_per_mmbtu", "sentiment_score"]).reset_index(drop=True)

# Feature Engineering (re-apply to the clean_df to get correct index/length)
span_fast = 7
span_slow = 30

clean_df[f'rolling_mean_{span_fast}'] = clean_df['price_usd_per_mmbtu'].ewm(span=span_fast, adjust=False).mean()
clean_df[f'rolling_mean_{span_slow}'] = clean_df['price_usd_per_mmbtu'].ewm(span=span_slow, adjust=False).mean()
clean_df[f'sentiment_rolling_{span_fast}'] = clean_df['sentiment_score'].ewm(span=span_fast, adjust=False).mean()
clean_df[f'sentiment_rolling_{span_slow}'] = clean_df['sentiment_score'].ewm(span=span_slow, adjust=False).mean()

clean_df['crossover_signal'] = np.where(
    clean_df[f'rolling_mean_{span_fast}'] > clean_df[f'rolling_mean_{span_slow}'], 1, 0)
clean_df['sentiment_interaction'] = clean_df['sentiment_score'] * clean_df['crossover_signal']

for lag in [2, 7, 30]:
    clean_df[f'price_lag{lag}'] = clean_df['price_usd_per_mmbtu'].shift(lag)

clean_df['target'] = clean_df['price_usd_per_mmbtu'].shift(-1)
clean_df = clean_df.dropna().reset_index(drop=True)

In [111]:
# LSTM DATA PREP
LOOKBACK_WINDOW = 21
NUM_FEATURES = 3 # Price, EWM_7, EWM_30
LSTM_FEATURES = ['price_usd_per_mmbtu', f'rolling_mean_{span_fast}', f'rolling_mean_{span_slow}']
XGB_FEATURES = [ # Defining here for clarity
    f"price_lag{lag}" for lag in [2, 7, 30]
] + [
    f"rolling_mean_{span_fast}", f"rolling_mean_{span_slow}",
    "sentiment_score", f"sentiment_rolling_{span_fast}", f"sentiment_rolling_{span_slow}",
    "crossover_signal", "sentiment_interaction"
]

data = clean_df[LSTM_FEATURES].values
target = clean_df['target'].values

scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)

In [112]:
# pytorch LSTM setup & training
class CommodityDataset(Dataset):
    def __init__(self, data, targets, lookback_window):
        self.data = data
        self.targets = targets
        self.lookback_window = lookback_window
        self.num_samples = len(self.data) - self.lookback_window

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        X = self.data[idx:idx + self.lookback_window, :]
        Y = self.targets[idx + self.lookback_window]
        return torch.tensor(X, dtype=torch.float32), torch.tensor(Y, dtype=torch.float32)

dataset = CommodityDataset(scaled_data, target, LOOKBACK_WINDOW)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

class PyTorchLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size=1, dropout_rate=0.2):
        super(PyTorchLSTM, self).__init__()
        self.hidden_size = hidden_size; self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers,
                            batch_first=True, dropout=dropout_rate if num_layers > 1 else 0)
        self.linear = nn.Linear(hidden_size, output_size)
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0)); out = self.linear(out[:, -1, :])
        return out

In [113]:
# Model Initialization and Training Loop
from tqdm import tqdm
NUM_EPOCHS = 100
input_size = NUM_FEATURES; hidden_size = 50; num_layers = 2
lstm_model = PyTorchLSTM(input_size, hidden_size, num_layers)
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001); criterion = nn.MSELoss()

for epoch in tqdm(range(NUM_EPOCHS)):
    for sequences, targets_batch in dataloader:
        optimizer.zero_grad(); predictions = lstm_model(sequences)
        loss = criterion(predictions.squeeze(), targets_batch)
        loss.backward(); optimizer.step()
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {loss.item():.4f}")
print("LSTM training completed.")

  1%|          | 1/100 [00:01<02:58,  1.80s/it]

Epoch 1/100, Loss: 0.6213


  2%|▏         | 2/100 [00:02<01:55,  1.18s/it]

Epoch 2/100, Loss: 0.6239


  3%|▎         | 3/100 [00:03<01:33,  1.03it/s]

Epoch 3/100, Loss: 0.0496


  4%|▍         | 4/100 [00:03<01:23,  1.16it/s]

Epoch 4/100, Loss: 0.2294


  5%|▌         | 5/100 [00:04<01:15,  1.26it/s]

Epoch 5/100, Loss: 0.2361


  6%|▌         | 6/100 [00:05<01:10,  1.33it/s]

Epoch 6/100, Loss: 0.1384


  7%|▋         | 7/100 [00:05<01:08,  1.37it/s]

Epoch 7/100, Loss: 0.0227


  8%|▊         | 8/100 [00:06<01:06,  1.39it/s]

Epoch 8/100, Loss: 0.0920


  9%|▉         | 9/100 [00:07<01:05,  1.38it/s]

Epoch 9/100, Loss: 0.0228


 10%|█         | 10/100 [00:09<01:31,  1.01s/it]

Epoch 10/100, Loss: 0.0329


 11%|█         | 11/100 [00:09<01:20,  1.10it/s]

Epoch 11/100, Loss: 0.0242


 12%|█▏        | 12/100 [00:10<01:14,  1.18it/s]

Epoch 12/100, Loss: 0.1437


 13%|█▎        | 13/100 [00:11<01:15,  1.16it/s]

Epoch 13/100, Loss: 0.0763


 14%|█▍        | 14/100 [00:12<01:18,  1.10it/s]

Epoch 14/100, Loss: 0.1681


 15%|█▌        | 15/100 [00:13<01:18,  1.09it/s]

Epoch 15/100, Loss: 0.0078


 16%|█▌        | 16/100 [00:14<01:19,  1.06it/s]

Epoch 16/100, Loss: 0.0324


 17%|█▋        | 17/100 [00:15<01:12,  1.15it/s]

Epoch 17/100, Loss: 0.0353


 18%|█▊        | 18/100 [00:15<01:06,  1.23it/s]

Epoch 18/100, Loss: 0.0254


 19%|█▉        | 19/100 [00:16<01:02,  1.30it/s]

Epoch 19/100, Loss: 0.0119


 20%|██        | 20/100 [00:17<00:59,  1.34it/s]

Epoch 20/100, Loss: 0.0663


 21%|██        | 21/100 [00:17<00:57,  1.37it/s]

Epoch 21/100, Loss: 0.0217


 22%|██▏       | 22/100 [00:18<00:55,  1.41it/s]

Epoch 22/100, Loss: 0.0379


 23%|██▎       | 23/100 [00:19<00:54,  1.41it/s]

Epoch 23/100, Loss: 0.1376


 24%|██▍       | 24/100 [00:19<00:53,  1.42it/s]

Epoch 24/100, Loss: 0.0428


 25%|██▌       | 25/100 [00:20<00:51,  1.45it/s]

Epoch 25/100, Loss: 0.0435


 26%|██▌       | 26/100 [00:21<00:50,  1.45it/s]

Epoch 26/100, Loss: 0.0165


 27%|██▋       | 27/100 [00:21<00:50,  1.45it/s]

Epoch 27/100, Loss: 0.0077


 28%|██▊       | 28/100 [00:22<00:48,  1.47it/s]

Epoch 28/100, Loss: 0.0185


 29%|██▉       | 29/100 [00:23<00:49,  1.43it/s]

Epoch 29/100, Loss: 0.0108


 30%|███       | 30/100 [00:23<00:49,  1.42it/s]

Epoch 30/100, Loss: 0.0271


 31%|███       | 31/100 [00:24<00:52,  1.31it/s]

Epoch 31/100, Loss: 0.1037


 32%|███▏      | 32/100 [00:25<00:56,  1.21it/s]

Epoch 32/100, Loss: 0.0209


 33%|███▎      | 33/100 [00:26<00:57,  1.17it/s]

Epoch 33/100, Loss: 0.0320


 34%|███▍      | 34/100 [00:27<00:59,  1.10it/s]

Epoch 34/100, Loss: 0.0167


 35%|███▌      | 35/100 [00:28<00:54,  1.18it/s]

Epoch 35/100, Loss: 0.0194


 36%|███▌      | 36/100 [00:29<00:51,  1.23it/s]

Epoch 36/100, Loss: 0.0264


 37%|███▋      | 37/100 [00:29<00:48,  1.31it/s]

Epoch 37/100, Loss: 0.0600


 38%|███▊      | 38/100 [00:30<00:45,  1.36it/s]

Epoch 38/100, Loss: 0.0086


 39%|███▉      | 39/100 [00:31<00:44,  1.39it/s]

Epoch 39/100, Loss: 0.0412


 40%|████      | 40/100 [00:31<00:42,  1.41it/s]

Epoch 40/100, Loss: 0.0314


 41%|████      | 41/100 [00:32<00:41,  1.43it/s]

Epoch 41/100, Loss: 0.1416


 42%|████▏     | 42/100 [00:33<00:40,  1.44it/s]

Epoch 42/100, Loss: 0.0233


 43%|████▎     | 43/100 [00:33<00:38,  1.46it/s]

Epoch 43/100, Loss: 0.0053


 44%|████▍     | 44/100 [00:34<00:38,  1.47it/s]

Epoch 44/100, Loss: 0.0145


 45%|████▌     | 45/100 [00:35<00:37,  1.49it/s]

Epoch 45/100, Loss: 0.0131


 46%|████▌     | 46/100 [00:35<00:36,  1.49it/s]

Epoch 46/100, Loss: 0.0260


 47%|████▋     | 47/100 [00:36<00:35,  1.48it/s]

Epoch 47/100, Loss: 0.0831


 48%|████▊     | 48/100 [00:37<00:35,  1.48it/s]

Epoch 48/100, Loss: 0.0375


 49%|████▉     | 49/100 [00:38<00:35,  1.45it/s]

Epoch 49/100, Loss: 0.1214


 50%|█████     | 50/100 [00:40<01:01,  1.23s/it]

Epoch 50/100, Loss: 0.3551


 51%|█████     | 51/100 [00:42<01:15,  1.54s/it]

Epoch 51/100, Loss: 0.0048


 52%|█████▏    | 52/100 [00:44<01:09,  1.45s/it]

Epoch 52/100, Loss: 0.0086


 53%|█████▎    | 53/100 [00:44<00:57,  1.22s/it]

Epoch 53/100, Loss: 0.0241


 54%|█████▍    | 54/100 [00:45<00:48,  1.05s/it]

Epoch 54/100, Loss: 0.0161


 55%|█████▌    | 55/100 [00:46<00:42,  1.06it/s]

Epoch 55/100, Loss: 0.0425


 56%|█████▌    | 56/100 [00:46<00:37,  1.16it/s]

Epoch 56/100, Loss: 0.0377


 57%|█████▋    | 57/100 [00:47<00:34,  1.24it/s]

Epoch 57/100, Loss: 0.0311


 58%|█████▊    | 58/100 [00:48<00:32,  1.31it/s]

Epoch 58/100, Loss: 0.0400


 59%|█████▉    | 59/100 [00:48<00:30,  1.36it/s]

Epoch 59/100, Loss: 0.0488


 60%|██████    | 60/100 [00:49<00:28,  1.38it/s]

Epoch 60/100, Loss: 0.0109


 61%|██████    | 61/100 [00:50<00:28,  1.38it/s]

Epoch 61/100, Loss: 0.0413


 62%|██████▏   | 62/100 [00:50<00:27,  1.40it/s]

Epoch 62/100, Loss: 0.0435


 63%|██████▎   | 63/100 [00:51<00:26,  1.40it/s]

Epoch 63/100, Loss: 0.0049


 64%|██████▍   | 64/100 [00:52<00:26,  1.35it/s]

Epoch 64/100, Loss: 0.0368


 65%|██████▌   | 65/100 [00:53<00:28,  1.24it/s]

Epoch 65/100, Loss: 0.0272


 66%|██████▌   | 66/100 [00:54<00:29,  1.15it/s]

Epoch 66/100, Loss: 0.0191


 67%|██████▋   | 67/100 [00:55<00:29,  1.11it/s]

Epoch 67/100, Loss: 0.0165


 68%|██████▊   | 68/100 [00:56<00:28,  1.13it/s]

Epoch 68/100, Loss: 0.0692


 69%|██████▉   | 69/100 [00:56<00:25,  1.21it/s]

Epoch 69/100, Loss: 0.0389


 70%|███████   | 70/100 [00:57<00:23,  1.28it/s]

Epoch 70/100, Loss: 0.0268


 71%|███████   | 71/100 [00:58<00:21,  1.33it/s]

Epoch 71/100, Loss: 0.0825


 72%|███████▏  | 72/100 [00:58<00:20,  1.37it/s]

Epoch 72/100, Loss: 0.1051


 73%|███████▎  | 73/100 [00:59<00:19,  1.36it/s]

Epoch 73/100, Loss: 0.0241


 74%|███████▍  | 74/100 [01:00<00:18,  1.40it/s]

Epoch 74/100, Loss: 0.0271


 75%|███████▌  | 75/100 [01:00<00:17,  1.43it/s]

Epoch 75/100, Loss: 0.0095


 76%|███████▌  | 76/100 [01:01<00:16,  1.46it/s]

Epoch 76/100, Loss: 0.0173


 77%|███████▋  | 77/100 [01:02<00:15,  1.45it/s]

Epoch 77/100, Loss: 0.0200


 78%|███████▊  | 78/100 [01:02<00:15,  1.44it/s]

Epoch 78/100, Loss: 0.1535


 79%|███████▉  | 79/100 [01:03<00:14,  1.46it/s]

Epoch 79/100, Loss: 0.0170


 80%|████████  | 80/100 [01:04<00:13,  1.47it/s]

Epoch 80/100, Loss: 0.0095


 81%|████████  | 81/100 [01:05<00:12,  1.47it/s]

Epoch 81/100, Loss: 0.0081


 82%|████████▏ | 82/100 [01:05<00:12,  1.49it/s]

Epoch 82/100, Loss: 0.0302


 83%|████████▎ | 83/100 [01:06<00:12,  1.33it/s]

Epoch 83/100, Loss: 0.0118


 84%|████████▍ | 84/100 [01:07<00:13,  1.19it/s]

Epoch 84/100, Loss: 0.0219


 85%|████████▌ | 85/100 [01:08<00:13,  1.14it/s]

Epoch 85/100, Loss: 0.0235


 86%|████████▌ | 86/100 [01:09<00:13,  1.00it/s]

Epoch 86/100, Loss: 0.0062


 87%|████████▋ | 87/100 [01:10<00:11,  1.11it/s]

Epoch 87/100, Loss: 0.0053


 88%|████████▊ | 88/100 [01:11<00:10,  1.20it/s]

Epoch 88/100, Loss: 0.0028


 89%|████████▉ | 89/100 [01:11<00:08,  1.26it/s]

Epoch 89/100, Loss: 0.0173


 90%|█████████ | 90/100 [01:12<00:07,  1.32it/s]

Epoch 90/100, Loss: 0.0371


 91%|█████████ | 91/100 [01:13<00:06,  1.34it/s]

Epoch 91/100, Loss: 0.0285


 92%|█████████▏| 92/100 [01:14<00:05,  1.37it/s]

Epoch 92/100, Loss: 0.0251


 93%|█████████▎| 93/100 [01:14<00:04,  1.41it/s]

Epoch 93/100, Loss: 0.0226


 94%|█████████▍| 94/100 [01:15<00:04,  1.43it/s]

Epoch 94/100, Loss: 0.0243


 95%|█████████▌| 95/100 [01:16<00:03,  1.43it/s]

Epoch 95/100, Loss: 0.0151


 96%|█████████▌| 96/100 [01:16<00:02,  1.43it/s]

Epoch 96/100, Loss: 0.7250


 97%|█████████▋| 97/100 [01:17<00:02,  1.29it/s]

Epoch 97/100, Loss: 0.1174


 98%|█████████▊| 98/100 [01:18<00:01,  1.24it/s]

Epoch 98/100, Loss: 0.0141


 99%|█████████▉| 99/100 [01:19<00:00,  1.30it/s]

Epoch 99/100, Loss: 0.0135


100%|██████████| 100/100 [01:20<00:00,  1.25it/s]

Epoch 100/100, Loss: 0.0039
LSTM training completed.





In [123]:
# XGBoost residual training
def generate_lstm_predictions(model, dataset):
    dataloader = DataLoader(dataset, batch_size=64, shuffle=False)
    model.eval(); all_predictions = []
    with torch.no_grad():
        for sequences, _ in dataloader:
            predictions_tensor = model(sequences)
            all_predictions.append(predictions_tensor.cpu().numpy())
    return np.concatenate(all_predictions).flatten()

base_lstm_predictions = generate_lstm_predictions(lstm_model, dataset)
actual_targets_for_xgb = target[LOOKBACK_WINDOW:] # Targets corresponding to the predictions
xgb_residual_target = actual_targets_for_xgb - base_lstm_predictions

X_xgb = clean_df[XGB_FEATURES].iloc[LOOKBACK_WINDOW:].values

xgb_residual_model = XGBRegressor(objective='reg:squarederror', n_estimators=500, max_depth=5,
                                 learning_rate=0.05, tree_method='hist', random_state=42)
xgb_residual_model.fit(X_xgb, xgb_residual_target)
print("XGBoost Residual Model trained.")

XGBoost Residual Model trained.


In [119]:
# saving model
torch.save(lstm_model.state_dict(), 'lstm_model_weights.pth')
joblib.dump(xgb_residual_model, 'xgb_residual_model.joblib')
joblib.dump(scaler, 'minmax_scaler.joblib')
print("Models and Scaler saved.")

Models and Scaler saved.


In [116]:
LAST_TRAINING_INDEX = len(clean_df) - 1
initial_history_row = clean_df.iloc[LAST_TRAINING_INDEX].to_dict()
initial_history_row = pd.Series(initial_history_row)

# 2. Get the 21-day buffer data
START_BUFFER_INDEX = len(clean_df) - LOOKBACK_WINDOW

# Extract the last LOOKBACK_WINDOW rows of unscaled LSTM features
initial_buffer_data = clean_df[LSTM_FEATURES].iloc[START_BUFFER_INDEX:len(clean_df)].values

# 3. Initialize the History Buffer (deque)
initial_history_buffer = deque(
    [row for row in initial_buffer_data],
    maxlen=LOOKBACK_WINDOW
)

print(f"\nInference Initialized: Buffer size is {len(initial_history_buffer)}.")
print(f"Next prediction will be based on the last row:\n{initial_history_row.to_frame().T}")

# You are now ready to execute the run_recursive_inference_HYBRID function
# from the previous response using these initialized, real-data variables.


Inference Initialized: Buffer size is 21.
Next prediction will be based on the last row:
                  date price_usd_per_mmbtu        source sentiment_score  \
0  2019-12-30 00:00:00                2.06  FRED:DHHNGSP         0.86988   

  rolling_mean_7 rolling_mean_30 sentiment_rolling_7 sentiment_rolling_30  \
0       2.087273         2.29528            0.162096             0.067071   

  crossover_signal sentiment_interaction price_lag2 price_lag7 price_lag30  \
0                0                   0.0       2.11        2.3        2.66   

  target  
0   2.09  


In [117]:
# Inference
LOOKBACK_WINDOW = 21
NUM_FEATURES = 3
LSTM_FEATURES = ['price_usd_per_mmbtu', 'rolling_mean_7', 'rolling_mean_30']
XGB_FEATURES = [
    "price_lag2", "price_lag7", "price_lag30", "rolling_mean_7", "rolling_mean_30",
    "sentiment_score", "sentiment_rolling_7", "sentiment_rolling_30",
    "crossover_signal", "sentiment_interaction"
]

In [None]:
class PyTorchLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size=1, dropout_rate=0.2):
        super(PyTorchLSTM, self).__init__()
        self.hidden_size = hidden_size; self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers,
                            batch_first=True, dropout=dropout_rate if num_layers > 1 else 0)
        self.linear = nn.Linear(hidden_size, output_size)
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0)); out = self.linear(out[:, -1, :])
        return out

# Helper Function: Feature Calculation for Inference
def create_inference_features_realtime(history_row, current_date_sentiment):
    alpha_7 = 2 / (7 + 1); alpha_30 = 2 / (30 + 1)
    current_price = history_row['price_usd_per_mmbtu']

    # Calculate NEXT day's EWMs based on T-1 state
    rolling_mean_7 = (current_price * alpha_7) + (history_row['rolling_mean_7'] * (1 - alpha_7))
    rolling_mean_30 = (current_price * alpha_30) + (history_row['rolling_mean_30'] * (1 - alpha_30))
    sentiment_rolling_7 = (current_date_sentiment * alpha_7) + (history_row['sentiment_rolling_7'] * (1 - alpha_7))
    sentiment_rolling_30 = (current_date_sentiment * alpha_30) + (history_row['sentiment_rolling_30'] * (1 - alpha_30))

    crossover_signal = 1 if rolling_mean_7 > rolling_mean_30 else 0
    sentiment_interaction = current_date_sentiment * crossover_signal

    features = {
        'price_lag2': history_row['price_lag2'], 'price_lag7': history_row['price_lag7'], 'price_lag30': history_row['price_lag30'],
        'rolling_mean_7': rolling_mean_7, 'rolling_mean_30': rolling_mean_30,
        'sentiment_score': current_date_sentiment,
        'sentiment_rolling_7': sentiment_rolling_7, 'sentiment_rolling_30': sentiment_rolling_30,
        'crossover_signal': crossover_signal, 'sentiment_interaction': sentiment_interaction
    }
    return pd.DataFrame([features])

# Helper Function: Sequence Scaling
def get_scaled_sequence(history_buffer, scaler):
    unscaled_sequence = np.stack(list(history_buffer))
    scaled_sequence = scaler.transform(unscaled_sequence)
    X_sequence = torch.tensor(scaled_sequence, dtype=torch.float32).unsqueeze(0)
    return X_sequence

# Helper Function: Load Artifacts
def load_artifacts():
    print("Loading model artifacts...")
    try:
        scaler = joblib.load('minmax_scaler.joblib')
        xgb_residual_model = joblib.load('xgb_residual_model.joblib')

        input_size, hidden_size, num_layers = NUM_FEATURES, 50, 2
        lstm_model = PyTorchLSTM(input_size, hidden_size, num_layers)

        lstm_model_state = torch.load('lstm_model_weights.pth', weights_only=False)
        lstm_model.load_state_dict(lstm_model_state)

        lstm_model.eval()
        print("Artifacts loaded successfully.")

        return lstm_model, xgb_residual_model, scaler

    except FileNotFoundError as e:
        print(f"Error: One or more artifact files not found. Run the training cell first to generate: {e}")
        # Return dummy models to prevent crash, allowing the history logic to be tested.
        class DummyLSTM(nn.Module):
            def __init__(self): super().__init__(); self.linear = nn.Linear(50, 1)
            def forward(self, x): return torch.tensor(5.0, dtype=torch.float32).view(-1, 1)

        dummy_scaler = MinMaxScaler().fit(np.random.rand(100, 3))
        dummy_xgb = XGBRegressor().fit(np.random.rand(10, 10), np.random.rand(10))
        return DummyLSTM(), dummy_xgb, dummy_scaler

In [None]:
def run_recursive_inference_HYBRID_REALTIME(
    lstm_model, xgb_residual_model, last_historical_row, history_buffer,
    sent_lookup, dates_to_predict, scaler, LOOKBACK_WINDOW, XGB_FEATURES, raw_price_history
):

    last_historical_row = last_historical_row.copy()
    results = []

    print(f"\n--- Starting Hybrid Prediction for {len(dates_to_predict)} Days ---")

    for target_date_str in dates_to_predict:
        target_date = pd.to_datetime(target_date_str)
        sentiment_day_raw_data = target_date - pd.Timedelta(days=1)
        current_sentiment_feature = sent_lookup.get(sentiment_day_raw_data, 0.0)

        # Calculate Single-Step Features (Includes UN-SCALED T+1 EWMs)
        X_predict_df = create_inference_features_realtime(last_historical_row, current_sentiment_feature)

        # Extract UN-SCALED T+1 EWMs for the current time step (T+1)
        new_ewm7_unscaled = X_predict_df['rolling_mean_7'].iloc[0]
        new_ewm30_unscaled = X_predict_df['rolling_mean_30'].iloc[0]

        # 1. LSTM Prediction (Base Forecast)
        X_sequence_tensor = get_scaled_sequence(history_buffer, scaler)
        with torch.no_grad():
            # Get the SCALED LSTM prediction for T+1 price
            scaled_lstm_prediction_T_plus_1 = lstm_model(X_sequence_tensor).cpu().numpy().flatten()[0]

        # --- CRITICAL FIX: INVERSE SCALING ---
        # A. Create a dummy row for the UN-SCALED EWMs and a placeholder for the price.
        # This allows us to scale the EWMs correctly using the existing scaler.

        # Use the LAST KNOWN UN-SCALED PRICE (T) as the placeholder for price
        # and the calculated UN-SCALED EWMs (T+1) as the other two columns.
        # This ensures the T+1 EWMs are properly scaled using the scaler's price range.
        unscaled_ewm_row = np.array([[
            last_historical_row['price_usd_per_mmbtu'], # Placeholder price (T)
            new_ewm7_unscaled,                         # EWM_7 (T+1)
            new_ewm30_unscaled                         # EWM_30 (T+1)
        ]])

        # B. Scale the row to get the correctly scaled EWM features for T+1
        scaled_ewm_row = scaler.transform(unscaled_ewm_row)

        # C. Construct the final scaled prediction row:
        # [Scaled Price (T+1), Scaled EWM7 (T+1), Scaled EWM30 (T+1)]
        scaled_prediction_row = np.array([[
            scaled_lstm_prediction_T_plus_1,
            scaled_ewm_row[0, 1],
            scaled_ewm_row[0, 2]
        ]])

        # D. Inverse transform the complete row. The UN-SCALED price is the first element [0, 0].
        base_lstm_prediction_unscaled = scaler.inverse_transform(scaled_prediction_row)[0, 0]

        # 2. XGBoost Correction (Residual)
        X_xgb_input = X_predict_df[XGB_FEATURES].values
        predicted_residual = xgb_residual_model.predict(X_xgb_input)[0]

        # 3. Final Hybrid Prediction
        # Combine the UN-SCALED base forecast with the UN-SCALED residual.
        predicted_price = base_lstm_prediction_unscaled + predicted_residual
        results.append({'id': target_date_str, 'price_usd_per_mmbtu': predicted_price})

        # --- HISTORY UPDATE ---

        # Update Raw Price History (must come before updating historical_row)
        raw_price_history.append(predicted_price) # Add the new predicted price

        # Update Price Lags (uses the raw_price_history deque)
        last_historical_row['price_lag30'] = raw_price_history[0]
        last_historical_row['price_lag7'] = raw_price_history[-7]
        last_historical_row['price_lag2'] = raw_price_history[-2]

        # Update single-row history state (T+1 values become T values for the next iteration)
        last_historical_row['price_usd_per_mmbtu'] = predicted_price
        last_historical_row['rolling_mean_7'] = new_ewm7_unscaled      # Use UN-SCALED T+1 EWM
        last_historical_row['rolling_mean_30'] = new_ewm30_unscaled    # Use UN-SCALED T+1 EWM
        last_historical_row['sentiment_rolling_7'] = X_predict_df['sentiment_rolling_7'].iloc[0]
        last_historical_row['sentiment_rolling_30'] = X_predict_df['sentiment_rolling_30'].iloc[0]
        last_historical_row['sentiment_score'] = current_sentiment_feature
        last_historical_row['crossover_signal'] = X_predict_df['crossover_signal'].iloc[0]
        last_historical_row['sentiment_interaction'] = X_predict_df['sentiment_interaction'].iloc[0]

        # Update the LSTM History Buffer (FIFO queue)
        # Use the final UN-SCALED T+1 predicted price and UN-SCALED T+1 EWMs
        new_lstm_row = np.array([predicted_price, new_ewm7_unscaled, new_ewm30_unscaled])
        history_buffer.append(new_lstm_row)

    return pd.DataFrame(results)

In [None]:
# DATA SETUP & EXECUTION
import os
# Data Pre-checks and Loading
if not os.path.exists("/content/training_data.csv") or not os.path.exists("/content/sentiment_data.csv") or not os.path.exists("/content/test-template.csv"):
    raise FileNotFoundError("Please upload 'training_data.csv', 'sentiment_data.csv', and 'test-template.csv' to your Colab environment.")

# Assuming the training was executed previously, we now load the data to define the start point
price_df = pd.read_csv("/content/training_data.csv", parse_dates=['date'])
sent_df = pd.read_csv("/content/sentiment_data.csv", parse_dates=['date'])
test_template_df = pd.read_csv("/content/test-template.csv", parse_dates=['id'])

# Re-run Feature Engineering to define 'clean_df' final state
def create_full_training_df(price_df, sent_df):
    merged_df = pd.merge(price_df, sent_df.assign(date=lambda x: x['date'].shift(1)), on="date", how="left").fillna(0)
    merged_df = merged_df.dropna(subset=["price_usd_per_mmbtu"]).reset_index(drop=True)

    span_fast = 7; span_slow = 30
    merged_df[f'rolling_mean_{span_fast}'] = merged_df['price_usd_per_mmbtu'].ewm(span=span_fast, adjust=False).mean()
    merged_df[f'rolling_mean_{span_slow}'] = merged_df['price_usd_per_mmbtu'].ewm(span=span_slow, adjust=False).mean()
    merged_df[f'sentiment_rolling_{span_fast}'] = merged_df['sentiment_score'].ewm(span=span_fast, adjust=False).mean()
    merged_df[f'sentiment_rolling_{span_slow}'] = merged_df['sentiment_score'].ewm(span=span_slow, adjust=False).mean()
    merged_df['crossover_signal'] = np.where(merged_df[f'rolling_mean_{span_fast}'] > merged_df[f'rolling_mean_{span_slow}'], 1, 0)
    merged_df['sentiment_interaction'] = merged_df['sentiment_score'] * merged_df['crossover_signal']
    for lag in [2, 7, 30]:
        merged_df[f'price_lag{lag}'] = merged_df['price_usd_per_mmbtu'].shift(lag)

    merged_df['target'] = merged_df['price_usd_per_mmbtu'].shift(-1)

    # We drop NAs but keep the last row with a NaN target (if any) because we need it for initialization
    clean_df = merged_df.dropna(subset=XGB_FEATURES + LSTM_FEATURES).reset_index(drop=True)
    return clean_df


clean_df = create_full_training_df(price_df, sent_df)

# Load Models and Initialize History
lstm_model, xgb_residual_model, scaler = load_artifacts()

# --- NEW RAW PRICE HISTORY & LAG INITIALIZATION (The Fix) ---

LAG_HISTORY_LENGTH = 30
LAST_TRAINING_INDEX = len(clean_df) - 1

# 1. Get the last 30 UN-SCALED price points (T-30 to T-1).
# We need the last 30 valid prices *before* the current row's price (T).
# The price at T is clean_df.iloc[LAST_TRAINING_INDEX]['price_usd_per_mmbtu']
# We take the 30 days ending at T-1.
raw_price_data = clean_df['price_usd_per_mmbtu'].iloc[LAST_TRAINING_INDEX - LAG_HISTORY_LENGTH : LAST_TRAINING_INDEX].values

# 2. CRITICAL: Initialize a deque to maintain the raw price history (30 items)
raw_price_history = deque(raw_price_data, maxlen=LAG_HISTORY_LENGTH)

# 3. Initialize the main historical row state
# This row contains the EWMs and other calculated features for the T-1 price.
initial_history_row = clean_df.iloc[LAST_TRAINING_INDEX].to_dict()
initial_history_row = pd.Series(initial_history_row)

# 4. Correctly set the lag values in the starting row using the new deque
initial_history_row['price_lag30'] = raw_price_history[0]   # Price 30 days ago (first item in deque)
initial_history_row['price_lag7'] = raw_price_history[-7]    # Price 7 days ago
initial_history_row['price_lag2'] = raw_price_history[-2]    # Price 2 days ago

# --- EXISTING LSTM BUFFER SETUP ---
# Initialize LSTM History Buffer (the last 21 UN-SCALED rows)
START_BUFFER_INDEX = len(clean_df) - LOOKBACK_WINDOW
initial_buffer_data = clean_df[LSTM_FEATURES].iloc[START_BUFFER_INDEX:len(clean_df)].values
initial_history_buffer = deque(
    [row for row in initial_buffer_data],
    maxlen=LOOKBACK_WINDOW
)

# Prepare Prediction Dates and Sentiment Lookup (Same logic as before)
dates_to_predict = test_template_df['id'].dt.strftime('%Y-%m-%d').tolist()
prediction_dates = test_template_df['id'].tolist()
sentiment_dates_needed = [d - pd.Timedelta(days=1) for d in prediction_dates]

# We need a robust way to generate sentiment for the *entire* prediction period.
# In a real environment, you'd load this from a real-time feed.
# For this execution, we simulate plausible sentiment based on the date index.
def generate_simulated_sentiment(start_date, num_days):
    np.random.seed(42) # Ensure deterministic simulation
    return {
        start_date + pd.Timedelta(days=i): np.random.uniform(-0.4, 0.4)
        for i in range(num_days)
    }

# Simulating sentiment from the day before the first prediction
start_sentiment_date = prediction_dates[0] - pd.Timedelta(days=1)
sent_lookup = generate_simulated_sentiment(start_sentiment_date, len(sentiment_dates_needed))

Loading model artifacts...
Artifacts loaded successfully.


In [None]:
# Execute Final Inference
final_predictions = run_recursive_inference_HYBRID_REALTIME(
    lstm_model=lstm_model,
    xgb_residual_model=xgb_residual_model,
    last_historical_row=initial_history_row,
    history_buffer=initial_history_buffer,
    sent_lookup=sent_lookup,
    dates_to_predict=dates_to_predict,
    scaler=scaler,
    LOOKBACK_WINDOW=LOOKBACK_WINDOW,
    XGB_FEATURES=XGB_FEATURES,
    raw_price_history=raw_price_history
)

print("\n==================================================")
print("✅ HYBRID MODEL INFERENCE COMPLETE")
print("==================================================")
print(f"Total Predictions Generated: {len(final_predictions)}")
print("\nPredictions Output:")
print(final_predictions.to_string(index=False))


--- Starting Hybrid Prediction for 1434 Days ---

✅ HYBRID MODEL INFERENCE COMPLETE
Total Predictions Generated: 1434

Predictions Output:
        id  price_usd_per_mmbtu
2020-01-02            14.412982
2020-01-03            34.834186
2020-01-06            39.766480
2020-01-07            38.962099
2020-01-08            39.825762
2020-01-09            40.034911
2020-01-10            39.822818
2020-01-13            39.796482
2020-01-14            39.468927
2020-01-15            39.394839
2020-01-16            39.502522
2020-01-17            39.558537
2020-01-21            39.296081
2020-01-22            39.462497
2020-01-23            39.476055
2020-01-24            39.339650
2020-01-27            39.530967
2020-01-28            39.536051
2020-01-29            39.878312
2020-01-30            40.580233
2020-01-31            42.324722
2020-02-03            42.917006
2020-02-04            43.190256
2020-02-05            43.189050
2020-02-06            43.262068
2020-02-07            43.104