In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys


import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler  # ADD THIS LINE
color_pal = sns.color_palette()
plt.style.use('fivethirtyeight')
sys.path.append('../src')
from irish_buoy_data import IrishBuoyData

import torch
import torch.nn as nn
# %%
from torch.utils.data import Subset
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader

# Add Numba imports here
from numba import jit, prange

# Preparing the Data Frame

In [2]:
# Step 1: Get your multi-level data (you already have this)
buoy_stations = ['M2', 'M3', 'M4', 'M5', 'M6']
all_buoy_data = []

for station_id in buoy_stations:
    try:
        buoy = IrishBuoyData(station_id=station_id)
        data = buoy.fetch_data(days_back=1875)
        data = data[~data.index.duplicated(keep='first')]
        data = data.drop(columns=['station_id'])
        all_buoy_data.append(data)
    except Exception as e:
        print(f"  ✗ {station_id}: Error - {e}")

# Create multi-level DataFrame
df_multi = pd.concat(all_buoy_data, axis=1, keys=buoy_stations)
df_multi = df_multi.interpolate(method='time', limit=3).fillna(method='ffill').fillna(method='bfill')

# Step 2: Convert to 3D tensor [time, buoys, features]
def multi_to_tensor(df_multi):
    """
    Convert multi-level DataFrame to 3D tensor
    Structure: [timesteps, buoys, features]
    """
    buoy_ids = df_multi.columns.levels[0].tolist()
    n_timesteps = len(df_multi)
    n_buoys = len(buoy_ids)
    n_features = len(df_multi[buoy_ids[0]].columns)
    
    # Initialize tensor
    X_tensor = np.zeros((n_timesteps, n_buoys, n_features))
    
    # Fill tensor
    for b_idx, buoy in enumerate(buoy_ids):
        X_tensor[:, b_idx, :] = df_multi[buoy].values
    
    return X_tensor

# Create the tensor
X_tensor = multi_to_tensor(df_multi)

print(f"✓ Tensor created!")
print(f"Shape: {X_tensor.shape}")
print(f"  - Timesteps: {X_tensor.shape[0]}")
print(f"  - Buoys: {X_tensor.shape[1]}")
print(f"  - Features: {X_tensor.shape[2]}")

# Verify structure
print(f"\n=== Verification ===")
print(f"Feature names: {df_multi['M2'].columns.tolist()}")
print(f"\nM3 state at first timestep:")
print(X_tensor[0, 2, :])  # buoy index 2 = M3

print(f"\nDataFrame equivalent:")
print(df_multi['M4'].iloc[0])

✓ Tensor created!
Shape: (44200, 5, 6)
  - Timesteps: 44200
  - Buoys: 5
  - Features: 6

=== Verification ===
Feature names: ['WindSpeed (knots)', 'AirTemperature (degrees_C)', 'AtmosphericPressure (millibars)', 'WaveHeight (meters)', 'Hmax (meters)', 'Tp (seconds)']

M3 state at first timestep:
[  14.117    9.775 1001.16     3.125    4.805   12.539]

DataFrame equivalent:
WindSpeed (knots)                    14.117
AirTemperature (degrees_C)            9.775
AtmosphericPressure (millibars)    1001.160
WaveHeight (meters)                   3.125
Hmax (meters)                         4.805
Tp (seconds)                         12.539
Name: 2020-11-11 23:00:00+00:00, dtype: float64


  df_multi = df_multi.interpolate(method='time', limit=3).fillna(method='ffill').fillna(method='bfill')


# Preparing the Model

In [3]:
# ============================================================================
# Simple LSTM Model for Buoy State Prediction
# ============================================================================

import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

# ============================================================================
# STEP 1: Simple Dataset (No External Forcing)
# ============================================================================

class SimpleBuoyDataset(Dataset):
    """
    Simple dataset: predict X_{t+1} from [X_{t-n}, ..., X_t]
    """
    def __init__(self, X_tensor, sequence_length=24):
        """
        Args:
            X_tensor: (timesteps, n_buoys, n_features)
            sequence_length: how many past timesteps to use
        """
        self.X = torch.FloatTensor(X_tensor)
        self.seq_len = sequence_length
        
    def __len__(self):
        return len(self.X) - self.seq_len
    
    def __getitem__(self, idx):
        # Input: past sequence [X_{t-n}, ..., X_t]
        X_history = self.X[idx:idx+self.seq_len]  # (seq_len, n_buoys, n_features)
        
        # Target: next state X_{t+1}
        X_target = self.X[idx+self.seq_len]  # (n_buoys, n_features)
        
        return X_history, X_target

# ============================================================================
# STEP 2: Simple LSTM Model
# ============================================================================

class SimpleLSTM(nn.Module):
    """
    Simple LSTM: X_{t+1} = f(X_t, X_{t-1}, ..., X_{t-n})
    
    Architecture:
    1. Flatten buoy dimensions
    2. LSTM processes sequence
    3. FC layer predicts next state
    """
    def __init__(self, n_buoys=5, n_features=6, hidden_dim=128, num_layers=2):
        super().__init__()
        
        self.n_buoys = n_buoys
        self.n_features = n_features
        self.input_dim = n_buoys * n_features  # Flatten: 5 * 6 = 30
        
        # LSTM layers
        self.lstm = nn.LSTM(
            input_size=self.input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2 if num_layers > 1 else 0
        )
        
        # Output layer
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, self.input_dim)
        )
        
    def forward(self, x):
        """
        Args:
            x: (batch, seq_len, n_buoys, n_features)
        Returns:
            prediction: (batch, n_buoys, n_features)
        """
        batch_size, seq_len, n_buoys, n_features = x.shape
        
        # Flatten spatial dimensions: (batch, seq_len, n_buoys*n_features)
        x_flat = x.reshape(batch_size, seq_len, -1)
        
        # LSTM forward
        lstm_out, (h_n, c_n) = self.lstm(x_flat)
        
        # Use final hidden state
        final_hidden = h_n[-1]  # (batch, hidden_dim)
        
        # Predict next state
        output = self.fc(final_hidden)  # (batch, n_buoys*n_features)
        
        # Reshape back
        output = output.reshape(batch_size, n_buoys, n_features)
        
        return output

# ============================================================================
# STEP 3: Fast Training Function
# ============================================================================

def train_lstm(model, train_loader, val_loader, num_epochs=20, lr=0.001, device='cpu'):
    """Simple, fast training loop"""
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.5)
    criterion = nn.MSELoss()
    
    train_losses = []
    val_losses = []
    
    import time
    
    for epoch in range(num_epochs):
        start_time = time.time()
        
        # Training
        model.train()
        train_loss = 0.0
        
        for X_history, X_target in train_loader:
            X_history = X_history.to(device)
            X_target = X_target.to(device)
            
            optimizer.zero_grad()
            
            # Forward pass
            pred = model(X_history)
            loss = criterion(pred, X_target)
            
            # Backward pass
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            
            train_loss += loss.item()
        
        train_loss /= len(train_loader)
        train_losses.append(train_loss)
        
        # Validation
        model.eval()
        val_loss = 0.0
        
        with torch.no_grad():
            for X_history, X_target in val_loader:
                X_history = X_history.to(device)
                X_target = X_target.to(device)
                
                pred = model(X_history)
                loss = criterion(pred, X_target)
                val_loss += loss.item()
        
        val_loss /= len(val_loader)
        val_losses.append(val_loss)
        
        scheduler.step(val_loss)
        
        epoch_time = time.time() - start_time
        print(f"Epoch {epoch+1}/{num_epochs} - Train: {train_loss:.6f}, Val: {val_loss:.6f} - {epoch_time:.1f}s")
    
    return train_losses, val_losses

# ============================================================================
# STEP 4: Setup and Train
# ============================================================================

In [4]:
# ============================================================================
# FIX: NORMALIZE DATA BEFORE TRAINING
# ============================================================================

print("="*70)
print("NORMALIZING DATA")
print("="*70)

# Your current data
recent_size = 30 * 24 * 6  # 6 months
X_tensor_recent = X_tensor[-recent_size:]

print(f"Original data shape: {X_tensor_recent.shape}")
print(f"Original stats - Mean: {X_tensor_recent.mean():.2f}, Std: {X_tensor_recent.std():.2f}")
print(f"Original range: [{X_tensor_recent.min():.2f}, {X_tensor_recent.max():.2f}]")

# NORMALIZE PROPERLY
from sklearn.preprocessing import StandardScaler

n_timesteps, n_buoys, n_features = X_tensor_recent.shape

# Store scalers for each feature (to denormalize later)
scalers = []
X_normalized = np.zeros_like(X_tensor_recent)

feature_names = ['WindSpeed', 'AirTemp', 'Pressure', 'WaveHeight', 'Hmax', 'Tp']

for f_idx in range(n_features):
    # Extract all values for this feature
    feature_values = X_tensor_recent[:, :, f_idx].reshape(-1, 1)
    
    # Fit scaler
    scaler = StandardScaler()
    normalized_values = scaler.fit_transform(feature_values)
    
    # Reshape back
    X_normalized[:, :, f_idx] = normalized_values.reshape(n_timesteps, n_buoys)
    
    scalers.append(scaler)
    
    print(f"{feature_names[f_idx]:12s} - Original: [{feature_values.min():.2f}, {feature_values.max():.2f}], "
          f"Normalized: [{normalized_values.min():.2f}, {normalized_values.max():.2f}]")

print(f"\nNormalized stats - Mean: {X_normalized.mean():.6f}, Std: {X_normalized.std():.6f}")


NORMALIZING DATA
Original data shape: (4320, 5, 6)
Original stats - Mean: 176.09, Std: 373.40
Original range: [0.00, 1041.02]
WindSpeed    - Original: [0.00, 48.95], Normalized: [-2.26, 4.80]
AirTemp      - Original: [5.28, 22.96], Normalized: [-3.23, 3.67]
Pressure     - Original: [900.00, 1041.02], Normalized: [-7.87, 2.14]
WaveHeight   - Original: [0.12, 12.54], Normalized: [-1.44, 5.41]
Hmax         - Original: [0.00, 23.12], Normalized: [-1.46, 6.25]
Tp           - Original: [2.34, 22.27], Normalized: [-2.10, 3.87]

Normalized stats - Mean: 0.000000, Std: 1.000000


In [5]:
# Run this first
print(SimpleBuoyDataset)  # Should show: <class '__main__.SimpleBuoyDataset'>

<class '__main__.SimpleBuoyDataset'>


In [6]:
# %%
# Check what X_normalized is
print(f"X_normalized type: {type(X_normalized)}")
print(f"X_normalized shape: {X_normalized.shape}")
print(f"X_normalized dtype: {X_normalized.dtype}")
print(f"X_normalized size (MB): {X_normalized.nbytes / 1e6:.2f}")

# Check if it's actually normalized
print(f"\nMean: {X_normalized.mean():.6f}")
print(f"Std: {X_normalized.std():.6f}")

X_normalized type: <class 'numpy.ndarray'>
X_normalized shape: (4320, 5, 6)
X_normalized dtype: float64
X_normalized size (MB): 1.04

Mean: 0.000000
Std: 1.000000


In [None]:
# ============================================================================
# NOW CREATE DATASET WITH NORMALIZED DATA
# ============================================================================

sequence_length = 4
dataset = SimpleBuoyDataset(X_normalized, sequence_length=sequence_length)

print(f"\nDataset size: {len(dataset)} samples")

# Split
train_size = int(0.8 * len(dataset))
train_indices = list(range(train_size))
val_indices = list(range(train_size, len(dataset)))

from torch.utils.data import Subset
train_dataset = Subset(dataset, train_indices)
val_dataset = Subset(dataset, val_indices)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=512, shuffle=False, num_workers=0)

print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}")

In [None]:
# ============================================================================
# RETRAIN WITH NORMALIZED DATA
# ============================================================================

device = 'mps' if torch.backends.mps.is_available() else 'cpu'
model = SimpleLSTM(n_buoys=5, n_features=6, hidden_dim=128, num_layers=2)

print(f"\nRetraining with normalized data...")

train_losses, val_losses = train_lstm(
    model, train_loader, val_loader, 
    num_epochs=20, 
    lr=0.001,
    device=device
)

In [None]:
# ============================================================================
# DENORMALIZE PREDICTIONS FOR EVALUATION
# ============================================================================

model.eval()
all_preds = []
all_targets = []

with torch.no_grad():
    for X_history, X_target in val_loader:
        X_history = X_history.to(device)
        pred = model(X_history)
        all_preds.append(pred.cpu().numpy())
        all_targets.append(X_target.numpy())

predictions_normalized = np.vstack(all_preds)
actuals_normalized = np.vstack(all_targets)

# Denormalize
predictions = np.zeros_like(predictions_normalized)
actuals = np.zeros_like(actuals_normalized)

for f_idx in range(n_features):
    # Predictions
    pred_flat = predictions_normalized[:, :, f_idx].reshape(-1, 1)
    pred_denorm = scalers[f_idx].inverse_transform(pred_flat)
    predictions[:, :, f_idx] = pred_denorm.reshape(-1, n_buoys)
    
    # Actuals
    actual_flat = actuals_normalized[:, :, f_idx].reshape(-1, 1)
    actual_denorm = scalers[f_idx].inverse_transform(actual_flat)
    actuals[:, :, f_idx] = actual_denorm.reshape(-1, n_buoys)

print(f"\n✓ Predictions denormalized")



In [None]:
# ============================================================================
# RECALCULATE METRICS WITH DENORMALIZED DATA
# ============================================================================

from sklearn.metrics import mean_squared_error, r2_score

print("\n" + "="*70)
print("PERFORMANCE METRICS (DENORMALIZED)")
print("="*70)

buoy_names = ['M2', 'M3', 'M4', 'M5', 'M6']

for b_idx, buoy in enumerate(buoy_names):
    print(f"\n{buoy}:")
    for f_idx, feature in enumerate(feature_names):
        actual = actuals[:, b_idx, f_idx]
        pred = predictions[:, b_idx, f_idx]
        
        rmse = np.sqrt(mean_squared_error(actual, pred))
        r2 = r2_score(actual, pred)
        
        print(f"  {feature:<15} RMSE: {rmse:.4f}, R²: {r2:.4f}")



In [None]:
# ============================================================================
# PLOT RESULTS
# ============================================================================

# Plot M5 Wind Speed
b_idx = 3  # M5
f_idx = 1  # WindSpeed

plt.figure(figsize=(15, 5))
plot_samples = min(168, len(actuals))  # Last week or all data

plt.plot(actuals[-plot_samples:, b_idx, f_idx], 'o-', label='Actual', markersize=4, linewidth=2)
plt.plot(predictions[-plot_samples:, b_idx, f_idx], 's-', label='Predicted', markersize=3, alpha=0.7, linewidth=2)
plt.xlabel('Time (hours)')
plt.ylabel('Wind Speed (knots)')
plt.title('M5 Wind Speed - Predictions vs Actual')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Check variance
print(f"\n" + "="*70)
print("PREDICTION QUALITY CHECK")
print("="*70)
print(f"Prediction variance: {predictions[:, b_idx, f_idx].std():.4f}")
print(f"Actual variance: {actuals[:, b_idx, f_idx].std():.4f}")
print(f"Ratio: {predictions[:, b_idx, f_idx].std() / actuals[:, b_idx, f_idx].std():.4f}")