In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import optuna

#np.random.seed(1729)

In [None]:
import torch
from torch import nn
import torch.optim as optim

# Seed PyTorch's global random number generator so the torch random draws
# that follow (starting with the tensor below) are repeatable between runs.
torch.manual_seed(1729)
# Draw a 5x3 tensor of uniform random values and print it
# (presumably a quick check that torch is installed and seeded -- confirm).
x = torch.rand(5, 3)
print(x)

In [None]:
from torch.nn.utils.rnn import pad_sequence

def create_dataset(data, variable_indexes, lookback_period, step, forecast_period, motif_indexes):
    """Build sliding-window samples for "distance to the next motif" regression.

    A window of ``lookback_period`` time steps is taken every ``step``
    positions along the time axis. Window generation stops as soon as no motif
    lies strictly after the end of the current window.

    Args:
        data: 2-D array indexed as ``data[variable, time]``.
        variable_indexes: Row indexes of ``data`` to use as features.
        lookback_period: Number of time steps in each input window.
        step: Stride between consecutive window start positions.
        forecast_period: Number of time steps after the window end that are
            searched for the next motif.
        motif_indexes: Iterable of absolute motif positions on the time axis.
            It does not need to be sorted.

    Returns:
        Tuple ``(X1, X2, y)``:
            X1: float32 tensor of shape
                ``(num_samples, lookback_period, num_features)``.
            X2: long tensor of shape ``(num_samples, max_motifs_in_window)``
                holding motif positions relative to the window start, padded
                with ``-1``.
            y: float32 tensor of shape ``(num_samples,)`` with the distance
                from the window end to the nearest motif inside the forecast
                period, or ``-1`` when that motif lies further in the future.

    Raises:
        ValueError: If no sample can be built (for example when there are no
            motifs after the first window, or the series is too short).
    """
    motif_positions = [int(motif_idx) for motif_idx in motif_indexes]
    # Hoisted out of the loop: only the last motif decides when to stop.
    last_motif = max(motif_positions, default=None)

    X1, X2, y = [], [], []  # X1: data, X2: indexes of the motifs, y: distance to the next motif

    for idx in range(0, len(data[0]) - lookback_period - 1, step):
        window_end_idx = idx + lookback_period
        forecast_period_end = window_end_idx + forecast_period

        # If there are no more matches after the window, stop
        if last_motif is None or last_motif <= window_end_idx:
            break

        # Motif indexes in window, relative to the start of the window.
        # NOTE(review): the upper bound is inclusive, so a motif located
        # exactly at `window_end_idx` (one step past the sliced data) is
        # reported here with relative index `lookback_period` -- confirm
        # that this is intended.
        motif_indexes_in_window = [
            motif_idx - idx
            for motif_idx in motif_positions
            if idx <= motif_idx <= window_end_idx
        ]
        motif_indexes_in_forecast_period = [
            motif_idx
            for motif_idx in motif_positions
            if window_end_idx < motif_idx <= forecast_period_end
        ]

        # Distance from the window end to the *nearest* upcoming motif.
        # `min` (rather than the first list element) keeps this correct when
        # `motif_indexes` is not sorted. -1 means "no motif inside the
        # forecast period, but one exists further in the future".
        if motif_indexes_in_forecast_period:
            data_y = min(motif_indexes_in_forecast_period) - window_end_idx
        else:
            data_y = -1

        # Get the data window and transpose to (lookback_period, num_features)
        data_window = data[variable_indexes, idx:window_end_idx].T

        X1.append(torch.tensor(data_window, dtype=torch.float32))
        X2.append(torch.tensor(motif_indexes_in_window, dtype=torch.long))
        y.append(data_y)

    if not X1:
        raise ValueError(
            "create_dataset produced no samples: no motif lies after the end of "
            "the first window, or the series is shorter than lookback_period + 2."
        )

    # Pad X2 sequences to have the same length
    X2_padded = pad_sequence(X2, batch_first=True, padding_value=-1)

    # Convert lists to torch tensors
    X1 = torch.stack(X1)  # Final shape: (num_samples, lookback_period, num_features)
    y = torch.tensor(y, dtype=torch.float32)

    return X1, X2_padded, y


In [None]:
# Load the synthetic "scenario 1" series (fixed-frequency motif repetitions).
n = 1000  # number of time steps per variable
k = 3     # number of variables
variable_indexes = range(k)

# Series values: read the CSV, cast to int and arrange as (k, n).
data_scenario1 = (
    np.genfromtxt("../data/syntheticdata/scenario1_k=3.csv", delimiter=",")
    .astype(int)
    .reshape((k, n))
)

# Positions of the motif occurrences, cast to int.
motif_indexes_scenario1 = np.genfromtxt(
    "../data/syntheticdata/motif_indexes_scenario1_k=3.csv", delimiter=","
).astype(int)

# One subplot per variable, all sharing the x axis.
fig, axs = plt.subplots(k, 1, sharex=True)
for ax, series in zip(axs, data_scenario1):
    ax.plot(series, linewidth=0.5)


In [None]:
# Sliding-window configuration
lookback_period = 100  # number of past time steps in each input window
step = 1               # stride between consecutive windows
forecast_period = 50   # number of future time steps searched for the next motif

# X1: past data windows, X2: motif positions relative to each window start
# (padded with -1), y: distance from the window end to the next motif
X1, X2, y = create_dataset(
    data_scenario1,
    variable_indexes,
    lookback_period,
    step,
    forecast_period,
    motif_indexes_scenario1,
)

# Show the first input/output triple
for first_item in (X1[0], X2[0], y[0]):
    print(first_item)

# Report the tensor shapes:
#   X1 -> (num_samples, lookback_period, num_features)
#   X2 -> (num_samples, max_motif_length_in_window)
#   y  -> (num_samples,)
for caption, tensor in (("X1 shape:", X1), ("X2 shape:", X2), ("y shape:", y)):
    print(caption, tensor.shape)


In [None]:
class BlockingTimeSeriesSplit():
    """Cross-validation splitter that cuts a series into disjoint blocks.

    The samples are divided into ``n_splits`` contiguous, non-overlapping
    blocks of ``len(X) // n_splits`` samples each. Any remainder samples at
    the end are not used. Inside every block the leading ``train_fraction``
    of the samples is the training part and the rest is the validation part,
    so validation data always follows its training data in time.

    Args:
        n_splits: Number of blocks (folds).
        train_fraction: Share of each block used for training (default 0.8).
        margin: Number of samples skipped between the training and the
            validation part of a block (default 0, i.e. no gap).
    """

    def __init__(self, n_splits, train_fraction=0.8, margin=0):
        self.n_splits = n_splits
        self.train_fraction = train_fraction
        self.margin = margin

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; the arguments are accepted but ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, validation_indices)`` for every block.

        Args:
            X: Any sized container; only ``len(X)`` is used.
            y: Ignored.
            groups: Ignored.

        Yields:
            Tuple of two integer numpy arrays with the positions of the
            training and validation samples of one block.
        """
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = int(self.train_fraction * (stop - start)) + start
            yield indices[start:mid], indices[mid + self.margin:stop]

In [None]:
from torch.utils.data import Dataset, DataLoader

class TimeSeriesDataset(Dataset):
    """Dataset pairing each data window with its motif indexes and target.

    Indexing returns the tuple ``(X1[idx], X2[idx], y[idx])``; the length of
    the dataset is the length of ``y``.
    """

    def __init__(self, X1, X2, y):
        # X1: time series data, X2: motif indexes data, y: target values
        self.X1, self.X2, self.y = X1, X2, y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return tuple(field[idx] for field in (self.X1, self.X2, self.y))


In [None]:
#TODO: check if this is how i want to deal with the x2 data
class LSTMModel(nn.Module):
    """LSTM regressor that combines a data window with its motif indexes.

    The time-series window (``X1``) is run through an LSTM; the output of the
    last time step is concatenated with the motif-index vector (``X2``, cast
    to float) and mapped to the prediction by one linear layer.

    Args:
        input_size: Number of features per time step in ``X1``.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values predicted per sample.
        motif_feature_size: Width of the ``X2`` vector passed to ``forward``
            (its second dimension). When omitted, the width is read from the
            notebook-level ``X2`` tensor to stay compatible with existing
            calls; pass it explicitly to avoid depending on that global.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, motif_feature_size=None):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        if motif_feature_size is None:
            # Legacy fallback: relies on a module-level `X2` existing when the
            # model is built (hidden notebook state). Prefer the explicit
            # `motif_feature_size` argument.
            motif_feature_size = X2.shape[1]
        self.motif_feature_size = motif_feature_size

        # LSTM layer for processing X1
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        # Fully connected layer applied to [last LSTM output, motif indexes]
        self.fc = nn.Linear(hidden_size + motif_feature_size, output_size)

    def forward(self, X1, X2):
        """Predict from a batch of windows and their motif-index vectors.

        Args:
            X1: Tensor of shape ``(batch_size, seq_len, input_size)``.
            X2: Tensor of shape ``(batch_size, motif_feature_size)``.

        Returns:
            Tensor of shape ``(batch_size, output_size)``.
        """
        # nn.LSTM starts from zero hidden/cell states when none are passed,
        # so they do not have to be built by hand.
        out, _ = self.lstm(X1)

        # Get the last time step's output
        out = out[:, -1, :]  # Shape: (batch_size, hidden_size)

        # Concatenate with X2 (motif indexes) along the feature dimension
        out = torch.cat((out, X2.float()), dim=1)

        # Pass through the final fully connected layer
        return self.fc(out)

In [None]:
# data n=1000, k=3, split 5-fold, 80% train, 20% test
# train CV1 0-141 test CV1 142-177
# train CV2 178-319 test CV2 320-355
# train CV3 356-497 test CV3 498-533
# train CV4 534-675 test CV4 676-711
# train CV5 712-853 test CV5 854-889

from sklearn.preprocessing import MinMaxScaler

# Model parameters
input_size = X1.shape[2]  # Number of features in X1
hidden_size = 100          # LSTM hidden layer size
num_layers = 1            # Number of LSTM layers
output_size = 1           # Since we are predicting a single value


# Placeholder for results (one validation loss per fold)
fold_results = []

# Loop over each fold
for fold, (train_idx, val_idx) in enumerate(BlockingTimeSeriesSplit(n_splits=5).split(X1)):
    # Create train and validation sets for each split
    X1_train, X1_val = X1[train_idx], X1[val_idx]
    X2_train, X2_val = X2[train_idx], X2[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Initialize a new scaler for each fold (only for X1) so that no
    # statistics leak from the validation part into training
    scaler_X1 = MinMaxScaler(feature_range=(0, 1))

    # Fit the scaler on the training data
    X1_train_reshaped = X1_train.view(-1, X1_train.shape[-1])  # Flatten for scaling
    X1_train_scaled = scaler_X1.fit_transform(X1_train_reshaped)
    X1_train = torch.tensor(X1_train_scaled, dtype=torch.float32).view(X1_train.shape)

    # Transform the validation set using the training scaler
    X1_val_reshaped = X1_val.view(-1, X1_val.shape[-1])  # Flatten for scaling
    X1_val_scaled = scaler_X1.transform(X1_val_reshaped)
    X1_val = torch.tensor(X1_val_scaled, dtype=torch.float32).view(X1_val.shape)

    # Create DataLoader for train and validation sets
    train_dataset = TimeSeriesDataset(X1_train, X2_train, y_train)
    val_dataset = TimeSeriesDataset(X1_val, X2_val, y_val)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    #TODO: check train and val loader shape, mismatch

    # Initialize model, loss function, and optimizer for each fold
    model = LSTMModel(input_size, hidden_size, num_layers, output_size)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training loop for the fold
    num_epochs = 200
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0

        for batch_X1, batch_X2, batch_y in train_loader:
            # Forward pass. squeeze(-1) drops only the output dimension, so a
            # final batch with a single sample keeps shape (1,) like its target.
            outputs = model(batch_X1, batch_X2)
            loss = criterion(outputs.squeeze(-1), batch_y)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        # Report progress every 10th epoch
        if (epoch + 1) % 10 == 0:
            print(f'Fold {fold+1}, Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(train_loader):.4f}')

    # Evaluate on validation set
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch_X1, batch_X2, batch_y in val_loader:
            outputs = model(batch_X1, batch_X2)
            loss = criterion(outputs.squeeze(-1), batch_y)
            # Weight each batch mean by its size so that a short last batch
            # does not count as much as a full one.
            val_loss += loss.item() * batch_y.size(0)

    # Log validation loss for the fold (mean squared error per sample)
    avg_val_loss = val_loss / len(val_dataset)
    fold_results.append(avg_val_loss)
    print(f'Fold {fold+1} Validation Loss: {avg_val_loss:.4f}')

print(fold_results)
# Display overall cross-validation performance
print(f'Mean Validation Loss across folds: {np.mean(fold_results):.4f}')
print(f'Standard Deviation of Validation Loss across folds: {np.std(fold_results):.4f}')

In [None]:
from sklearn.preprocessing import MinMaxScaler
import csv

# Model parameters
input_size = X1.shape[2]  # Number of features in X1
output_size = 1           # Single regression target

# Open the CSV file to log Optuna results
with open("optuna_tuning_results.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    # Write the header, including a separate column for each fold's loss
    writer.writerow(["trial_number", "learning_rate", "batch_size", "hidden_size", "num_layers", "fold_1_loss", "fold_2_loss", "fold_3_loss", "fold_4_loss", "fold_5_loss", "avg_validation_loss"])

    # Define the Optuna objective function
    def objective(trial):
        """Train one model per fold with the trial's hyperparameters.

        Returns the mean over the folds of the per-sample validation MSE and
        appends one row with the trial's settings and fold losses to the CSV.
        """
        # Define hyperparameters to tune
        learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
        hidden_size = trial.suggest_int("hidden_size", 32, 128)
        num_layers = trial.suggest_int("num_layers", 1, 1)
        batch_size = trial.suggest_int("batch_size", 16, 128)

        # Placeholder for fold validation losses
        fold_results = []

        # Cross-validation with BlockingTimeSeriesSplit
        for fold, (train_idx, val_idx) in enumerate(BlockingTimeSeriesSplit(n_splits=5).split(X1)):
            # Split train and validation sets for each fold
            X1_train, X1_val = X1[train_idx], X1[val_idx]
            X2_train, X2_val = X2[train_idx], X2[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            # Scale X1 with a scaler fitted on the training part only
            scaler_X1 = MinMaxScaler(feature_range=(0, 1))
            X1_train_scaled = scaler_X1.fit_transform(X1_train.view(-1, X1_train.shape[-1]))
            X1_train = torch.tensor(X1_train_scaled, dtype=torch.float32).view(X1_train.shape)
            X1_val_scaled = scaler_X1.transform(X1_val.view(-1, X1_val.shape[-1]))
            X1_val = torch.tensor(X1_val_scaled, dtype=torch.float32).view(X1_val.shape)

            # Create DataLoader for train and validation sets
            train_dataset = TimeSeriesDataset(X1_train, X2_train, y_train)
            val_dataset = TimeSeriesDataset(X1_val, X2_val, y_val)
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

            # Initialize model, loss function, and optimizer for each fold
            model = LSTMModel(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, output_size=output_size)
            criterion = nn.MSELoss()
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)

            # Training loop for the fold
            num_epochs = 200
            model.train()
            for epoch in range(num_epochs):
                for batch_X1, batch_X2, batch_y in train_loader:
                    # Forward pass. squeeze(-1) removes only the output
                    # dimension, so a final batch holding a single sample
                    # (possible for some sampled batch sizes) keeps shape (1,).
                    outputs = model(batch_X1, batch_X2)
                    loss = criterion(outputs.squeeze(-1), batch_y)

                    # Backward pass and optimization
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            # Evaluate on validation set
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for batch_X1, batch_X2, batch_y in val_loader:
                    outputs = model(batch_X1, batch_X2)
                    loss = criterion(outputs.squeeze(-1), batch_y)
                    # Weight by batch size: batch_size is itself a tuned
                    # hyperparameter, so averaging batch means would weight
                    # the validation samples differently from trial to trial.
                    val_loss += loss.item() * batch_y.size(0)

            # Per-sample mean validation loss for this fold
            avg_val_loss = val_loss / len(val_dataset)
            fold_results.append(avg_val_loss)

        # Calculate the mean validation loss across all folds for this trial
        mean_val_loss = np.mean(fold_results)

        # Write the trial's results to the CSV file, including each fold's loss
        writer.writerow([trial.number, learning_rate, batch_size, hidden_size, num_layers] + fold_results + [mean_val_loss])
        file.flush()  # Ensure each trial result is written immediately

        # Return the mean validation loss across folds for this trial
        return mean_val_loss

    # Run the Optuna study (must stay inside the `with` block: the objective
    # writes to the open CSV file)
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=50)  # Adjust n_trials as desired

    # Print the best hyperparameters and validation loss
    print("Best hyperparameters:", study.best_params)
    print("Best cross-validated validation loss:", study.best_value)