## Imports

In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset, DataLoader
import sys
sys.path.append('../../Models')
from models import LSTMPredictor
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from pymongo import MongoClient
from dotenv import load_dotenv
import seaborn as sns
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# Read environment variables (python-dotenv) and open a MongoDB client using
# the connection string stored under MONGODB_URI.
load_dotenv()
cluster_uri = os.environ.get("MONGODB_URI")  # None when the variable is not set
client = MongoClient(cluster_uri)
# Handle to the project database. In the cells visible here it is only
# referenced by the commented-out loader in the "Loading Data" section.
db = client["MSCI446_DB"]


## Functions

In [None]:
def create_sequences(features, target, sequence_length):
    """Cut aligned feature/target series into overlapping fixed-length windows.

    Window ``i`` holds ``features[i:i + sequence_length]``. Its target window is
    the slice of ``target`` shifted one step ahead
    (``target[i + 1:i + 1 + sequence_length]``), i.e. a next-step target for
    every position of the input window.

    Args:
        features: Sliceable sequence of per-step feature rows.
        target: Sliceable sequence of per-step target values; expected to be
            aligned with ``features`` step by step.
        sequence_length: Number of steps in each window.

    Returns:
        Tuple ``(inputs, targets)`` of float32 tensors containing
        ``len(features) - sequence_length`` windows each.
    """
    window_starts = range(len(features) - sequence_length)

    feature_windows = [
        features[start:start + sequence_length] for start in window_starts
    ]
    # Targets lag the inputs by one step so each position predicts the next value.
    target_windows = [
        target[start + 1:start + 1 + sequence_length] for start in window_starts
    ]

    inputs = torch.from_numpy(np.array(feature_windows, dtype=np.float32))
    targets = torch.from_numpy(np.array(target_windows, dtype=np.float32))
    return inputs, targets


def train_model(model, train_loader, criterion, optimizer, num_epochs, device):
    """Run the optimisation loop and report the mean batch loss per epoch.

    Args:
        model: Module called as ``model(batch)``; its output gets a trailing
            singleton axis appended before being compared with the targets.
        train_loader: Iterable of ``(inputs, targets)`` batches supporting ``len()``.
        criterion: Loss callable taking ``(prediction, target)``.
        optimizer: Optimizer already bound to the model's parameters.
        num_epochs: Number of full passes over ``train_loader``.
        device: Device the batches (and the model output) are moved to.

    Returns:
        List with one average loss value per epoch.
    """
    model.train()
    epoch_losses = []
    for epoch_idx in range(num_epochs):
        running_loss = 0.0
        for batch_inputs, batch_targets in train_loader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            optimizer.zero_grad()
            # Append a trailing axis so the prediction lines up with the targets.
            batch_pred = model(batch_inputs).to(device).unsqueeze(-1)
            batch_loss = criterion(batch_pred, batch_targets)
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()

        mean_loss = running_loss / len(train_loader)
        epoch_losses.append(mean_loss)
        print(f'Epoch {epoch_idx+1}, Average Loss: {mean_loss}')
    return epoch_losses

def test_model(model, test_loader, criterion, device, future=0):
    """Evaluate ``model`` on ``test_loader`` with gradient tracking disabled.

    The model is called as ``model(batch, future=future)`` and a trailing
    singleton axis is appended to its output. When ``future > 0`` the output
    is longer than the targets by ``future`` steps; the loss is computed only
    on the leading steps that have ground truth, because scoring forecasts
    against made-up zeros would inflate the reported loss.

    Args:
        model: Module accepting ``(inputs, future=...)``.
        test_loader: Iterable of ``(inputs, targets)`` batches supporting ``len()``.
        criterion: Loss callable taking ``(prediction, target)``.
        device: Device the batches (and the model output) are moved to.
        future: Number of extra steps the model is asked to forecast.

    Returns:
        Tuple ``(test_losses, predictions, actuals)``: the per-batch losses, and
        the concatenated predictions and targets as CPU tensors. With
        ``future > 0`` the returned ``actuals`` are zero-padded along the step
        axis so they keep the same shape as ``predictions``; the padded steps
        carry no ground truth.
    """
    model.eval()
    predictions = []
    actuals = []
    test_losses = []
    total_loss = 0.0
    with torch.no_grad():
        for seq, targets in test_loader:
            seq, targets = seq.to(device), targets.to(device)
            output = model(seq, future=future).to(device).unsqueeze(-1)

            if future > 0:
                # Only the first targets.size(1) steps have real targets.
                scored_output = output[:, :targets.size(1)]
                padding = torch.zeros(
                    targets.size(0), future, 1,
                    dtype=targets.dtype, device=device,
                )
                extended_targets = torch.cat([targets, padding], dim=1)
            else:
                scored_output = output
                extended_targets = targets

            batch_loss = criterion(scored_output, targets).item()
            total_loss += batch_loss
            test_losses.append(batch_loss)
            predictions.append(output.cpu())
            actuals.append(extended_targets.cpu())

    avg_loss = total_loss / len(test_loader)
    print(f'Test Loss: {avg_loss}')
    predictions = torch.cat(predictions, dim=0)
    actuals = torch.cat(actuals, dim=0)
    return test_losses, predictions, actuals


# The name matches pytest's default ``test_*`` pattern; opt out so that importing
# this helper into a test module does not make pytest try to collect it.
test_model.__test__ = False

def predict_future(model, input_sequence, device, future_steps=1):
    """Run the model over every batch and forecast ``future_steps`` extra steps.

    Args:
        model: Module accepting ``(inputs, future=...)``.
        input_sequence: Iterable of ``(inputs, targets)`` batches; the targets
            are not needed for prediction and are ignored.
        device: Device the inputs are moved to before the forward pass.
        future_steps: Number of additional steps requested from the model.

    Returns:
        The per-batch model outputs concatenated along the batch axis, as a
        CPU tensor (matching ``test_model``), so the result can be passed
        directly to NumPy / scikit-learn even when inference ran on a GPU.
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for seq, _ in input_sequence:
            pred = model(seq.to(device), future=future_steps)
            # Move each batch off the device right away to avoid accumulating
            # results in accelerator memory.
            predictions.append(pred.cpu())

        predictions = torch.cat(predictions, dim=0)

    return predictions


def print_metrics(actuals, predictions):
    """Print the R² score and mean absolute error of predictions vs. actuals.

    Both arguments are tensors; their trailing axis is squeezed away and they
    are converted to NumPy arrays before being handed to scikit-learn.
    """
    y_true = actuals.squeeze(-1).numpy()
    y_pred = predictions.squeeze(-1).numpy()

    r2_value = r2_score(y_true, y_pred)
    mae_value = mean_absolute_error(y_true, y_pred)

    print(f'R² Score: {r2_value}')
    print(f'Mean Absolute Error: {mae_value}')

def plot_predictions(predictions, actuals, num_sequences=5):
    """Draw one actual-vs-predicted line chart per sequence.

    Args:
        predictions: Tensor indexed as ``[sequence, step, 0]``.
        actuals: Tensor indexed the same way as ``predictions``.
        num_sequences: Upper bound on how many sequences are plotted; fewer
            are drawn when ``predictions`` holds fewer sequences.
    """
    plot_count = min(num_sequences, predictions.size(0))
    for seq_idx in range(plot_count):
        step_count = predictions.size(1)
        # Long-format frame: the actual series first, then the predicted one.
        stacked_values = torch.cat(
            (actuals[seq_idx, :, 0], predictions[seq_idx, :, 0]), dim=0
        ).numpy()
        series_labels = ['Actual'] * step_count + ['Predicted'] * step_count
        frame = pd.DataFrame(data={
            'Time Step': list(range(step_count)) * 2,
            'Value': stacked_values,
            'Type': series_labels,
        })

        plt.figure(figsize=(10, 4))
        plt.title(f'Sequence {seq_idx+1}')
        sns.lineplot(
            x='Time Step',
            y='Value',
            hue='Type',
            style='Type',
            markers=True,
            dashes=False,
            data=frame,
        )
        plt.legend(title='Type')
        plt.show()
    
def plot_losses(train_losses, test_losses):
    """Plot the training and test loss curves on one shared figure."""
    plt.figure(figsize=(10, 5))
    labelled_curves = (
        (train_losses, 'Train Loss'),
        (test_losses, 'Test Loss'),
    )
    for curve, curve_label in labelled_curves:
        plt.plot(curve, label=curve_label)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.title('Loss over epochs')
    plt.show()

def plot_training_loss(train_losses):
    """Plot the training loss curve (one value per entry of ``train_losses``)."""
    figure_size = (10, 5)
    plt.figure(figsize=figure_size)
    plt.plot(train_losses, label='Train Loss')
    plt.title('Training Loss Over Time')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()



## Loading Data

In [None]:
# Load the prepared table from CSV; the path is relative to the notebook's
# working directory.
data = pd.read_csv('../Data/Final_table.csv')
data.head(20)  # preview of the first 20 rows

# Disabled alternative: load the same table from the "Merged" MongoDB
# collection through the `db` handle created in the imports cell.
# collection_merged = db["Merged"]
# data = pd.DataFrame(list(collection_merged.find()))
# data = data.drop(columns=['_id'])
# data.head(20)


## Scaling Features and Target Value

In [None]:
# Features and target get their own scaler. Re-fitting a single instance would
# discard the feature scaling parameters and leave it to statement order to
# decide which column the object can invert.
feature_scaler = MinMaxScaler()
scaled_features = feature_scaler.fit_transform(data.drop(columns=['datetime_beginning_ept', 'DPL_historical_da']))

# `scaler` stays the *target* scaler: later cells call scaler.inverse_transform
# to map predictions back to the original units.
scaler = MinMaxScaler()
scaled_target = scaler.fit_transform(data[['DPL_historical_da']])

## Creating Sequences

In [None]:
# Build overlapping 78-step input windows and their one-step-ahead target
# windows (see create_sequences).
X, y = create_sequences(scaled_features, scaled_target, sequence_length=78)

## Splitting into Train and Test

In [None]:
# Hold out 20% of the windows for testing. shuffle=False keeps the original
# window order, so the split is positional rather than random.
# NOTE(review): consecutive windows overlap, so the windows on either side of
# the split boundary share time steps.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=False)

## Feature Selection

In [None]:
# Flatten the windowed training data to 2-D (one row per window position) so
# it can be passed to SelectKBest.
n_samples, n_timesteps, n_features = X_train.shape
X_train_reshaped = X_train.reshape((n_samples * n_timesteps, n_features))
y_train_reshaped = y_train.reshape((n_samples * n_timesteps,))


# Keep the 5 features ranked highest by the f_regression score.
model_fr = SelectKBest(score_func=f_regression, k=5)

# Fitted on training data only. In scikit-learn, fit() returns the estimator
# itself, so data_new_fr should refer to the same object as model_fr.
data_new_fr = model_fr.fit(X_train_reshaped, y_train_reshaped)

# NOTE(review): the input carries no column names, so the printed names are
# presumably generic positional ones rather than the CSV column names.
print("f_regression: ", data_new_fr.get_feature_names_out())


In [None]:
# Apply the fitted selector to the flattened train and test windows, keeping
# only the selected feature columns.
X_train_best = model_fr.transform(X_train_reshaped)  
X_test_best = model_fr.transform(X_test.reshape((X_test.shape[0] * X_test.shape[1], X_test.shape[2])))

# Restore the (windows, steps, selected features) layout; -1 lets reshape
# infer the size of the feature axis.
X_train_best = X_train_best.reshape((X_train.shape[0], X_train.shape[1], -1))
X_test_best = X_test_best.reshape((X_test.shape[0], X_test.shape[1], -1))

## Converting to Valid Pytorch Input

In [None]:
# Loaders over the full feature set, batched in original order (no shuffling).
# They are not used by any later cell visible here; the model below is trained
# on the "_best" loaders.
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)

test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [None]:
# Loaders over the selected features only. The selector output is wrapped with
# torch.from_numpy, i.e. it is expected to be a NumPy array at this point.
train_dataset_best = TensorDataset(torch.from_numpy(X_train_best), y_train)
train_loader_best = DataLoader(train_dataset_best, batch_size=64, shuffle=False)

test_dataset_best = TensorDataset(torch.from_numpy(X_test_best), y_test)
test_loader_best = DataLoader(test_dataset_best, batch_size=64, shuffle=False)

## Training Model

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Input width is the number of selected features; the model is asked for a
# single output feature.
model_best = LSTMPredictor(input_features=X_train_best.shape[2], n_hidden=51, output_features=1).to(device)

# Mean-squared-error loss optimised with Adam for 50 epochs.
criterion = nn.MSELoss()
optimizer_adam = optim.Adam(model_best.parameters(), lr=0.001)
num_epochs = 50
train_losses_best = train_model(model_best, train_loader_best, criterion, optimizer_adam, num_epochs, device)

## Plotting Train Loss

In [None]:
# Plot the average training loss recorded for each epoch.
plot_training_loss(train_losses_best)

In [None]:
# Evaluate on the held-out windows without requesting extra forecast steps
# (future=0); returns per-batch losses plus stacked predictions and targets.
test_losses_best, predictions_best, actuals_best = test_model(model_best, test_loader_best, criterion, device, future=0)

## Evaluation Metrics (R² and MAE)

In [None]:
# Print R² and mean absolute error for the test predictions. The values being
# compared are the scaled ones, not the original units.
print_metrics(actuals_best, predictions_best)

## Plotting Results

In [None]:
# Plot actual vs. predicted values for the first 10 test sequences.
plot_predictions(predictions_best, actuals_best, num_sequences=10)

## Predicting Future Values

In [None]:
# Forecast horizon, named once so the model call and the slice below cannot
# drift apart.
future_steps = 10
future_values = predict_future(model_best, test_loader_best, device, future_steps=future_steps)

# Keep only the trailing forecast steps and convert to a CPU NumPy array before
# calling scikit-learn: a tensor that still lives on the GPU cannot be
# converted to NumPy implicitly.
future_predictions = future_values[:, -future_steps:].detach().cpu().numpy()

# The scaler expects a 2-D column, so flatten all forecasts into one column.
future_predictions_2d = future_predictions.reshape(-1, 1)

# Inverse transform to get back to the original dollar values
unscaled_future_predictions = scaler.inverse_transform(future_predictions_2d)

# Reshape back to the original shape with future predictions
unscaled_future_predictions = unscaled_future_predictions.reshape(future_predictions.shape)

In [None]:
# Print the unscaled forecast of every sequence. The loop variable has its own
# name so it does not overwrite the `future_values` tensor produced by the
# prediction cell (which would break re-running parts of that cell).
for i, sequence_values in enumerate(unscaled_future_predictions):
    print(f"Future Values for Sequence {i+1}:")
    print(sequence_values)
    print("\n")  # Add extra newline for readability