# Test 5

### Import

In [None]:

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

# Normalize the data
# Load the training frame, then fit one scaler for the target column and a
# separate one for the feature columns.
df = pd.read_pickle("../Data/train_dax_data.pkl")

# Target: MinMaxScaler works on 2-D input, hence the (-1, 1) reshape.
scaler_y = MinMaxScaler()
df["Y_scaled"] = scaler_y.fit_transform(
    df["Y"].values.reshape(-1, 1)
)

# Features: skip the first two columns and the last two columns.
# NOTE(review): presumably the last two are 'Y' and the freshly appended
# 'Y_scaled' — confirm against the pickle's column order.
scaler_X = MinMaxScaler()
X = scaler_X.fit_transform(
    df.iloc[:, 2:-2]
)

# Create sequences
def create_sequences(data, labels, seq_length):
    """Slice ``data`` into overlapping windows paired with the following label.

    Window ``k`` covers ``data[k:k + seq_length]`` and is paired with
    ``labels[k + seq_length]``, i.e. the label of the row right after the
    window.

    Args:
        data: Indexable, sliceable sequence of rows (e.g. a 2-D array).
        labels: Indexable sequence holding one label per row of ``data``.
        seq_length: Number of consecutive rows per window.

    Returns:
        Tuple ``(windows, targets)`` of NumPy arrays with
        ``len(data) - seq_length`` entries each (both empty when ``data`` has
        no more than ``seq_length`` rows).
    """
    window_starts = range(len(data) - seq_length)
    windows = [data[start:start + seq_length] for start in window_starts]
    targets = [labels[start + seq_length] for start in window_starts]
    return np.array(windows), np.array(targets)

# Number of consecutive rows that form one model input window.
seq_size = 30
X_sequences, y_sequences = create_sequences(
    data=X,
    labels=df["Y_scaled"].values,
    seq_length=seq_size,
)

# Dataset and DataLoader
class FinanceDataset(Dataset):
    """Map-style dataset over pre-built (sequence, label) pairs.

    Args:
        X: Indexable collection of input sequences.
        y: Indexable collection of labels, aligned with ``X`` by position.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a ``(features, label)`` pair of float32 tensors."""
        features = torch.tensor(self.X[idx], dtype=torch.float32)
        label = torch.tensor(self.y[idx], dtype=torch.float32)
        return features, label

# Wrap the windowed training arrays and serve them in shuffled batches of 16.
dataset = FinanceDataset(X=X_sequences, y=y_sequences)
train_loader = DataLoader(
    dataset=dataset,
    batch_size=16,
    shuffle=True,
)

# Define the LSTM model
class Net(nn.Module):
    """Stacked LSTM followed by a linear head applied to the last timestep.

    Args:
        input_size: Number of features per timestep.
        output_size: Number of values produced per input sequence.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability between stacked LSTM layers. It is
            ignored when ``num_layers`` is 1, because a single-layer LSTM has
            no inter-layer connection to drop and PyTorch warns about a
            non-zero value in that case.
    """

    def __init__(self, input_size, output_size, hidden_size, num_layers, dropout=0.2):
        super().__init__()
        # nn.LSTM applies dropout after every layer except the last one, so it
        # only has an effect with at least two layers.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a ``(batch, seq, input_size)`` tensor to ``(batch, output_size)``."""
        x, _ = self.lstm(x)
        x = self.fc(x[:, -1, :])  # Output from the last timestep
        return x

# Initialize model, loss function, and optimizer
hidden_size, num_layers = 100, 2
output_size = 1
input_size = X.shape[1]  # one model input per scaled feature column

model = Net(
    input_size=input_size,
    output_size=output_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
)
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

# Training loop
# Runs a fixed number of epochs and records the mean batch loss of each one.
num_epochs = 20
losses = []

for epoch in range(num_epochs):
    model.train()
    batch_losses = []
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        # Drop the trailing output dimension so predictions line up with the
        # 1-D label batch.
        batch_predictions = model(inputs).squeeze(-1)
        loss = criterion(batch_predictions, labels)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    # Average over the number of batches in the loader.
    epoch_loss = sum(batch_losses) / len(train_loader)
    losses.append(epoch_loss)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}")

import os

# Plot training loss
plt.plot(losses, label='Training Loss')
plt.legend()
plt.show()

# Save the model
# Create the target directory first: torch.save does not create missing
# parent directories, and this is the first place the notebook writes there.
os.makedirs("../Models", exist_ok=True)
torch.save(model.state_dict(), "../Models/best_model.pt")

# Evaluate the model
model.eval()
test_df = pd.read_pickle("../Data/test_dax_data.pkl")

# Normalize test data with the scalers fitted on the training data.
# The training features were taken with iloc[:, 2:-2] AFTER 'Y_scaled' had been
# appended; the test frame has no such extra column, so the matching slice is
# 2:-1. The previous ':-2' slice selected a different set of columns than the
# one scaler_X was fitted on.
# NOTE(review): assumes the test pickle has the same column layout as the
# training pickle — confirm.
test_X = scaler_X.transform(test_df.iloc[:, 2:-1])
test_y = scaler_y.transform(test_df["Y"].values.reshape(-1, 1))

test_sequences, test_labels = create_sequences(test_X, test_y.flatten(), seq_size)
test_dataset = FinanceDataset(test_sequences, test_labels)
test_loader = DataLoader(test_dataset, batch_size=16)

predictions, actuals = [], []

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predictions.append(outputs.numpy())
        actuals.append(labels.numpy())

# inverse_transform needs 2-D input. The label batches are 1-D, so the
# concatenated array must be reshaped to a single column; the predictions get
# the same reshape for symmetry.
predictions = scaler_y.inverse_transform(np.concatenate(predictions).reshape(-1, 1))
actuals = scaler_y.inverse_transform(np.concatenate(actuals).reshape(-1, 1))

# Plot predictions vs actuals
plt.plot(actuals, label='Actual')
plt.plot(predictions, label='Predicted')
plt.legend()
plt.show()


### Device

In [None]:
# Probe the available accelerators: CUDA first, then Apple MPS, else CPU.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

# The probed value is overridden unconditionally, so everything runs on CPU.
# NOTE(review): looks like a deliberate override — confirm before removing it.
device = "cpu"
print(f"Using device: {device}")

### Hyperparameter

In [None]:
# Model parameter
# NOTE(review): other cells in this notebook build Net with num_layers = 2 and
# use the same ../Models/best_model.pt checkpoint — confirm which value the
# saved weights belong to.
input_size, output_size = 8, 1
hidden_size, num_layers = 100, 3
dropout = 0.2

# Training parameter
batch_size, num_epochs = 16, 10
learning_rate = 0.001
seq_size = 30

### LSTM Model

In [None]:
class Net(nn.Module):
    """LSTM stack whose last-timestep output feeds a linear layer.

    Args:
        input_size: Number of features per timestep.
        output_size: Number of values produced per input sequence.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability between stacked LSTM layers. Only applied
            when ``num_layers`` is greater than 1; a single-layer LSTM cannot
            use it and PyTorch would emit a warning.
    """

    def __init__(self, input_size, output_size, hidden_size, num_layers, dropout=0.2):
        super().__init__()
        # Dropout acts between LSTM layers only, so disable it for one layer.
        effective_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=effective_dropout,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return ``(batch, output_size)`` for a ``(batch, seq, input_size)`` input."""
        x, _ = self.lstm(x)
        x = self.fc(x[:, -1, :])  # Output from the last timestep
        return x


### Dataloader

In [None]:
class FinanceDataset(Dataset):
    """Dataset that serves pre-windowed inputs together with their labels.

    Args:
        X: Indexable collection of input sequences.
        y: Indexable collection of labels, positionally aligned with ``X``.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, taken from the length of ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(sequence, label)`` pair as float32 tensors."""
        sequence = torch.tensor(self.X[idx], dtype=torch.float32)
        target = torch.tensor(self.y[idx], dtype=torch.float32)
        return sequence, target

### Init

In [None]:
# Initialize model, loss function, optimizer
# Pass the configured dropout explicitly (it was defined with the other
# hyperparameters but never forwarded) and place the model on the same device
# the training loop moves its batches to.
net = Net(input_size, output_size, hidden_size, num_layers, dropout=dropout).to(device)
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate)

In [None]:
# Train
# Normalize the data
# NOTE(review): X, scaler_X and scaler_y from this first part are not used by
# the remaining statements of this cell, which reload the frame and fit a
# separate `scaler` — confirm whether this part is still needed.
scaler_X, scaler_y = MinMaxScaler(), MinMaxScaler()

df = pd.read_pickle("../Data/train_dax_data.pkl")
df["Y_scaled"] = scaler_y.fit_transform(
    df["Y"].values.reshape(-1, 1)
)
# All columns except the last two.
X = scaler_X.fit_transform(
    df.iloc[:, :-2]
)

# Reload the raw training frame and strip the helper columns if present.
df = pd.read_pickle("../Data/train_dax_data.pkl")
for helper_column in ("Date", "index"):
    if helper_column in df.columns:
        df = df.drop(helper_column, axis=1)

display(df)

# Fit the scaler on every column except the last one.
scaler = MinMaxScaler()
scaler.fit(
    df.iloc[:, :-1]
)

In [None]:
# Test
# Load the test frame and strip the helper columns if present.
test_df = pd.read_pickle("../Data/test_dax_data.pkl")
for helper_column in ("Date", "index"):
    if helper_column in test_df.columns:
        test_df = test_df.drop(helper_column, axis=1)

display(test_df)

# The last column is kept separately as the test labels.
labels_test = test_df.iloc[:, -1]

In [None]:
# Initialize dataset and dataloader
# FinanceDataset as defined in this notebook takes ready-made (X, y) arrays;
# the former call FinanceDataset(frame, scaler, seq_size=...) does not match
# that signature and raises TypeError. Scale the feature columns with the
# fitted scaler and build the windows with create_sequences instead.
# NOTE(review): create_sequences pairs each window with the label of the row
# right after it — confirm this is the intended alignment for 'Y'.
train_features = scaler.transform(df.iloc[:, :-1])
train_windows, train_targets = create_sequences(
    train_features, df.iloc[:, -1].values, seq_size
)
dataset = FinanceDataset(train_windows, train_targets)
train_loader = DataLoader(dataset, batch_size=batch_size)

test_features = scaler.transform(test_df.iloc[:, :-1])
test_windows, test_targets = create_sequences(
    test_features, test_df.iloc[:, -1].values, seq_size
)
test_dataset = FinanceDataset(test_windows, test_targets)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

### Training

In [None]:
import os
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

# Make sure the model directory exists
os.makedirs("../Models", exist_ok=True)

best_model_path = "../Models/best_model.pt"

# Warm-start from an existing checkpoint. map_location keeps the load working
# when the checkpoint was written on a different device than the current one.
# NOTE(review): load_state_dict requires the checkpoint to match the current
# Net configuration (e.g. num_layers) — confirm which cell wrote this file.
if os.path.exists(best_model_path):
    net.load_state_dict(torch.load(best_model_path, map_location=device))
    print(f"Modell erfolgreich geladen von {best_model_path}")


# State for early stopping and checkpointing
patience = 5
best_test_loss = float('inf')
early_stopping_counter = 0

losses = []
test_loss_vals = []
learning_rate = 0.001  # Initial learning rate

# Training loop with early stopping
for epoch in range(num_epochs):
    net.train()
    running_loss = 0

    # Training
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)  # Move the batch to the device
        optimizer.zero_grad()
        outputs = net(inputs.float())
        # Squeeze only the trailing output dimension. A bare torch.squeeze
        # would also collapse a batch of size 1 to a scalar, which no longer
        # matches the shape of the label batch.
        loss = criterion(outputs.squeeze(-1), labels.float())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    avg_train_loss = running_loss / len(train_loader)
    losses.append(avg_train_loss)

    # Evaluate the model on the test set
    # NOTE(review): the test loader also drives checkpoint selection and early
    # stopping below, so the reported test loss is not an untouched hold-out
    # estimate — consider a separate validation split.
    net.eval()
    running_test_loss = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)  # Move the batch to the device
            test_outputs = net(inputs.float()).squeeze(-1)  # Drop the trailing output dimension
            test_loss = criterion(test_outputs, labels.float())
            running_test_loss += test_loss.item()

    avg_test_loss = running_test_loss / len(test_loader)
    test_loss_vals.append(avg_test_loss)

    # Save the best model so far and track epochs without improvement
    if avg_test_loss <= best_test_loss:
        best_test_loss = avg_test_loss
        torch.save(net.state_dict(), best_model_path)
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1

    # Report the losses
    print(f'Epoch {epoch + 1}, Train Loss: {avg_train_loss:.4f}, Test Loss: {avg_test_loss:.4f}')

    # Check the early-stopping criterion
    if early_stopping_counter >= patience:
        print(f"Early stopping after {epoch + 1} epochs. Test loss did not improve for {patience} epochs.")
        break

    # Halve the learning rate whenever the epoch index is a positive multiple
    # of 10. The rate is updated in place on the existing optimizer, so its
    # type and internal state are preserved instead of being replaced by a
    # fresh SGD instance.
    if epoch > 0 and epoch % 10 == 0:
        learning_rate *= 0.5
        for param_group in optimizer.param_groups:
            param_group["lr"] = learning_rate
        print(f"Reduced learning rate to {learning_rate:.5f}")

print(f"Training abgeschlossen. Bestes Modell gespeichert unter: {best_model_path}")


### Backtesting

In [None]:
# Fit a fresh scaler on the eight model input columns of the training data.
scaler = MinMaxScaler()
# reset_index() turns the index into a column; iloc[:, :-1] drops the last column.
df = pd.read_pickle("../Data/train_dax_data.pkl").reset_index().iloc[:, :-1]
# Strip the helper columns from the frame loaded above. The previous version
# tested and modified `test_df` here, a leftover from the test cell that left
# `df` untouched and required `test_df` to exist.
for helper_column in ("Date", "index"):
    if helper_column in df.columns:
        df = df.drop(helper_column, axis=1)
df = df[["Open", "High", "Low", "Close", "Adj Close", "Volume", "month", "weekday"]]
display(df)
scaler.fit(df.values)

model_path = "../Models/best_model.pt"

In [None]:
import pandas as pd
import torch
from sklearn.preprocessing import MinMaxScaler
import numpy as np

model = Net(input_size, output_size, hidden_size, num_layers)

# Load state_dict only
model.load_state_dict(torch.load(model_path))
model.eval()

# NOTE(review): this reads the TRAINING pickle although the variables below are
# named test_* — confirm which data set the backtest should run on.
df = pd.read_pickle('../Data/train_dax_data.pkl')
df = df[["Open", "High", "Low", "Close", "Adj Close", "Volume", "month", "weekday", "Y"]]

# FinanceDataset as defined in this notebook takes ready-made (X, y) arrays;
# the former call FinanceDataset(df, scaler, seq_size=...) does not match that
# signature and raises TypeError. Scale the feature columns (everything but
# 'Y') with the fitted scaler and window them with create_sequences.
# NOTE(review): create_sequences pairs each window with the label of the row
# right after it — confirm this is the intended alignment for 'Y'.
backtest_features = scaler.transform(df.iloc[:, :-1].values)
backtest_windows, backtest_targets = create_sequences(
    backtest_features, df["Y"].values, seq_size
)
test_data = FinanceDataset(backtest_windows, backtest_targets)
test_loader = DataLoader(test_data, batch_size=batch_size)

all_predictions = []
all_labels = []

# Inference only, so no gradients are tracked.
with torch.no_grad():
    for inputs, labels in test_loader:
        out = model(inputs)

        all_predictions.append(out.numpy())
        all_labels.append(labels.numpy())

all_predictions = np.concatenate(all_predictions)
all_labels = np.concatenate(all_labels)

print(f'Predicted values: {all_predictions.flatten()}')
print(f'Actual values: {all_labels.flatten()}')

output_df = pd.DataFrame({'Predicted': all_predictions.flatten(), 'Actual': all_labels.flatten()})
display(output_df)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Turn the raw model outputs into 0/1 classes: 1 where the output is above the
# threshold, 0 otherwise.
threshold = 0.5
above_threshold = all_predictions.flatten() > threshold
predicted_classes = above_threshold.astype(int)

# Score the thresholded predictions against the flattened labels.
true_classes = all_labels.flatten()
accuracy = accuracy_score(true_classes, predicted_classes)
precision = precision_score(true_classes, predicted_classes)
recall = recall_score(true_classes, predicted_classes)
f1 = f1_score(true_classes, predicted_classes)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the flattened labels versus the thresholded predictions,
# drawn as an annotated heatmap with integer cell labels.
cm = confusion_matrix(y_true=all_labels.flatten(), y_pred=predicted_classes)
heatmap_axes = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
heatmap_axes.set_xlabel('Predicted')
heatmap_axes.set_ylabel('Actual')
heatmap_axes.set_title('Confusion Matrix')
plt.show()


In [None]:
# Collect raw outputs, thresholded classes and labels side by side.
output_columns = {
    'Predicted': all_predictions.flatten(),
    'Predicted_Class': predicted_classes,
    'Actual': all_labels.flatten(),
}
output_df = pd.DataFrame(output_columns)

# Pairwise correlation between the three columns, shown as a heatmap.
correlation_matrix = output_df.corr()
correlation_axes = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
correlation_axes.set_title('Correlation Matrix')
plt.show()
display(output_df)

In [None]:
# `datetime` is used below but never imported anywhere in this notebook.
from datetime import datetime

# NOTE(review): Alpaca, Backtest and YahooDataBacktesting are not imported or
# defined anywhere in this notebook; they must be brought into scope before
# this cell can run.

os.makedirs("logs", exist_ok=True)
os.makedirs("results", exist_ok=True)

test_data = pd.read_pickle("../Data/test_dax_data.pkl")

print(test_data.dtypes)

# Fit the scaler on the TRAINING features, not on the test data: the model was
# trained on inputs scaled with training-set statistics, so the backtest has to
# apply the same scaling (and must not take its min/max from the evaluated
# period).
# NOTE(review): assumes the training pickle has the same column layout as the
# test pickle, with the model inputs in columns 2..-2 — confirm.
train_reference = pd.read_pickle("../Data/train_dax_data.pkl")
scaler = MinMaxScaler()
scaler.fit(train_reference.iloc[:, 2:-1])

# NOTE(review): the hyperparameter cell sets num_layers = 3 while this cell
# uses 2; load_state_dict below only succeeds if the checkpoint was saved from
# a model with the configuration used here — confirm.
input_size = 8
output_size = 1
hidden_size = 100
num_layers = 2

model = Net(input_size, output_size, hidden_size, num_layers)
model_path = "../Models/best_model.pt"
model.load_state_dict(torch.load(model_path))
model.eval()

# Credentials come from the environment; PAPER selects paper trading.
ALPACA_CREDS = {
    "API_KEY": os.getenv("ALPACA_API_KEY"),
    "API_SECRET": os.getenv("ALPACA_API_SECRET"),
    "PAPER": True,
}

# Strategy setup
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 12, 31)
broker = Alpaca(ALPACA_CREDS)

# Single definition of the strategy parameters so the strategy instance and
# the backtest run cannot drift apart. Each call receives its own copy.
strategy_parameters = {
    "symbol": "^GDAXI",
    "cash_at_risk": 0.5,
    "model": model,
    "num_prior_days": 30,
    "dataset": test_data,
    "scaler": scaler,
}

strategy = Backtest(
    name="Test2.1",
    broker=broker,
    parameters=dict(strategy_parameters),
)

# Run backtest
backtest_results = strategy.backtest(
    YahooDataBacktesting,
    start_date,
    end_date,
    name="Test2.1",
    parameters=dict(strategy_parameters),
    benchmark_asset="SPY",
    show_plot=True,
    show_tearsheet=True,
)

# Save results
backtest_results.to_csv("results/backtest_results.csv.gz", index=False, compression="gzip")

print("Backtesting complete. Results saved to backtest_results.csv.gz.")