# Test 2.2 - News Linear

### Import

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from backtesting import Backtest
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import os
from datetime import datetime
from lumibot.brokers import Alpaca
from lumibot.backtesting import YahooDataBacktesting

### Device

In [None]:
# Choose the compute backend in order of preference: CUDA first, then Apple's
# MPS backend, and finally the CPU. The MPS check is only evaluated when CUDA
# is not available.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

print(f"Using device: {device}")

### Hyperparameters

In [None]:
# Model parameters
input_size, output_size = 7, 1  # feature count in, prediction count out
hidden_size = 2048              # width of each hidden layer
num_layers = 20                 # number of hidden layers
dropout = 0.3                   # dropout rate

# Training parameters
num_epochs = 0                  # 0 means the training loop body never runs
learning_rate = 1e-3

### Linear Model

In [None]:
class Net(nn.Module):
    """Fully connected network with arctan activations.

    The network consists of an input projection, ``num_layers`` square hidden
    layers, and a linear output head. The output of every layer except the
    head goes through ``torch.arctan`` and then a single shared dropout module.

    Args:
        input_size: Number of features per input row.
        output_size: Number of values produced per input row.
        hidden_size: Width of the input projection and of every hidden layer.
        num_layers: Number of ``hidden_size x hidden_size`` layers stacked
            after the input projection.
        dropout_rate: Probability handed to ``nn.Dropout``.
    """

    def __init__(self, input_size, output_size, hidden_size, num_layers, dropout_rate=0.5):
        super().__init__()

        # The attribute names below become the state_dict keys, so they must
        # stay stable for previously saved checkpoints to keep loading.
        self.layer_1 = nn.Linear(input_size, hidden_size)
        self.dropout = nn.Dropout(p=dropout_rate)

        hidden_stack = []
        for _ in range(num_layers):
            hidden_stack.append(nn.Linear(hidden_size, hidden_size))
        self.hidden_layers = nn.ModuleList(hidden_stack)

        self.output_layer = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run ``x`` through all layers and return the raw head output.

        No activation is applied to the output head.
        """
        # The input projection and the hidden layers share the same
        # "linear -> arctan -> dropout" treatment, so handle them in one loop.
        for linear in (self.layer_1, *self.hidden_layers):
            x = self.dropout(torch.arctan(linear(x)))

        return self.output_layer(x)

### Data Preparation

In [None]:
# Train Data
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects; only load
# pickle files that come from a trusted source.
df_train = pd.read_pickle("../Data/spy_train_data.pkl")

# Every column except the last is used as a feature; the last column is the label.
inputs = torch.tensor(df_train.iloc[:, :-1].values)
labels = torch.tensor(df_train.iloc[:, -1].values)

# Normalize the data (currently disabled: the raw features are used as-is)
# min_max_scaler = MinMaxScaler()

# Fit the scaler on the training data and transform the training data
# min_max_scaler.fit(inputs)
# inputs_scaled = min_max_scaler.transform(inputs)

# `inputs` is already a tensor. Copy it with clone().detach() rather than
# torch.tensor(), which emits a UserWarning when copy-constructing from a tensor.
inputs_scaled = inputs.clone().detach()


In [None]:
# Test Data
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects; only load
# pickle files that come from a trusted source.
df_test = pd.read_pickle("../Data/spy_test_data.pkl")


def _parse_tensor_repr(text):
    """Return the leading number of a tensor repr string as a float.

    Handles both ``"tensor(0.9, <more fields>)"`` and the bare
    ``"tensor(0.9)"`` form; without stripping the closing parenthesis the
    latter would make ``float`` raise ``ValueError``.
    """
    first_field = text.removeprefix("tensor(").split(",")[0]
    return float(first_field.rstrip(")"))


# The news_probability column holds tensor reprs as strings; convert to floats.
df_test["news_probability"] = df_test["news_probability"].apply(_parse_tensor_repr)

# Every column except the last is used as a feature; the last column is the label.
inputs_test = torch.tensor(df_test.iloc[:, :-1].values)
labels_test = torch.tensor(df_test.iloc[:, -1].values)

# Normalize the data (currently disabled: the raw features are used as-is)
# min_max_scaler = MinMaxScaler()

# Disabled: fit a scaler on the test inputs and transform them
# min_max_scaler.fit(inputs_test)
# inputs_scaled_test = min_max_scaler.transform(inputs_test)

# `inputs_test` is already a tensor. Copy it with clone().detach() rather than
# torch.tensor(), which emits a UserWarning when copy-constructing from a tensor.
inputs_scaled_test = inputs_test.clone().detach()


### Init

In [None]:
# Initialize model, loss function, optimizer
# Pass the configured dropout rate explicitly. Without it Net falls back to
# its own default (0.5) and the `dropout` hyperparameter is silently ignored.
net = Net(input_size, output_size, hidden_size, num_layers, dropout_rate=dropout)
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate)

print(net)

### Training

In [None]:
os.makedirs("Models", exist_ok=True)

loss_vals = []
test_loss_vals = []

# Cast once up front: these tensors never change during training, so casting
# inside the loop would only repeat the same conversion every epoch.
train_x, train_y = inputs_scaled.float(), labels.float()
test_x, test_y = inputs_scaled_test.float(), labels_test.float()

for epoch in range(num_epochs):
    # Full-batch training step
    net.train()
    optimizer.zero_grad()

    outputs = net(train_x).squeeze(-1)  # drop the trailing output dim to match labels
    loss = criterion(outputs, train_y)

    loss.backward()
    optimizer.step()

    loss_vals.append(loss.item())

    # Evaluate on the test set with dropout disabled and no gradient tracking
    net.eval()
    with torch.no_grad():
        test_outputs = net(test_x).squeeze(-1)
        test_loss = criterion(test_outputs, test_y)
        test_loss_vals.append(test_loss.item())

    # Save model after each epoch
    model_path = f'Models/model-{epoch + 1}.pt'
    torch.save(net.state_dict(), model_path)

    # Halve the learning rate every 10 epochs. Scale the optimizer's own value
    # instead of the global `learning_rate`, so the configured hyperparameter
    # is not overwritten and a freshly built optimizer starts from it again.
    if (epoch + 1) % 10 == 0:
        for param_group in optimizer.param_groups:
            param_group['lr'] *= 0.5

    # Progress report on epochs 1, 6, 11, ...
    if epoch % 5 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}, Test Loss: {test_loss.item():.4f}')

# Final evaluation; also runs when the loop above executed zero epochs.
net.eval()
with torch.no_grad():
    outputs_test = net(test_x).squeeze(-1)
    loss_test = criterion(outputs_test, test_y)
    print(f'Final Test Loss: {loss_test.item():.4f}')


In [None]:
import matplotlib.pyplot as plt

# Plotting the loss values (one point per completed epoch)
plt.figure(figsize=(10, 6))
plt.plot(range(len(loss_vals)), loss_vals, label='Training Loss', color='red')
plt.plot(range(len(test_loss_vals)), test_loss_vals, label='Test Loss', color='blue')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Test Loss over Epochs')
plt.legend()
plt.grid(True)
plt.show()

# Printing the final loss values
# loss_vals is empty when training ran for zero epochs; indexing [-1] on it
# would raise IndexError, so report that case explicitly instead.
if loss_vals:
    print(f'Final Training Loss: {loss_vals[-1]:.4f}')
else:
    print('Final Training Loss: n/a (no training epochs were run)')
print(f'Final Test Loss: {loss_test.item():.4f}')


### Backtesting

In [None]:
os.makedirs("logs", exist_ok=True)
os.makedirs("results", exist_ok=True)

# Consider only regular, non-hidden files as checkpoints, so stray entries such
# as editor/notebook checkpoint directories do not crash torch.load. Sorting
# gives a reproducible run order (os.listdir order is arbitrary).
models = sorted(
    f for f in os.listdir("Models/")
    if not f.startswith(".") and os.path.isfile(os.path.join("Models", f))
)
# Result files are written as "backtest-<model file name>.csv.gz"; strip that
# wrapper to recover the names of the models that were already backtested.
already_done = {
    f.removeprefix("backtest-").removesuffix(".csv.gz")
    for f in os.listdir("results/")
}
print(len(models))
print(len(already_done))

# NOTE(review): this path uses "../../Data" while the data-preparation cells
# read from "../Data" - confirm which location is correct.
test_data = pd.read_pickle("../../Data/spy_test_data.pkl")

# Loop-invariant setup: credentials and the backtest window are the same for
# every model, so build them once.
ALPACA_CREDS = {
    "API_KEY": os.getenv("ALPACA_API_KEY"),
    "API_SECRET": os.getenv("ALPACA_API_SECRET"),
    "PAPER": True,
}
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 12, 31)

for model_name in models:
    # Skip Backtesting for Models which already did the Backtesting
    if model_name in already_done:
        print(f"Skip model: {model_name}")
        continue

    model_path = f"Models/{model_name}"
    model = Net(input_size, output_size, hidden_size, num_layers)

    # Load state_dict only
    # NOTE(review): weights_only=False allows arbitrary unpickling. The files
    # written by the training cell are plain state_dicts, so weights_only=True
    # is likely sufficient - confirm before loading untrusted checkpoints.
    model.load_state_dict(torch.load(model_path, weights_only=False))
    model.eval()

    # Strategy setup
    broker = Alpaca(ALPACA_CREDS)

    # Instantiate and run the strategy
    strategy = Backtest(
        name=model_name,
        broker=broker,
        parameters={
            "symbol": "SPY",
            "cash_at_risk": 0.5,
            "model": model,
            "num_prior_days": 1,
            "dataset": test_data,
        },
    )

    # Run the backtest
    backtest_results = strategy.backtest(
        YahooDataBacktesting,
        start_date,
        end_date,
        name=model_name,
        parameters={
            "symbol": "SPY",
            "cash_at_risk": 0.5,
            "model": model,
            "dataset": test_data,
        },
        benchmark_asset="SPY",
        show_plot=True,
        show_tearsheet=True,
    )

    # Convert the results to a DataFrame and tag every row with the model name
    # (no emptiness check is performed here).
    backtest_results = pd.DataFrame(backtest_results)
    backtest_results["model"] = model_name

    # One compressed CSV per model; its presence marks the model as done on
    # the next run of this cell.
    backtest_results.to_csv(f"results/backtest-{model_name}.csv.gz", index=False, compression='gzip')

print("Backtesting complete. Results saved to results/backtest-<model>.csv.gz.")

In [None]:
# Load every per-model result file and stack them into one frame ordered by
# model name. Only "*.csv.gz" entries are read, so unrelated files or
# directories inside results/ do not break read_csv.
result_files = sorted(f for f in os.listdir("results/") if f.endswith(".csv.gz"))
dfs = [pd.read_csv(f"results/{f}", compression="gzip") for f in result_files]
df = pd.concat(dfs).sort_values(by=["model"])
#df = pd.read_csv("results.csv")
# df is already sorted by model, so it can be displayed directly.
display(df)

In [None]:

# Keep the row(s) whose total_return equals the overall maximum and renumber
# them from 0 so the first match can be addressed by label 0.
is_top_return = df["total_return"] == df["total_return"].max()
best_model = df[is_top_return].reset_index()

model_name = best_model.at[0, "model"]
print(f"Best model: {model_name}")

In [None]:
import pandas as pd
import torch
from sklearn.preprocessing import MinMaxScaler
import numpy as np

model_path = f"Models/{model_name}"
model = Net(input_size, output_size, hidden_size, num_layers)

# Load state_dict only
model.load_state_dict(torch.load(model_path))  # Do not use weights_only
model.eval()

# Load the evaluation data (note: this reads the *training* pickle)
data = pd.read_pickle('../../Data/spy_train_data.pkl')

# Every column except the last is used as a feature; the last column is the label.
inputs = torch.tensor(data.iloc[:, :-1].values, dtype=torch.float32)
labels = torch.tensor(data.iloc[:, -1].values, dtype=torch.float32)

# Inference only: no_grad avoids recording an autograd graph for the whole
# dataset, which would only cost memory here.
with torch.no_grad():
    out = model(inputs)

# Predictions keep the model's (rows, output_size) shape; labels are 1-D.
all_predictions = out.numpy()
all_labels = labels.numpy()

# Print or analyze the predictions
print(f'Predicted values: {all_predictions.flatten()}')
print(f'Actual values: {all_labels.flatten()}')

output_df = pd.DataFrame({'Predicted': all_predictions.flatten(), 'Actual': all_labels.flatten()})


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Turn the raw model outputs into class labels: strictly above the threshold
# becomes 1, everything else 0.
threshold = 0.5
flat_predictions = all_predictions.flatten()
predicted_classes = (flat_predictions > threshold).astype(int)
print(predicted_classes)

# Score the thresholded predictions against the flattened labels.
y_true = all_labels.flatten()
accuracy, precision, recall, f1 = (
    metric(y_true, predicted_classes)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the flattened labels against the thresholded predictions,
# drawn as an annotated heatmap with integer cell labels.
cm = confusion_matrix(all_labels.flatten(), predicted_classes)
heatmap_axes = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
heatmap_axes.set_xlabel('Predicted')
heatmap_axes.set_ylabel('Actual')
heatmap_axes.set_title('Confusion Matrix')
plt.show()


In [None]:
# Collect raw predictions, thresholded classes and labels side by side.
columns = {
    'Predicted': all_predictions.flatten(),
    'Predicted_Class': predicted_classes,
    'Actual': all_labels.flatten(),
}
output_df = pd.DataFrame(columns)

# Pairwise correlation between the three columns, shown as an annotated heatmap.
correlation_matrix = output_df.corr()
corr_axes = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
corr_axes.set_title('Correlation Matrix')
plt.show()
display(output_df)