In [None]:
import pandas as pd
import numpy as np

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AlbertTokenizer, AlbertModel, AlbertForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutput

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

import matplotlib.pyplot as plt
import random


In [None]:
# Plotting Function
def plot_metrics(train_losses, val_losses, val_accuracies):
    """Show two figures: loss curves and validation accuracy.

    Args:
        train_losses: Sequence of training-loss values, one per plotted point.
        val_losses: Sequence of validation-loss values, one per plotted point.
        val_accuracies: Sequence of validation-accuracy values.

    Each sequence is plotted against its index; the x-axis is labelled
    "Epochs". Nothing is returned; both figures are displayed via plt.show().
    """
    # Figure 1: training and validation loss on shared axes.
    plt.figure(figsize=(10, 5))
    loss_curves = (
        (train_losses, "Training Loss"),
        (val_losses, "Validation Loss"),
    )
    for series, legend_label in loss_curves:
        plt.plot(series, label=legend_label)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.title("Training and Validation Loss")
    plt.legend()
    plt.show()

    # Figure 2: validation accuracy on its own axes.
    plt.figure(figsize=(10, 5))
    plt.plot(val_accuracies, label="Validation Accuracy", color="green")
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.title("Validation Accuracy")
    plt.legend()
    plt.show()

In [None]:
# Set a seed for reproducibility
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy's legacy global RNG and PyTorch, and
    configures cuDNN for deterministic behaviour so repeated runs with the
    same seed are comparable.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Explicitly seed every visible CUDA device as well.
        torch.cuda.manual_seed_all(seed)
    # cuDNN autotuning (benchmark=True) may pick different kernels from run to
    # run, which defeats seeding; request deterministic kernels instead.
    # These flags only matter when a cuDNN backend is in use.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
# Custom Dataset Class
class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    `encodings` is any mapping whose values are indexable per example, and
    `labels` is an indexable, sized collection aligned with those values.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # Dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice each encoding field at `idx`, then attach the matching label
        # under the "labels" key.
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        sample["labels"] = self.labels[idx]
        return sample


In [None]:
# Custom Model Class
class CustomModel(torch.nn.Module):
    """Pretrained sequence classifier with dropout applied to its logits.

    Args:
        model_name: Identifier passed to
            `AutoModelForSequenceClassification.from_pretrained`.
        num_labels: Number of output classes for the classification head.
    """

    def __init__(self, model_name, num_labels=2):
        super().__init__()
        # Alternative that was tried here:
        # AlbertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        self.base_model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
        )
        self.dropout = torch.nn.Dropout(p=0.2)

    def forward(self, **kwargs):
        """Run the base model and return its outputs with dropout on the logits.

        All keyword arguments are forwarded unchanged to the base model. The
        returned `SequenceClassifierOutput` carries only `logits`,
        `hidden_states` and `attentions`; any loss produced by the base model
        is not propagated.
        """
        base_out = self.base_model(**kwargs)

        # NOTE(review): dropout acts directly on the final class logits, so in
        # training mode individual logits can be zeroed. Confirm this is the
        # intended regularisation rather than dropout before the classifier.
        return SequenceClassifierOutput(
            logits=self.dropout(base_out.logits),
            hidden_states=base_out.hidden_states,
            attentions=base_out.attentions,
        )

In [None]:
# Tokenize Data
def tokenize_data(data, tokenizer, max_length=None):
    """Tokenize the "title" column of a DataFrame into padded PyTorch tensors.

    Args:
        data: DataFrame-like object with a "title" column of texts.
        tokenizer: Callable tokenizer accepting a list of strings plus the
            `padding`, `truncation`, `max_length` and `return_tensors`
            keyword arguments.
        max_length: Maximum sequence length for truncation. When None, the
            module-level `MAX_LEN` constant is used.

    Returns:
        Whatever `tokenizer` returns for the given texts.
    """
    # Resolve the default at call time: MAX_LEN is assigned in a later cell,
    # so using it as a default-argument value would raise NameError when this
    # cell is executed on a fresh kernel.
    if max_length is None:
        max_length = MAX_LEN
    return tokenizer(
        data["title"].tolist(),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

In [None]:
# Evaluation Function
def evaluate_model(model, data_loader, device, loss_fn):
    """Run `model` over `data_loader` without gradients and collect results.

    Args:
        model: Module called as `model(**batch)`; its output must expose
            `.logits`.
        data_loader: Sized iterable of dict batches containing a "labels"
            entry; every value must support `.to(device)`.
        device: Device each batch is moved to.
        loss_fn: Callable `(logits, labels) -> loss` whose result has `.item()`.

    Returns:
        Tuple `(predictions, true_labels, avg_loss)`: two flat lists of
        per-example values and the mean of the per-batch losses.

    Raises:
        ZeroDivisionError: If `data_loader` has length zero.
    """
    model.eval()
    all_preds = []
    all_targets = []
    running_loss = 0  # Sum of per-batch validation losses

    with torch.no_grad():
        for raw_batch in data_loader:
            batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            logits = model(**batch).logits
            targets = batch["labels"]

            running_loss += loss_fn(logits, targets).item()

            # Predicted class is the index of the largest logit.
            batch_preds = torch.argmax(logits, dim=-1)
            all_preds.extend(batch_preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    # Average over batches, not over individual examples.
    mean_loss = running_loss / len(data_loader)

    return all_preds, all_targets, mean_loss

In [None]:
# Training Function
def train_model(model, train_loader, val_loader, optimizer, device, epochs=None,
                loss_fn=None, checkpoint_path="best_model.pth"):
    """Fine-tune `model`, validating after every epoch and saving the best state.

    Args:
        model: Module called as `model(**batch)`; its output must expose
            `.logits`.
        train_loader: Sized iterable of dict batches with a "labels" entry.
        val_loader: Validation batches, passed to `evaluate_model`.
        optimizer: Optimizer over `model`'s parameters.
        device: Device each batch is moved to.
        epochs: Number of epochs. When None, the module-level `EPOCHS`
            constant is used.
        loss_fn: Loss callable `(logits, labels) -> loss`. When None, a
            `CrossEntropyLoss` weighted by the module-level `class_weights`
            tensor is built.
        checkpoint_path: File the best-scoring `state_dict` is written to.

    Returns:
        Tuple `(train_losses, val_losses, val_accuracies)`, one entry per epoch.
    """
    # Defaults are resolved at call time: EPOCHS and class_weights are defined
    # in later cells, so referencing EPOCHS in the signature would raise
    # NameError when this cell runs on a fresh kernel.
    if epochs is None:
        epochs = EPOCHS
    if loss_fn is None:
        loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights.to(device))

    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint, even if validation accuracy is exactly 0.
    best_accuracy = float("-inf")

    train_losses, val_losses, val_accuracies = [], [], []

    for epoch in range(epochs):
        # Training phase
        model.train()
        total_loss = 0
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()
            outputs = model(**batch)
            loss = loss_fn(outputs.logits, batch["labels"])
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Mean of per-batch losses for this epoch.
        avg_train_loss = total_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss:.4f}")

        # Validation phase: model selection uses val_loader only.
        predictions, true_labels, avg_val_loss = evaluate_model(model, val_loader, device, loss_fn)
        accuracy = accuracy_score(true_labels, predictions)

        val_losses.append(avg_val_loss)
        val_accuracies.append(accuracy)

        print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {accuracy * 100:.2f}%")

        # Keep the weights of the epoch with the highest validation accuracy.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), checkpoint_path)
    return train_losses, val_losses, val_accuracies

In [None]:
# Hyperparameters
# NOTE: any function that uses one of these constants as a default-argument
# value must be defined after this cell has run, because Python evaluates
# default values at definition time.
LEARNING_RATE = 1e-5  # learning rate passed to the AdamW optimizer
EPOCHS = 15  # number of training epochs
BATCH_SIZE = 32  # batch size for the train/val/test DataLoaders
MAX_LEN = 256  # maximum token length used when tokenizing titles
WEIGHT_DECAY = 0.0005  # weight decay passed to the AdamW optimizer
SEED = 42  # seed for set_seed and the train/val/test splits
# Seed all RNGs once, before any data splitting or model construction.
set_seed(SEED)

In [None]:
# Datasets
# Load the news CSV; later cells read its "title" and "movement" columns.
df = pd.read_csv("merged_data/eurusd_daily_news.csv")
# Alternative input file:
# df = pd.read_csv("merged_data/eurusd_15min_data.csv")

In [None]:
# Tokenizers and Models
# Checkpoints that were tried; exactly one `model_name` should be active.
# model_name = "microsoft/MiniLM-L12-H384-uncased"
# model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
# model_name = "albert/albert-base-v2"
model_name = "huawei-noah/TinyBERT_General_4L_312D"

# Tokenizer and model are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer = AlbertTokenizer.from_pretrained('albert/albert-base-v2')

# Two-class classifier wrapped in CustomModel.
model = CustomModel(model_name, num_labels=2)
# Keep only the text and label columns; .copy() detaches the result from `df`.
df_cropped = df[["title", "movement"]].copy()

In [None]:
# Split into Train, Validation and Test (70% / 15% / 15%), stratified on "movement"
# First split holds out 30% of the rows; the second halves that hold-out.
train_df, temp_df = train_test_split(df_cropped, test_size=0.3, stratify=df_cropped["movement"], random_state=SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["movement"], random_state=SEED)

# Tokenize Data
# Each split is tokenized separately, so padding is applied per split.
train_encodings = tokenize_data(train_df, tokenizer)
val_encodings = tokenize_data(val_df, tokenizer)
test_encodings = tokenize_data(test_df, tokenizer)

# Convert Labels to Tensors
# NOTE(review): assumes "movement" holds integer class ids (0/1) -- confirm
# against the CSV, since CrossEntropyLoss expects class-index targets.
train_labels = torch.tensor(train_df["movement"].values)
val_labels = torch.tensor(val_df["movement"].values)
test_labels = torch.tensor(test_df["movement"].values)

# Create Dataset Objects
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)

# Create Dataloaders
# Only the training loader shuffles; val/test keep their original order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)  
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)  


In [None]:
# Class Weights
# "balanced" weights are computed from the training labels only and are used
# to weight the cross-entropy loss inside train_model.
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_df["movement"]),
    y=train_df["movement"]
)
class_weights = torch.tensor(class_weights, dtype=torch.float)

# Training Setup
# Use Apple's MPS backend when available, otherwise fall back to the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)
print(f"MPS Available: {torch.backends.mps.is_available()}")

# Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Train the Model

# Release cached MPS memory before training. Guarded because the device falls
# back to CPU above, where there is no MPS allocator to clear.
if device.type == "mps":
    torch.mps.empty_cache()

print("Fine-Tuning the Model...")
train_losses, val_losses, val_accuracies = train_model(
    model, train_loader, val_loader, optimizer, device
)
# Plot Metrics
plot_metrics(train_losses, val_losses, val_accuracies)

In [None]:
# Evaluate Fine-Tuned Model
# Load the best model (selected based on validation accuracy).
# map_location places the checkpoint tensors on the current device, so a file
# saved on one backend (e.g. MPS) can still be loaded where it is unavailable.
model.load_state_dict(torch.load("best_model.pth", map_location=device))

# Evaluate on Test Set (Final Evaluation)
# The loss returned here is discarded; an unweighted CrossEntropyLoss is passed
# only because evaluate_model requires a loss function.
print("Evaluating Fine-Tuned Model on Test Set...")
test_predictions, test_true_labels, _ = evaluate_model(model, test_loader, device, torch.nn.CrossEntropyLoss())
test_accuracy = accuracy_score(test_true_labels, test_predictions)
print(f"Test Set Accuracy: {test_accuracy * 100:.2f}%")

# Print Full Classification Report
# NOTE(review): target_names assumes label 0 means 'down' and label 1 means
# 'up' -- confirm against how "movement" is encoded in the CSV.
print(classification_report(test_true_labels, test_predictions, target_names=['down', 'up']))