In [41]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import spacy

# Load spaCy's English model. The named-entity recognizer and dependency
# parser are disabled; preprocess_text() only reads token.lemma_,
# token.is_stop and token.is_alpha.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Load GloVe embeddings
def load_glove_embeddings(file_path):
    """Read a GloVe-format text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace; the first field is taken
    as the token and the remaining fields as its vector components.

    Args:
        file_path: Path to a UTF-8 encoded text file of embeddings.

    Returns:
        dict mapping each token (``str``) to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a component cannot be converted to ``float32``.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # A blank or whitespace-only line (e.g. a stray newline at the
            # end of the file) yields no fields; skip it instead of failing
            # with IndexError on values[0].
            if not values:
                continue
            word, *components = values
            embeddings[word] = np.asarray(components, dtype='float32')
    return embeddings

# Read the GloVe vectors into a {word: vector} dict via
# load_glove_embeddings(). The path is relative to the current working
# directory; adjust it to wherever the GloVe file lives.
glove_embeddings = load_glove_embeddings("glove/glove.6B.100d.txt")  # Adjust path to the GloVe file

# Preprocess text with stopword removal and lemmatization using spaCy
def preprocess_text(text):
    """Lower-case *text*, run it through spaCy, and return its content lemmas.

    Tokens flagged as stop words and tokens that are not purely alphabetic
    are dropped; the lemma of every remaining token is returned in order.
    """
    lemmas = []
    for tok in nlp(text.lower()):
        # Keep only alphabetic, non-stop-word tokens.
        if tok.is_alpha and not tok.is_stop:
            lemmas.append(tok.lemma_)
    return lemmas

# Locate the Kaggle financial-news sentiment CSV.
path = "/Users/marius/.cache/kagglehub/datasets/ankurzing/sentiment-analysis-for-financial-news/versions/5"  # Adjust path
file_path = os.path.join(path, "all-data.csv")
columns = ["Sentiment", "News Headline"]

# Column names are supplied via names=, so every row of the file is read
# as data.
df = pd.read_csv(file_path, names=columns, encoding='latin-1')

# Switch to short column names, tokenise the headlines, and keep the
# columns in (text, label) order.
df = df.rename(columns={"Sentiment": "label", "News Headline": "text"})
df['text'] = df['text'].apply(preprocess_text)
df = df[['text', 'label']]

# Replace the sentiment labels with integer class ids.
le = LabelEncoder().fit(df['label'])
df['label'] = le.transform(df['label'])

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
train_data, test_data = train_test_split(df, random_state=42, test_size=0.2)

# Function to get GloVe embedding for a word, return zeros if word is not in GloVe
def get_glove_embedding(word, embeddings=None, dim=100):
    """Return the embedding for *word*, or a zero vector if it is unknown.

    Args:
        word: Token to look up.
        embeddings: Optional ``{word: vector}`` mapping to search. When
            omitted, the module-level ``glove_embeddings`` table is used.
        dim: Length of the zero vector returned for unknown words. The
            default of 100 matches the 100-dimensional GloVe file.

    Returns:
        The stored vector when *word* is present; otherwise a freshly
        allocated ``float32`` zero array of length *dim*.
    """
    table = glove_embeddings if embeddings is None else embeddings
    vector = table.get(word)
    if vector is None:
        # Allocate the fallback only on a miss (dict.get's default argument
        # would be built on every call), and use float32 so it has the same
        # dtype as the vectors produced by load_glove_embeddings().
        return np.zeros(dim, dtype='float32')
    return vector

# Encode text to GloVe embeddings
def encode_phrase_with_glove(phrase):
    """Map every token of *phrase* to its vector via get_glove_embedding().

    Returns a list with one vector per token, in the same order as the
    tokens appear in *phrase*.
    """
    vectors = []
    for token in phrase:
        vectors.append(get_glove_embedding(token))
    return vectors

# Replace each token list with its list of embedding vectors, in both splits.
train_data['text'], test_data['text'] = (
    frame['text'].apply(encode_phrase_with_glove)
    for frame in (train_data, test_data)
)

# Padding sequences of GloVe embeddings
def pad_sequence_embeddings(seq, max_length, embedding_dim=100):
    """Return *seq* right-padded (or truncated) to exactly *max_length* items.

    Args:
        seq: Sequence of embedding vectors for one phrase.
        max_length: Target number of time steps; must be non-negative.
        embedding_dim: Length of each zero padding vector. The default of
            100 matches the 100-dimensional GloVe vectors.

    Returns:
        A new list of length *max_length*. The leading items are the first
        ``max_length`` vectors of *seq*; any remaining slots are filled with
        a zero vector of length *embedding_dim*. *seq* itself is not
        modified.

    Raises:
        ValueError: If *max_length* is negative.
    """
    if max_length < 0:
        raise ValueError(f"max_length must be non-negative, got {max_length}")
    # Truncate first: without this, a sequence longer than max_length would
    # be returned at its original length and could not be stacked with the
    # others into a rectangular batch.
    clipped = list(seq[:max_length])
    # One zero vector is shared by all padding slots; treat it as read-only.
    pad_vector = np.zeros(embedding_dim)
    return clipped + [pad_vector] * (max_length - len(clipped))

# Longest token list across the whole dataset (train and test together).
max_length = int(df['text'].apply(len).max())

# Pad every sequence in both splits up to that common length.
train_data['text'], test_data['text'] = (
    frame['text'].apply(pad_sequence_embeddings, args=(max_length,))
    for frame in (train_data, test_data)
)

# Convert data into PyTorch tensors
def prepare_data(df, max_length):
    """Turn the ``text`` and ``label`` columns of *df* into tensors.

    Args:
        df: DataFrame with a ``text`` column of equally-shaped nested
            numeric sequences and a ``label`` column of integer class ids.
        max_length: Accepted for interface compatibility; not used here.

    Returns:
        ``(X, y)`` where ``X`` is a ``float32`` tensor stacked from the
        ``text`` column and ``y`` is an ``int64`` tensor of the labels.
    """
    feature_array = np.array(df['text'].tolist())
    label_array = df['label'].values
    return (
        torch.tensor(feature_array, dtype=torch.float32),
        torch.tensor(label_array, dtype=torch.long),
    )

# Build the feature/label tensors for each split.
(X_train, y_train), (X_test, y_test) = (
    prepare_data(part, max_length) for part in (train_data, test_data)
)

# Wrap the tensors in datasets and mini-batch loaders. Only the training
# loader reshuffles its samples each epoch.
batch_size = 64
train_dataset, test_dataset = (
    TensorDataset(X_train, y_train),
    TensorDataset(X_test, y_test),
)

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Define the RNN model
class RNNModel(nn.Module):
    """Sequence classifier: one ``nn.RNN`` layer followed by a linear head.

    The batch-first input is run through the RNN, the output at the final
    time step is taken, and a linear layer maps it to class logits.

    Args:
        input_dim: Feature size of each time step.
        hidden_dim: Size of the RNN hidden state.
        output_dim: Number of output logits (classes).
        max_length: Stored on the instance; not used by ``forward``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, max_length):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.max_length = max_length

        # Define layers
        self.rnn = nn.RNN(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return logits of shape ``(batch, output_dim)`` for input *x*.

        *x* is expected to be ``(batch, seq_len, input_dim)``.
        """
        # No explicit h0: nn.RNN starts from a zero hidden state created
        # with the input's dtype and device. A hand-built torch.zeros(...)
        # is float32 by default and breaks when the model runs in another
        # precision (e.g. model.double()).
        out, _ = self.rnn(x)

        # Take the output from the last time step.
        # NOTE: this index ignores true sequence lengths, so for
        # right-padded batches it is the state after the padding steps.
        out = out[:, -1, :]

        # Fully connected layer to get output
        return self.fc(out)

# Initialize model
# Sizes are derived from the prepared data rather than hard-coded, so the
# model stays consistent if the embedding file or the label set changes.
input_dim = X_train.shape[-1]  # Embedding dimension (last axis of the features)
hidden_dim = 128  # Number of hidden units in the RNN
output_dim = len(le.classes_)  # Number of distinct labels seen by the encoder
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = RNNModel(input_dim, hidden_dim, output_dim, max_length).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Train for a fixed number of epochs, reporting the mean batch loss of each.
num_epochs = 10

for epoch in range(num_epochs):
    model.train()  # training mode
    running_loss = 0.0

    for inputs, labels in train_loader:
        # Move the mini-batch to the device the model lives on.
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass and loss in one expression.
        loss = criterion(model(inputs), labels)

        # Back-propagate and update the parameters.
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader)}")

# Measure accuracy on the held-out split with gradients disabled.
model.eval()  # evaluation mode
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(inputs)

        # Predicted class = index of the largest logit in each row.
        predicted = outputs.max(dim=1).indices

        total += labels.size(0)
        correct += torch.eq(predicted, labels).sum().item()

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy}%")

Epoch [1/10], Loss: 0.9941961491694216
Epoch [2/10], Loss: 0.9306752974869775
Epoch [3/10], Loss: 0.9266952764792521
Epoch [4/10], Loss: 0.9034421805475579
Epoch [5/10], Loss: 0.8408381714195502
Epoch [6/10], Loss: 0.807283992650079
Epoch [7/10], Loss: 0.7873260808772728
Epoch [8/10], Loss: 0.7887290217837349
Epoch [9/10], Loss: 0.7723253476815145
Epoch [10/10], Loss: 0.7643671915179393
Test Accuracy: 65.87628865979381%
