In [5]:
import torch
import torch.nn as nn

class SentimentRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout=0.2, activationfn='tanh',):
        super(SentimentRNN, self).__init__()
        
        # RNN layer
        self.rnn = nn.RNN(
            input_size, 
            hidden_size, 
            num_layers, 
            batch_first=True, 
            dropout=dropout, 
            nonlinearity=activationfn, 
            bias=True
            )
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_size, output_size)
        
        # Sigmoid activation for output
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Pass input through RNN
        out, hidden = self.rnn(x)
        
        out = torch.mean(out, dim=1) # mean over the sequence length
        
        # Pass through the fully connected layer and activation
        out = self.fc(out)
        out = self.sigmoid(out)
        
        return out


In [6]:
# Define model parameters
input_size = 100         # Dimension of Word2Vec embeddings
hidden_size = 32        # Number of hidden units
output_size = 1          # Output size (1 for binary classification)
num_layers = 1           # Number of stacked RNN layers
dropout = 0.5            # Dropout rate for regularization

# Initialize the model
model = SentimentRNN(input_size, hidden_size, output_size, num_layers, dropout)

# Print the model architecture to verify
print(model)


SentimentRNN(
  (rnn): RNN(100, 32, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=32, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)




In [49]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np

class SentimentDataset(Dataset):
    def __init__(self, dataset, word2vec_model, max_length=100):
        self.dataset = dataset
        self.word2vec = word2vec_model
        self.max_length = max_length  # Maximum sequence length for padding

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Get text and label
        text = self.dataset[idx]['text']
        label = self.dataset[idx]['label']
        
        # Convert text to embeddings
        tokens = text # Assuming text is tokenized already
        embeddings = [self.word2vec.wv[word] for word in tokens if word in self.word2vec.wv]
        
        # Truncate sequences - will pad later
        if len(embeddings) > self.max_length:
            embeddings = embeddings[:self.max_length]
        
        embeddings = np.array(embeddings)
        
        return torch.tensor(embeddings, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)

def collate_fn(batch):
    # Separate embeddings and labels
    embeddings = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    
    # Stack them into tensors
    embeddings = pad_sequence(embeddings, batch_first=True)
    labels = torch.stack(labels)
    
    return embeddings, labels


In [50]:
from gensim.models import Word2Vec
from datasets import load_from_disk

# This is the training dataset
path_to_train_set = r"C:\Users\All Saints\Desktop\Uni mods\SC4002\SC4002_NLP_Project\tokenised_datasets\tokenised_train_dataset"
train_dataset = load_from_disk(path_to_train_set)

# This is the word2vec model
word2vec_model = Word2Vec.load('word2vec.model')

# Assuming 'train_dataset' is your dataset and 'word2vec_model' is your Word2Vec model
train_data = SentimentDataset(train_dataset, word2vec_model)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate_fn)


In [56]:
c=0
for embeddings, labels in train_loader:
    print(embeddings.shape, labels.shape)
    c+=1
    if c==4:
        break


torch.Size([32, 35, 100]) torch.Size([32])
torch.Size([32, 41, 100]) torch.Size([32])
torch.Size([32, 47, 100]) torch.Size([32])
torch.Size([32, 34, 100]) torch.Size([32])


In [53]:
import torch.optim as optim

# Binary Cross-Entropy Loss with Logits
criterion = nn.BCEWithLogitsLoss()

# Define the optimizer with the model's parameters and a learning rate
learning_rate = 0.0001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [57]:
import matplotlib.pyplot as plt

# Set the model to training mode
model.train()

# Training parameters
num_epochs = 20
train_losses = []
train_accuracies = []

for epoch in range(num_epochs):
    epoch_loss = 0.0
    correct_predictions = 0
    total_samples = 0
    
    for embeddings, labels in train_loader:
        # Move data to the same device as model (GPU if available)
        # embeddings, labels = embeddings.to('cuda'), labels.to('cuda')
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(embeddings).squeeze()  # Output shape is (batch_size,)
        print(labels)
        print(outputs)
        
        # Compute the loss
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        
        # Accumulate the loss and accuracy
        epoch_loss += loss.item() * labels.size(0)  # Multiply by batch size for total loss
        predictions = (outputs >= 0.5).float()  # Threshold at 0.5
        correct_predictions += (predictions == labels).sum().item()
        total_samples += labels.size(0)

    # Calculate average loss and accuracy for the epoch
    avg_loss = epoch_loss / total_samples
    accuracy = correct_predictions / total_samples
    
    # Store metrics for plotting
    train_losses.append(avg_loss)
    train_accuracies.append(accuracy)
    
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

# Plot the training loss and accuracy over epochs
plt.figure(figsize=(12, 5))

# Plot Loss
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Training Loss')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training Loss Over Epochs")
plt.legend()

# Plot Accuracy
plt.subplot(1, 2, 2)
plt.plot(train_accuracies, label='Training Accuracy')
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.title("Training Accuracy Over Epochs")
plt.legend()

plt.show()


tensor([1., 1., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1.,
        1., 0., 1., 0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 1.])
tensor([2.0245e-05, 2.1510e-05, 2.0864e-05, 2.1818e-05, 1.8418e-05, 1.8555e-05,
        1.8987e-05, 1.8906e-05, 2.0179e-05, 1.7594e-05, 1.9125e-05, 1.8726e-05,
        1.8924e-05, 1.7987e-05, 1.9980e-05, 1.8095e-05, 1.9314e-05, 1.8888e-05,
        1.9956e-05, 2.1113e-05, 1.7419e-05, 1.8268e-05, 2.6479e-05, 1.7922e-05,
        1.9538e-05, 1.7343e-05, 1.7827e-05, 2.3053e-05, 1.8282e-05, 1.9066e-05,
        1.8350e-05, 1.8189e-05], grad_fn=<SqueezeBackward0>)


RuntimeError: No active exception to reraise