## LSTM using GloVe embeddings

In [None]:
import pandas as pd

In [None]:
# Load the pre-computed GloVe embeddings from glove.pickle.
# NOTE(review): read_pickle unpickles arbitrary objects -- only load files you trust.
# Presumably the loaded object is indexable by vocabulary index (it is used as
# `glove[vocab[word]]` further down) -- confirm against the script that produced it.
glove = pd.read_pickle('../processed_data/glove.pickle')
# Bare expression: shown as the cell output in a notebook.
glove.shape

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

In [None]:
# Define the LSTM model
class LSTMModel(nn.Module):
    """Multi-layer LSTM followed by a linear layer on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values produced per sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        # Not applied in forward(); kept so existing code that reads
        # `model.softmax` keeps working. forward() returns raw scores.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Run a batch of sequences through the network.

        Args:
            x: Batched input laid out batch-first, i.e.
                (batch, sequence, input_size), as required by
                ``batch_first=True``.

        Returns:
            Tensor of shape (batch, output_size) with the un-normalised
            outputs of the linear layer for the final time step.
        """
        # Build the zero initial states from `x` so they always share its
        # device and dtype (a plain torch.zeros() would stay on the default
        # device/dtype and break once the model is moved or cast).
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)

        out, _ = self.lstm(x, (h0, c0))
        # Only the output of the last time step feeds the classifier.
        return self.fc(out[:, -1, :])

In [None]:
# Network dimensions
input_size, hidden_size = 300, 512  # features per time step, LSTM state width
num_layers = 2   # stacked recurrent layers
output_size = 4  # one output per class
model = LSTMModel(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    output_size=output_size,
)

# Loss function and optimizer used for training.
# NOTE(review): the model returns raw linear outputs and the loss is MSE;
# for classification a cross-entropy loss may be the better fit -- confirm intent.
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
def train(model, criterion, optimizer, train_loader, num_epochs):
    """Train `model` in place for `num_epochs` passes over `train_loader`.

    Args:
        model: Module to optimise; it is switched to training mode.
        criterion: Callable taking (outputs, labels) and returning a scalar loss.
        optimizer: Optimizer already bound to the model's parameters.
        train_loader: Iterable yielding (embeddings, labels) batches.
        num_epochs: Number of full passes over `train_loader`.
    """
    model.train()
    for _ in range(num_epochs):
        for embeddings, labels in train_loader:
            # Clear gradients left over from the previous step; without this
            # backward() keeps adding to them and every update uses the sum
            # of all gradients seen so far.
            optimizer.zero_grad()

            # Forward pass
            outputs = model(embeddings)
            loss = criterion(outputs, labels)

            # Backward pass and parameter update
            loss.backward()
            optimizer.step()

In [None]:
# Load the vocabulary (token -> index mapping) that is used to look up GloVe embeddings.
# NOTE(review): torch.load unpickles the file -- only load trusted files. Recent
# PyTorch releases default to weights_only=True, which may reject a plain
# vocabulary object; confirm against the installed version.
vocab = torch.load('../processed_data/remove-stopwords-punct-25000.vocab')

In [None]:
# Read the raw training set; column names are supplied explicitly as 'label' and 'text'.
# Average text length in fulltrain.csv is 536 (unit not stated here --
# presumably tokens; TODO confirm).
data = pd.read_csv('../raw_data/fulltrain.csv', names=['label', 'text'])

In [None]:
sequence_length = 100 # HYPERPARAMETER


def text_to_sequence(text):
    """Turn a text into a fixed-length array of GloVe vectors.

    Only the first `sequence_length` whitespace-separated tokens are
    examined. Tokens missing from `vocab` are skipped, and the remaining
    rows are filled with 300-wide zero vectors at the end, so the result
    always has `sequence_length` rows.

    Args:
        text: String to embed.

    Returns:
        NumPy array built from the embedding vectors followed by zero padding.
    """
    candidates = text.split()[:sequence_length]
    vectors = [glove[vocab[token]] for token in candidates if token in vocab]
    pad_rows = sequence_length - len(vectors)
    return np.array(vectors + [np.zeros(300)] * pad_rows)

In [None]:
# Embed the corpus in slices of 100 texts instead of a single pass
# (doing it all at once ran into memory limits).
chunk_size = 100
sequences = []
for chunk_start in range(0, len(data), chunk_size):
    text_chunk = data['text'][chunk_start:chunk_start + chunk_size]
    embedded_chunk = np.array(text_chunk.apply(text_to_sequence).tolist())
    # extend() iterates the first axis, adding one array per text
    sequences.extend(embedded_chunk)
# Sanity check: number of embedded texts and rows per text
print(len(sequences), len(sequences[0]))

In [None]:
# One-hot encode the label column, then keep only the underlying array
one_hot = pd.get_dummies(data['label'])
labels = one_hot.values

In [None]:
# train the LSTM model
from torch.utils.data import TensorDataset, DataLoader

# Stack everything into one contiguous float32 array before handing it to torch:
# torch.tensor() on a Python list of ndarrays is very slow, and the zero padding
# makes the sequences float64, which does not match the float32 model weights.
features = torch.from_numpy(np.asarray(sequences, dtype=np.float32))
# The one-hot targets come out of pandas as bool/integer; MSELoss needs floats.
targets = torch.as_tensor(labels, dtype=torch.float32)

train_dataset = TensorDataset(features, targets)
train_loader = DataLoader(train_dataset, batch_size=100, shuffle=True)
train(model, criterion, optimizer, train_loader, 10)