In [1]:
import numpy as np
import pandas as pd
import torch
import string
import re
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from sklearn.preprocessing import LabelEncoder
from torchtext.vocab import build_vocab_from_iterator
import torchtext.vocab as vocab

In [2]:
training_df = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv', header=None)
training_df.shape

(74682, 4)

In [3]:
print(f'Shape before dropping nulls {training_df.shape}')
training_df = training_df.dropna()
print(f'Shape after dropping nulls {training_df.shape}')

Shape before dropping nulls (74682, 4)
Shape after dropping nulls (73996, 4)


In [4]:
input_texts = training_df[3].tolist()
print(len(input_texts))

tokenizer = get_tokenizer('basic_english')

tokenized_texts = [tokenizer(text) for text in input_texts]
# tokenized_texts = tokenized_texts



# flattened_list = sum(tokenized_texts, [])
flattened_list = [token for tokens in tokenized_texts for token in tokens]

vocab = sorted(list(set(flattened_list)))
word_to_id = {word:i+1 for i, word in enumerate(vocab)}
id_to_word = {i+1:word for i, word in enumerate(vocab)}

le = LabelEncoder()
training_df['Labels']  = le.fit_transform(training_df[2])
output_y = training_df['Labels'].tolist()

73996


In [5]:
encode_text = lambda x: [word_to_id[_] for _ in x]

encoded_inputs = list(map(encode_text, tokenized_texts))
padded = pad_sequence(list(map(torch.tensor, encoded_inputs)), batch_first=True)
output_y = torch.tensor(output_y, dtype=torch.float32).unsqueeze(-1)
output_y = output_y.type(torch.LongTensor)
padded.shape, output_y.shape

(torch.Size([73996, 311]), torch.Size([73996, 1]))

In [6]:
class Embedding(nn.Module):
    def __init__(self, n_vocab, n_embed):
        super().__init__()
        self.embedding_layer = nn.Embedding(n_vocab, n_embed)
        
    def forward(self, x):
        return self.embedding_layer(x)

In [7]:
class LSTMClassifier(nn.Module):
    def __init__(self, n_vocab, n_embed, n_hidden, timesteps, output):
        super().__init__()
        self.embedding = Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, batch_first=True)
        self.inter1 = nn.Linear(n_hidden, output)

    def forward(self, x):
        embeded = self.embedding(x)
        out, (h, c) = self.lstm(embeded)
        out = self.inter1(out[:, -1])
        return out
    

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [9]:
split = 0.8
train_size = int(split * padded.shape[0])

x_train, x_val, y_train, y_val = padded[:train_size], padded[train_size:], output_y[:train_size], output_y[train_size:]
x_train, x_val, y_train, y_val = x_train.to(device), x_val.to(device), y_train.to(device), y_val.to(device)



In [10]:
def train_epoch(x_batch, y_batch):
    optimizer.zero_grad()
    output = model(x_batch)
    
    loss = loss_function(output, y_batch.view(-1))

    loss.backward()
    optimizer.step()
    
    return loss

In [12]:
n_embed = 128
timesteps = padded.shape[-1]
n_hidden = 64
# model = Encoder(len(vocab) + 1, n_embed, timesteps, head_size=16, output=4)
model = LSTMClassifier(len(vocab) + 1, n_embed, n_hidden, timesteps, 4)
batch_size = 32
batch_per_epoch = x_train.shape[0] // batch_size

# loss_function = nn.BCELoss()
loss_function = nn.CrossEntropyLoss()
learning_rate = 0.001 
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Set the device (CPU or GPU)
model.to(device)
# model = torch.nn.DataParallel(model, device_ids = [0,1]).to(device)
# output_y = output_y.type(torch.LongTensor)
# inputs, targets, masked_reshape = padded.to(device), output_y.to(device), masked_reshape.to(device)

for epoch in range(10):
    train_loss = 0
    for i in range(batch_per_epoch):
        start = i * batch_size
        x_batch, y_batch = x_train[start:start+batch_size], y_train[start:start+batch_size]

        model.train(True)
        loss = train_epoch(x_batch, y_batch)
        train_loss += loss
        
    print(f'Epoch {epoch} Loss: {train_loss / (i+1)}')   
    model.eval()
    with torch.no_grad():
        output_val = model(x_val)
        loss_val = loss_function(output_val, y_val.view(-1))
        print(f'Epoch {epoch} Val loss: {loss_val}')
        
    print()

Epoch 0 Loss: 1.3612509965896606


OutOfMemoryError: CUDA out of memory. Tried to allocate 21.97 GiB. GPU 0 has a total capacty of 14.75 GiB of which 6.55 GiB is free. Process 5761 has 8.19 GiB memory in use. Of the allocated memory 8.02 GiB is allocated by PyTorch, and 26.90 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF