<a href="https://colab.research.google.com/github/Lecon-a/start_LLM_skill/blob/main/sentiment_analysis_with_lstm_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## IMDb Sentiment Analysis using LSTM PyTorch

In [1]:
# download dataset
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/519.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.8/519.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-

## Import Required Modules

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm

import functools
import sys

import datasets
import matplotlib.pyplot as plt
import numpy as np

### Set Seed and enable GPU

In [3]:
torch.manual_seed(42) # This helps to control randomness in PyTorch

<torch._C.Generator at 0x7c0cc00f0b50>

#### Important point to note

To switch to CUDA (GPU), follow the below steps:
- Click on Runtime
- Click on `Change Runtime type`
- Select Hardware Accelerator as `T4 GPU`

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
device

device(type='cuda')

Download the Dataset using datasets library by HuggingFace

In [6]:
train_data, test_data = datasets.load_dataset('imdb', split=['train', 'test'])

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

## Torchtext

Torchtext is a library made for NLP lovers. This contains most of the pre-processing required for Text data

## Tokenize the sequences

In [7]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

In [9]:
def tokenize_data(example, tokenizer, max_length):
  # print(example)
  tokens = tokenizer(example['text'])[:max_length] # why do we have to set the max length
  length = len(tokens)
  return {'tokens': tokens, 'length': length}

In [10]:
max_length = 256
train_token = train_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer, 'max_length': max_length})
test_token = test_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer, 'max_length': max_length})

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

## Before Tokenization

The dataset contains only text and label

In [12]:
train_data

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

## After Tokenization

Notice two other keys have been added to it train data: tokens and length

In [13]:
train_token

Dataset({
    features: ['text', 'label', 'tokens', 'length'],
    num_rows: 25000
})

In [19]:
# display the first twenty text

print(train_token['text'][:20])

['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, e

In [20]:
print(train_token['label'][:20])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [23]:
print(train_token['tokens'][:20])

[['i', 'rented', 'i', 'am', 'curious-yellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it', 'when', 'it', 'was', 'first', 'released', 'in', '1967', '.', 'i', 'also', 'heard', 'that', 'at', 'first', 'it', 'was', 'seized', 'by', 'u', '.', 's', '.', 'customs', 'if', 'it', 'ever', 'tried', 'to', 'enter', 'this', 'country', ',', 'therefore', 'being', 'a', 'fan', 'of', 'films', 'considered', 'controversial', 'i', 'really', 'had', 'to', 'see', 'this', 'for', 'myself', '.', 'the', 'plot', 'is', 'centered', 'around', 'a', 'young', 'swedish', 'drama', 'student', 'named', 'lena', 'who', 'wants', 'to', 'learn', 'everything', 'she', 'can', 'about', 'life', '.', 'in', 'particular', 'she', 'wants', 'to', 'focus', 'her', 'attentions', 'to', 'making', 'some', 'sort', 'of', 'documentary', 'on', 'what', 'the', 'average', 'swede', 'thought', 'about', 'certain', 'political', 'issues', 'such', 'as', 'the', 'vietnam', 'war', 'and', 'race', 'issues',

Split the train data after the tokenization to avoid data leakage

In [24]:
train_valid_data = train_token.train_test_split(test_size=0.2)

In [25]:
train_valid_data

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'tokens', 'length'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label', 'tokens', 'length'],
        num_rows: 5000
    })
})

In [28]:
train_data = train_valid_data['train']
valid_data = train_valid_data['test']

train_data.shape, valid_data.shape

((20000, 4), (5000, 4))

Build vocabulary

In [30]:
vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data['tokens'],
    specials=['<UNK>', '<PAD>'],
    min_freq=10
)

In [34]:
vocab['<UNK>']

0

In [35]:
vocab.set_default_index(0)

## Note: Why do we need UNK and PAD?

Let's say we have a large corpus of text data. During tokenization we usually fit in all the train data. When we have a new text, if the model encounters a new word, it will assign it as UNK, which stands for unknown.

Let's take a few sample movie reviews:

- I love this movie
- Amazing
- Impressive storyline
- Terrible experience not recommended to watch

If you look at the above statements, all have different word sizes. To ensure we pass the model with the same size, we pad all the sentences to be in the same size. We set the max length to be some value if the sequence has more than the threshold it truncates the padding. If it is less than sequence, it pad and fill the sequence with zero.

## Prepare the dataset for the model

In [37]:
def convert_into_tokens(example, vocab):
  ids = [vocab[token] for token in example['tokens']]
  return {'ids': ids}

In [38]:
# this data will be used for training
train_data = train_data.map(convert_into_tokens, fn_kwargs={'vocab': vocab})
# this data will be used for evaluation
valid_data = valid_data.map(convert_into_tokens, fn_kwargs={'vocab': vocab})
# this is the data we use for generalizaiton [new unseen data for testing]
test_data = test_token.map(convert_into_tokens, fn_kwargs={'vocab': vocab})

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [45]:
print(train_data['ids'][:2])

[[12, 254, 2, 10275, 19, 187, 7, 0, 0, 9, 5626, 1158, 2087, 25, 1678, 5513, 1742, 1173, 5303, 24, 28, 274, 3, 12, 108, 11, 10, 5, 58, 56, 23, 3, 14, 18, 6550, 5, 176, 7, 64, 96, 5, 58, 348, 65, 3, 26, 31, 6523, 4, 747, 229, 4, 9903, 4, 8673, 4, 919, 0, 4, 8069, 4, 6, 579, 4, 37, 13, 34, 1612, 36, 2, 481, 154, 93, 5, 3032, 291, 3, 6, 4, 12, 246, 2, 0, 8809, 114, 8, 35, 818, 6, 213, 3, 191, 4, 2, 711, 180, 4, 3818, 2, 9903, 4, 1467, 5, 4067, 247, 4, 624, 60, 1670, 8, 1693, 403, 20, 62, 83, 48, 5, 170, 721, 3, 432, 39, 358, 159, 9903, 4, 325, 7, 154, 3, 5, 381, 7, 2, 111, 4, 13, 833, 4, 75, 4596, 818, 3, 2, 612, 676, 10, 388, 9821, 4, 21, 2, 206, 4, 2, 14466, 4, 6, 21, 773, 2, 64, 863, 3, 12, 246, 2, 229, 5, 159, 1602, 3, 12, 1816, 13, 2, 636, 891, 15, 2, 18, 2589, 2, 229, 7, 2, 1585, 4, 12, 0, 106, 22, 4, 397, 9, 27, 173, 235, 3, 3, 3, 13, 110, 417, 4, 712, 1081, 3, 5731, 42, 1051, 4, 1028, 6, 2495, 13, 14, 18, 389, 2, 307, 43, 86, 25, 7, 5, 11631, 1014, 136, 831, 8, 3465, 150, 5817, 6, 

## Look now we have ids as the next data appended

Token is the individual representation of the word and the ids is equivalent numerial value to that token.

In [46]:
train_data = train_data.with_format(type='torch', columns=['ids', 'label', 'length'])
valid_data = valid_data.with_format(type='torch', columns=['ids', 'label', 'length'])
test_data = test_data.with_format(type='torch', columns=['ids', 'label', 'length'])

Model building - LSTM

In [48]:
class LSTMmodel(nn.Module):
  def __init__(self, vocab_size,
               embedding_dim, hidden_dim,
               output_dim, n_layers,
               dropout_rate, pad_index
               ) -> None:
    super().__init__()
    # layer 1 - Pass the ids to the embedding layer
    self.embedding = nn.Embedding(
        vocab_size,
        embedding_dim,
        padding_idx=pad_index
        )
    # layer 2 - LSTM [If n_layers = 2, then layer 3 is also LSTM]
    self.lstm = nn.LSTM(
        embedding_dim,
        hidden_dim,
        n_layers,
        dropout=dropout_rate,
        batch_first=True
        )
    self.fc = nn.Linear(hidden_dim, output_dim)
    self.dropout = nn.Dropout(dropout_rate) # to avoid overfitting

  def forward(self, ids, batch_size):
    # token to embeddings
    embedded = self.dropout(self.embedding(ids))
    embdedded = nn.utils.rnn.pack_padded_sequence(
        embedded,
        batch_size,
        batch_first=True,
        enforce_sorted=False
        )
    # embedding sequence (batch_size, seq_length, emd_dim) to LSTM

    outputs, (hidden, cell) = self.lstm(embdedded)

    output, output_length = nn.utils.rnn.pad_packed_sequence(outputs)
    hidden = self.dropout(hidden[-1])

    prediction = self.fc(hidden)

    return prediction

In [49]:
vocab_size = len(vocab) # why do we need to know the length of the vocab
embedding_dim = 128
hidden_dim = 64
output_dim = len(train_data.unique('label')) # either 0 or 1 = 2(length)
n_layers = 2
dropout_rate = 0.5

model = LSTMmodel(
    vocab_size,
    embedding_dim,
    hidden_dim,
    output_dim,
    n_layers,
    dropout_rate,
    vocab['<PAD>']
    )

# switch out modeling training in Graphic Processing Unit
model = model.to(device)


In [51]:
sum(p.numel() for p in model.parameters() if p.requires_grad) # total parameters

1947778

In [54]:
def initialize_weights(m):
  if isinstance(m, nn.Linear):
    nn.init.xavier_normal_(m.weight)
    nn.init.zeros_(m.bias)
  elif isinstance(m, nn.LSTM):
    for name, param in m.named_parameters():
      if 'bias' in name:
        nn.init.zeros_(param)
      elif 'weight' in name:
        nn.init.orthogonal_(param)

In [55]:
model.apply(initialize_weights)

LSTMmodel(
  (embedding): Embedding(14568, 128, padding_idx=1)
  (lstm): LSTM(128, 64, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=64, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

## Using Pre-trained embeddings

Few of the most used Pre-trained embedding are:

- GloVe
- Word2Vec
- FastText

In [56]:
vectors = torchtext.vocab.GloVe() # extra pre-trained embedding

.vector_cache/glove.840B.300d.zip: 2.18GB [06:49, 5.31MB/s]                            
100%|█████████▉| 2196016/2196017 [04:54<00:00, 7463.75it/s]


In [57]:
pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())

In [58]:
model.embedding.weight.data = pretrained_embedding

## Compile Model

- The three parameters that influence the model are:
 - Optimizer: algorithm for gradient descent [Adam, SGD, RMSProp]
 - Loss function: Binary cross entropy loss or CrossEntropy loss
 - Evaluation performance metrics [Accuracy, Precision, Recall]

In [59]:
learning_rate = 1e-4
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss().to(device)

def metrics(prediction, actual):
  batch_size, _ = prediction.shape
  predicted_classes = prediction.argmax(dim=-1)
  correct_predictions = predicted_classes.eq(actual).sum()
  accuracy = correct_predictions / batch_size
  return accuracy

In [60]:
def collate(batch, pad_index):
  batch_ids = [i['ids'] for i in batch]
  batch_ids = nn.utils.rnn.pad_sequence(
      batch_ids,
      padding_value=pad_index,
      batch_first=True
      )
  batch_length = [i['length'] for i in batch]
  batch_length = torch.stack(batch_length)
  batch_label = [i['label'] for i in batch]
  batch_label = torch.stack(batch_label)
  batch = {
      'ids': batch_ids,
      'length': batch_length,
      'label': batch_label
  }

  return batch

## Fill the data to the model

In [61]:
batch_size = 64
collate = functools.partial(collate, pad_index=vocab['<PAD>'])

train_dataloader = torch.utils.data.DataLoader(
    train_data,
    batch_size=batch_size,
    collate_fn=collate,
    shuffle=True
)

valid_dataloader = torch.utils.data.DataLoader(
    valid_data,
    batch_size=batch_size,
    collate_fn=collate
    )

test_dataloader = torch.utils.data.DataLoader(
    test_data,
    batch_size=batch_size,
    collate_fn=collate
)

Train the model

In [62]:
def train(dataloader, model, loss_function, optimizer, device):
  model.train()

  epoch_losses = []
  epoch_accs = []

  for batch in tqdm.tqdm(dataloader, desc='training...', file=sys.stdout):
    ids = batch['ids'].to(device)

    # batch length - dataloader
    length = batch['length']
    label = batch['label'].to(device)

    # y_hat = prediction from the model
    prediction = model(ids, length)
    # loss function - > Actual value, predicted value
    # actual value - label
    # predicted value is prediction
    loss = loss_function(prediction, label) # loss

    accuracy = metrics(prediction, label)
    optimizer.zero_grad() # adam - > gradient descent

    loss.backward()
    optimizer.step() # we will update the weights with learning rate

    epoch_losses.append(loss.item())
    epoch_accs.append(accuracy.item())

  return epoch_losses, epoch_accs

## Evaluate

In [63]:
def evaluate(dataloader, model, loss_function, device):
  model.eval()
  epoch_losses = []
  epoch_accs = []

  with torch.no_grad(): # no optimization -> no update in weightds.
    for batch in tqdm.tqdm(dataloader, desc='evaluating...', file=sys.stdout):
      ids = batch['ids'].to(device)
      length = batch['length']
      label = batch['label'].to(device)
      prediction = model(ids, length)
      loss = loss_function(prediction, label)
      accuracy = metrics(prediction, label)
      epoch_losses.append(loss.item())
      epoch_accs.append(accuracy.item())

  return epoch_losses, epoch_accs