# Neural Language Modeling

In this exercise, we will use PyTorch Lightning to implement our neural LM. Your only job is to write the `forward` method of the model.



## setup

In [1]:
# # #download corpus
# !wget --no-check-certificate https://github.com/ekapolc/nlp_2019/raw/master/HW4/BEST2010.zip
# !unzip BEST2010.zip

In [2]:
!pip install lightning



## code

In [3]:
# Read the corpus: one sentence per line, words delimited by "|".
best2010 = []
total_word_count = 0
with open('BEST2010/news.txt','r',encoding='utf-8') as corpus_file:
  for raw_line in corpus_file:
    sentence = raw_line.strip()[:-1] #drop the last character (the trailing |)
    best2010.append(sentence)
    total_word_count += len(sentence.split("|"))

# First 70% of the sentences (in file order) for training, the rest for testing.
split_at = int(len(best2010)*0.7)
train, test = best2010[:split_at], best2010[split_at:]

#Training data statistics
train_word_count = sum(len(sentence.split('|')) for sentence in train)
print ('Total sentences in BEST2010 news training dataset :\t'+ str(len(train)))
print ('Total word counts in BEST2010 news training dataset :\t'+ str(train_word_count))

Total sentences in BEST2010 news training dataset :	21678
Total word counts in BEST2010 news training dataset :	1042797


Here we are going to use a library from huggingface called `tokenizers`. This will help us create a vocabulary and handle the encoding and decoding, i.e., convert text to its corresponding ID (which will be learned by the tokenizer).

In [4]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import CharDelimiterSplit
from tokenizers.trainers import WordLevelTrainer

# The corpus is already word-segmented with "|", so the tokenizer is used purely
# as a vocabulary builder. On raw text you would first segment with a compatible
# word tokenizer (e.g. newmm) and then run this step.
word_level_model = WordLevel(unk_token="[UNK]")  # out-of-vocabulary words map to [UNK]
tokenizer = Tokenizer(word_level_model)
tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter="|")  # split the input on "|"
trainer = WordLevelTrainer(
    # special tokens: unknown, begin-of-sentence, end-of-sentence
    special_tokens=["[UNK]", "<s>", "</s>"],
    # frequency threshold: words seen fewer than 3 times are left out of the vocab
    min_frequency=3,
)
tokenizer.train_from_iterator(train, trainer=trainer)

In [5]:
len(tokenizer.get_vocab()) #same as nltk

9062

In [6]:
tokenizer.encode("กฎหมาย|กับ|การ|เบียดบัง|คน|จน|asdf").tokens #tokens we get after tokenizing this sentence. unknown words will be tokenized as [UNK]

['กฎหมาย', 'กับ', 'การ', 'เบียดบัง', 'คน', 'จน', '[UNK]']

In [7]:
tokenizer.encode("กฎหมาย|กับ|การ|เบียดบัง|คน|จน|asdf").ids #this is what we will feed to the LM

[242, 28, 5, 8883, 22, 190, 0]

In [8]:
import itertools
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import lightning as L
from tqdm import tqdm

In [9]:
L.seed_everything(42, workers=True)

Seed set to 42


42

In [10]:
class TextDataset(Dataset):
  """Fixed-length windows of token ids cut from a corpus of "|"-delimited sentences.

  Every sentence gets an ``</s>`` token appended, all sentences are encoded with
  the notebook-level ``tokenizer`` and concatenated into one long id stream, and
  the stream is cut into consecutive rows of ``seq_len`` ids. Tokens left over at
  the end (fewer than ``seq_len``) are dropped.

  Args:
    data: iterable of sentences, each a string of words separated by "|".
    seq_len: number of token ids per row; must be positive.

  Raises:
    ValueError: if ``seq_len`` is not positive.
  """

  def __init__(self, data, seq_len = 128):
    if seq_len <= 0:
      raise ValueError(f"seq_len must be a positive integer, got {seq_len}")

    sentences = [d+'|</s>' for d in data] #append an </s> token to each sentence
    encodings = tokenizer.encode_batch(sentences) #turn every token into its token id
    # one long stream of ids: [sent1_ids </s> sent2_ids </s> ...]
    flat_ids = list(itertools.chain.from_iterable(enc.ids for enc in encodings))
    encoded = torch.tensor(flat_ids, dtype=torch.long)

    # Keep only whole rows. The row count is passed explicitly because
    # view(-1, seq_len) is ambiguous (RuntimeError) when no full row fits.
    num_rows = len(encoded) // seq_len
    self.encoded = encoded[:num_rows * seq_len].view(num_rows, seq_len) #shape (num_rows, seq_len)

  def __getitem__(self, idx):
    """Return row ``idx``: a 1-D tensor of ``seq_len`` token ids."""
    return self.encoded[idx]

  def __len__(self):
    """Number of complete ``seq_len``-sized rows."""
    return len(self.encoded)

In [11]:
train_batch_size, test_batch_size = 64, 128

# Encode both splits into fixed-length rows of token ids.
train_dataset = TextDataset(train)
test_dataset = TextDataset(test)

# DataLoader handles batching; only the training rows are shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=test_batch_size)

## Model : Implement the forward function here

In [12]:
class LSTM(L.LightningModule):
    """LSTM language model: embedding -> stacked LSTM -> linear layer over the vocab.

    Args:
        vocab_size: number of token ids (embedding rows and output logits).
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden state size.
        num_layers: number of stacked LSTM layers.
        dropout_rate: dropout applied to the embeddings, between LSTM layers,
            and to the LSTM output.
        learning_rate: learning rate passed to Adam.
        criterion: loss called as ``criterion(logits, targets)`` with logits of
            shape (N, vocab_size) and targets of shape (N,).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, learning_rate, criterion):

        super().__init__()

        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim) #this will turn the token ids into vectors
        # nn.LSTM applies dropout only between stacked layers and warns when it is
        # non-zero with a single layer, so only pass it when it can take effect.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                    dropout=dropout_rate if num_layers > 1 else 0.0, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, vocab_size) #project hidden states to one logit per vocab entry
        self.learning_rate = learning_rate
        self.criterion = criterion

    def forward(self, src):
        """Map token ids (batch, seq) to next-token logits (batch, seq, vocab_size)."""
        emb = self.dropout(self.embedding(src))
        lstm_out, _ = self.lstm(emb)
        lstm_out = self.dropout(lstm_out)
        out = self.fc(lstm_out)
        return out

    def _next_token_loss(self, batch):
        """Loss for predicting token t+1 from tokens up to t, for each row of ``batch``."""
        src = batch[:, :-1]   # (batch, seq_len-1): every token except the last
        target = batch[:, 1:] # (batch, seq_len-1): every token except the first
        logits = self(src)    # (batch, seq_len-1, vocab)
        # Flatten with the model's own output width rather than a notebook-global
        # vocab_size, so the module does not depend on hidden notebook state.
        logits = logits.reshape(-1, logits.size(-1)) # (batch*(seq_len-1), vocab)
        target = target.reshape(-1)                  # (batch*(seq_len-1),)
        return self.criterion(logits, target)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._next_token_loss(batch)
        self.log("train_loss", loss)
        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss for one batch, without tracking gradients."""
        with torch.no_grad(): #disable gradient calculation for faster inference
            loss = self._next_token_loss(batch)
        self.log("test_loss", loss)
        return loss

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate."""
        return optim.Adam(self.parameters(), lr=self.learning_rate)

In [13]:
# Model and optimisation hyper-parameters
vocab_size = tokenizer.get_vocab_size()  # vocabulary size reported by the trained tokenizer
embedding_dim, hidden_dim = 200, 512
num_layers, dropout_rate = 3, 0.2
lr = 1e-3

In [14]:
criterion = nn.CrossEntropyLoss()
model = LSTM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    dropout_rate=dropout_rate,
    learning_rate=lr,
    criterion=criterion,
)

In [15]:
from lightning.pytorch.loggers import CSVLogger
# Logged metrics are written as CSV files under ./log
csv_logger = CSVLogger(save_dir="log")

### Training

In [16]:
# 20 epochs, metrics to the CSV logger, deterministic algorithms requested
trainer = L.Trainer(logger=csv_logger, max_epochs=20, deterministic=True)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [17]:
trainer.fit(model, train_dataloaders=train_loader) # takes about 8 mins

You are using a CUDA device ('NVIDIA GeForce RTX 3060 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | embedding | Embedding        | 1.8 M  | train
1 | lstm      | LSTM             | 5.7 M  | train
2 | dropout   | Dropout          | 0      | train
3 | fc        | Linear           | 4.6 M  | train
4 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
12.1 M    Trainable params
0         Non-trainable params
12.1 M    Total params
48.504    Total estimated model params size (MB)
5         Modules in train mode
0         Module

Epoch 19: 100%|██████████| 130/130 [00:08<00:00, 15.09it/s, v_num=2]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 130/130 [00:08<00:00, 14.70it/s, v_num=2]


### Testing

In [18]:
test_result = trainer.test(model, dataloaders=test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/jaf/anaconda3/envs/nlp/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 39/39 [00:01<00:00, 25.40it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss            4.109495639801025
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [19]:
import numpy as np

In [20]:
# perplexity = exp(cross-entropy loss logged as test_loss)
perplexity = np.exp(test_result[0]['test_loss'])
print(f"Perplexity : {perplexity}")

Perplexity : 60.91598622150782


In [21]:
model.eval() #disable dropout

LSTM(
  (embedding): Embedding(9062, 200)
  (lstm): LSTM(200, 512, num_layers=3, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=9062, bias=True)
  (criterion): CrossEntropyLoss()
)

In [22]:
# token_to_id returns a plain int. The previous tokenizer.encode(...).ids returned
# a list, so comparing a sampled id (an int) against it was always False.
unk_token_id = tokenizer.token_to_id("[UNK]")
eos_token_id = tokenizer.token_to_id("</s>")

In [23]:
def generate_seq(context, max_new_token = 10, temperature = 1.0):
  """Sample a continuation of ``context`` from the notebook-level ``model``.

  Tokens are drawn one at a time from the model's next-token distribution.
  ``[UNK]`` is never emitted, and generation stops early when ``</s>`` is drawn.
  Put the model in eval mode first so dropout is disabled.

  Args:
    context: prompt as a "|"-delimited string of words.
    max_new_token: maximum number of tokens to append to the prompt.
    temperature: positive softmax temperature; 1.0 samples from the unmodified
      distribution, lower values make sampling greedier.

  Returns:
    The prompt plus the generated tokens, decoded by ``tokenizer.decode``.

  Raises:
    ValueError: if ``temperature`` is not positive or ``context`` encodes to
      no tokens.
  """
  if temperature <= 0:
    raise ValueError(f"temperature must be positive, got {temperature}")

  # Resolve the special tokens to plain ints. Comparing a sampled id against the
  # list returned by tokenizer.encode(...).ids is always False, which silently
  # disabled both the [UNK] filter and the </s> stop condition.
  unk_id = tokenizer.token_to_id("[UNK]")
  eos_id = tokenizer.token_to_id("</s>")

  encoded = tokenizer.encode(context).ids
  if not encoded:
    raise ValueError("context must contain at least one token")

  with torch.no_grad():
    for _ in range(max_new_token):
      src = torch.tensor([encoded], dtype=torch.long, device=model.device)
      logits = model(src)[0, -1]  # logits for the token following the sequence so far
      probs = torch.softmax(logits / temperature, dim=-1)
      if unk_id is not None:
        # Zeroing [UNK] is equivalent to resampling until a non-[UNK] token is
        # drawn; torch.multinomial does not need the weights to sum to one.
        probs[unk_id] = 0.0
      next_id = torch.multinomial(probs, num_samples=1).item()

      if next_id == eos_id:
        break

      encoded.append(next_id)

  return tokenizer.decode(encoded)

In [24]:
# Prompt in the same "|"-delimited format as the corpus; sample up to 50 new tokens.
context = "<s>|วัน|จันทร์"
generate_seq(context, max_new_token=50)

'วัน จันทร์ ที่   26   มิถุนายน   และ ได้ พบปะ   แจก ประชาสัมพันธ์ ให้ ผู้ ประกอบ การ ดูแล   ดัง นั้น   หลัง จาก ขอ ให้ ตรวจสอบ ผล ได้   ซึ่ง หาก เจ้าหน้าที่ ได้ รวบรวม หลักฐาน ให้ เสีย ชีวิต ใน ข้อ หา "   นายปรีชา เลาหพงศ์ชนะ   กล่าว   และ โดย  '

## Questions: Answer the following in MyCourseville

1. What is the perplexity of the neural LM you trained?
2. Paste your favorite sentence generated with the LM.