**Language Model**

Language modeling is the task of predicting the next word in a sequence of words.
In this exercise, we will use the IMDB dataset, preprocess it, build a vocabulary, and train a language
model.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

In [3]:
if not os.path.isfile('data/aclImdb_v1.tar.gz'):
  !wget -q -P data https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

if not os.path.isdir('data/aclImdb'):
  !tar -xzf data/aclImdb_v1.tar.gz -C data/

**1. EDA: Preprocessing & Analyze the data**

In [4]:
import os
import random
from pathlib import Path
import torch

In [5]:
config = {'data':'/content/data/aclImdb/train/unsup',
          'max_sentecne': 60,
          'max_vocab':30000,
          'Start_token': 'START_TOK',
          'End_token': 'END_TOK',
          'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
          'pos_class': '/content/data/aclImdb/train/pos',
          'neg_class': '/content/data/aclImdb/train/neg'}
ANALYZE_FLAG = False

In [6]:
!pip install nltk



In [7]:
import re
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import TreebankWordTokenizer,sent_tokenize

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [8]:
class Sample:
  def __init__(self) -> None:
    self.range = {}

  def find_sample(self, index:int):
    list_key = list(self.range.keys())
    left, right = 0, len(list_key) - 1
    while left <= right:
      mid = (left+right) // 2
      start, end = list_key[mid]
      if start <= index <= end:
        tokens = self.range[start,end]
        token_len = len(tokens)
        num_sample_in_the_range = index - start
        return ((tokens[0:num_sample_in_the_range+1], tokens[num_sample_in_the_range+1]))
        # index - start -> its the sample from the range
      elif index < start:
        right = mid-1
      else:
        left = mid+1
    raise IndexError(f"Index {index} doesn't exist")

In [9]:
class Preprocess:
  def __init__(self):
    #  self.word_to_index = {'unk':0}
    #  self.sentences = {}
    #  self.sampels = Sample()
    pass
  def clean_sentence(sentence:str):
    return re.sub(r'<br\s*/?>', '', sentence).lower()

  def show_length_histogram(sentence_lengths: list):
      max_len = max(sentence_lengths)
      bins = np.arange(0, max_len + 100, 100)
      plt.figure(figsize=(10,6))
      plt.hist(sentence_lengths, bins=bins, color='skyblue', edgecolor='black')
      plt.title("Distribution of Sentence Lengths")
      plt.xlabel("Sentence Length (in words)")
      plt.ylabel("Frequency")
      plt.xticks(bins)
      plt.grid(True)
      plt.show()

  def tokenization(sent:str):
      tokenizer = TreebankWordTokenizer()
      tokens = tokenizer.tokenize(sent)
      return tokens

  def split_dataset(directory_path, train_ratio=0.7, val_ratio=0.2, test_ratio=0.1, seed=42):
    random.seed(seed)
    directory = Path(directory_path)
    all_files = list(directory.glob("*.txt"))
    random.shuffle(all_files)

    total = len(all_files)
    train_end = int(total * train_ratio)
    val_end = train_end + int(total * val_ratio)

    train_files = all_files[:train_end]
    val_files = all_files[train_end:val_end]
    test_files = all_files[val_end:]

    print(f"Total files: {total}")
    print(f"Train: {len(train_files)} files")
    print(f"Val: {len(val_files)} files")
    print(f"Test: {len(test_files)} files")

    return train_files, val_files, test_files


  def orgenaize_data(train_set:list, sentences:dict, samples:Sample,word_to_index:dict=None):
    sentence_lengths = []
    index_of_word = 1
    index_of_sent = 0
    index_next_sample = 0
    directory = Path(config['data'])
    for file in train_set:
      if word_to_index and len(word_to_index) == config['max_vocab']:
        break
      with file.open("r", encoding="utf-8") as f:
          text = f.read()
          text = Preprocess.clean_sentence(text)
          for sentence in sent_tokenize(text):
            if len(sentence) <= config['max_sentecne']:
              sentence = config['Start_token'] + ' ' + sentence + ' ' + config['End_token']
              tokens = Preprocess.tokenization(sentence)
              sentences[index_of_sent] = sentence
              samples.range[(index_next_sample,index_next_sample+len(tokens)-2)] = tokens
              index_next_sample+=len(tokens)-1
              index_of_sent+=1
              if word_to_index:
                for word in tokens:
                  if word not in word_to_index:
                    word_to_index[word] = index_of_word
                    index_of_word+=1
          sentence_lengths.append(len(text.split()))

    if ANALYZE_FLAG:
      Preprocess.show_length_histogram(sentence_lengths)



In [10]:
vocab = {'unk':0}
sentences = {}
sample_train, sample_val, sample_test = Sample(), Sample(), Sample()
train_set, val_set, test_set = Preprocess.split_dataset(config['data'])

Total files: 50000
Train: 35000 files
Val: 10000 files
Test: 5000 files


In [11]:
Preprocess.orgenaize_data(train_set,sentences,sample_train,vocab)
Preprocess.orgenaize_data(val_set,sentences,sample_val)
Preprocess.orgenaize_data(test_set,sentences,sample_test)

**build samples**

**Dataset**

In [12]:
import torch

In [13]:
from torch.utils.data import Dataset
class IMDBDataset(Dataset):
  def __init__(self, samples:Sample) -> None:
    super().__init__()
    self.sample = samples
  def __len__(self):
    list_key = list(self.sample.range.keys())
    return list_key[-1][1] - list_key[0][0] + 1
  def __getitem__(self, index):
    tokens_sample = self.sample.find_sample(index)
    x = tokens_sample[0]
    y = vocab.get(tokens_sample[1],0)
    list_x = []
    for word in x:
      index_word = vocab.get(word,0)
      list_x.append(index_word)
    return (torch.tensor(list_x),torch.tensor(y))


In [14]:
training_data = IMDBDataset(sample_train)
val_data = IMDBDataset(sample_val)
test_data = IMDBDataset(sample_test)

**Language Model**

In [15]:
from torch.nn.utils.rnn import pack_sequence

In [16]:
# each sample have differenet length (seqeunces = sentences with different length - so need padding)
def collate_fn(batch):
    # pack_sequence - returns a PackedSequence object
    # for 3 tensors:
    # x0 = [1,4,6,7]
    # x1 = [2,5]
    # x2 = [3,8,9]
    # the result will be: [1,2,3,4,5,8,6,9,7] sort=(1,2,0), size=(4,2,3)
    # when the items are sorted by size
    # print(batch)
    # print([x for x, _ in batch])
    x = pack_sequence([item[0] for item in batch],enforce_sorted=False)
    y = torch.tensor([item[1] for item in batch],dtype=torch.long)
    return x,y


In [17]:
import torch.nn as nn
from torch.nn.utils.rnn import pad_packed_sequence

In [18]:
from typing import OrderedDict
class LanguageModel(nn.Module):
  def __init__(self, *args, **kwargs) -> None:
    super().__init__(*args, **kwargs)
    print(len(vocab))
    self.embedding = nn.Embedding(39993, 64)
    self.lstm = nn.LSTM(64, 120,batch_first=True)
    self.linear1 = nn.Linear(120, 120)
    self.relu = nn.ReLU()
    self.linear2 = nn.Linear(120,39993)

  def forward(self,x):
    unpack,_ = pad_packed_sequence(x,batch_first=True)
    embeds = self.embedding(unpack)
    pack = pack_sequence(embeds)
    packed_out_lstm, _ = self.lstm(pack)
    out_lstm, _ = pad_packed_sequence(packed_out_lstm, batch_first=True)
    out_lstm = out_lstm[:,-1,:]
    linear_out1 = self.linear1(out_lstm)
    relu = self.relu(linear_out1)
    linear_out2 = self.linear2(relu)
    return linear_out2



In [None]:
model = LanguageModel()
model = model.to(config['device'])

40182


In [None]:
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)

In [None]:
train_loader = torch.utils.data.DataLoader(training_data, batch_size=32, shuffle=True,collate_fn=collate_fn)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=32, shuffle=False,collate_fn=collate_fn)

**Train loop**

In [28]:
import tqdm

In [None]:
for epoch in range(10):
    model.train()
    total_loss = 0
    for batch in tqdm.tqdm(train_loader):
      optimizer.zero_grad()
      x,y = batch
      x = x.to(config['device'])
      y = y.to(config['device'])
      output = model(x)
      loss = loss_fn(output.view(-1, len(vocab)),y.view(-1))
      loss.backward()
      optimizer.step()
      total_loss += loss.item()
      del x
      del y
    print("train epoch done")
    model.eval()
    val_loss = 0
    #val the model
    with torch.no_grad():
        y_true = []
        y_pred = []
        correct = 0
        total = 0
        for val_batch in tqdm.tqdm(val_loader):
            x, y = val_batch
            x=x.to(config['device'])
            y=y.to(config['device'])
            output = model(x)
            predicted_logits = output.view(-1, len(vocab))
            predicted_classes = predicted_logits.argmax(dim=1)
            labels = y.view(-1)
            loss = loss_fn(predicted_logits, labels)
            val_loss += loss.item()
            total += labels.size(0)
            correct += (predicted_classes == labels).sum()
            y_true += labels.tolist()
            y_pred += predicted_classes.tolist()
            del x
            del y

        avg_val_loss = val_loss / len(val_loader)
        print(f'Validation Loss: {avg_val_loss:.4f}')
        print('Accuracy: {} %'.format(100 * correct / total))
    print(f'Epoch {epoch+1} --> Loss: {total_loss / len(train_loader):.4f}')

100%|██████████| 27079/27079 [15:11<00:00, 29.70it/s]


train epoch done


100%|██████████| 7805/7805 [01:05<00:00, 118.96it/s]


Validation Loss: 5.5362
Accuracy: 18.40452003479004 %
Epoch 1 --> Loss: 5.8599


100%|██████████| 27079/27079 [15:13<00:00, 29.64it/s]


train epoch done


100%|██████████| 7805/7805 [01:08<00:00, 114.15it/s]


Validation Loss: 5.3415
Accuracy: 20.701282501220703 %
Epoch 2 --> Loss: 5.3553


100%|██████████| 27079/27079 [14:28<00:00, 31.19it/s]


train epoch done


100%|██████████| 7805/7805 [01:08<00:00, 114.72it/s]


Validation Loss: 5.2366
Accuracy: 21.206201553344727 %
Epoch 3 --> Loss: 5.1785


100%|██████████| 27079/27079 [14:20<00:00, 31.48it/s]


train epoch done


100%|██████████| 7805/7805 [01:08<00:00, 114.22it/s]


Validation Loss: 5.1651
Accuracy: 21.873287200927734 %
Epoch 4 --> Loss: 5.0614


100%|██████████| 27079/27079 [14:29<00:00, 31.15it/s]


train epoch done


100%|██████████| 7805/7805 [01:09<00:00, 112.69it/s]


Validation Loss: 5.1049
Accuracy: 22.81705665588379 %
Epoch 5 --> Loss: 4.9713


100%|██████████| 27079/27079 [14:41<00:00, 30.71it/s]


train epoch done


100%|██████████| 7805/7805 [01:08<00:00, 113.17it/s]


Validation Loss: 5.0423
Accuracy: 23.237089157104492 %
Epoch 6 --> Loss: 4.8961


100%|██████████| 27079/27079 [14:33<00:00, 31.01it/s]


train epoch done


100%|██████████| 7805/7805 [01:09<00:00, 112.65it/s]


Validation Loss: 5.0276
Accuracy: 23.536195755004883 %
Epoch 7 --> Loss: 4.8315


100%|██████████| 27079/27079 [14:42<00:00, 30.67it/s]


train epoch done


100%|██████████| 7805/7805 [01:08<00:00, 114.19it/s]


Validation Loss: 5.0016
Accuracy: 23.758024215698242 %
Epoch 8 --> Loss: 4.7748


100%|██████████| 27079/27079 [14:33<00:00, 31.01it/s]


train epoch done


100%|██████████| 7805/7805 [01:09<00:00, 112.69it/s]


Validation Loss: 4.9601
Accuracy: 24.216495513916016 %
Epoch 9 --> Loss: 4.7267


100%|██████████| 27079/27079 [14:27<00:00, 31.21it/s]


train epoch done


100%|██████████| 7805/7805 [01:08<00:00, 114.11it/s]

Validation Loss: 4.9297
Accuracy: 24.795089721679688 %
Epoch 10 --> Loss: 4.6838





In [None]:
torch.save(model.state_dict(), 'model_weights.pth')

In [20]:
model = LanguageModel()
model.load_state_dict(torch.load('/content/drive/MyDrive/pytorch_assignment_model/model_weights.pth', map_location=config['device']))
model.eval()

40001


LanguageModel(
  (embedding): Embedding(39993, 64)
  (lstm): LSTM(64, 120, batch_first=True)
  (linear1): Linear(in_features=120, out_features=120, bias=True)
  (relu): ReLU()
  (linear2): Linear(in_features=120, out_features=39993, bias=True)
)

In [21]:
test_loader = torch.utils.data.DataLoader(test_data, batch_size=32, shuffle=False,collate_fn=collate_fn)

In [None]:
import math
total_loss = 0
total_tokens = 0
criterion = nn.CrossEntropyLoss(reduction='sum')
with torch.no_grad():
    for test_batch in tqdm.tqdm(test_loader):
        x, y = test_batch
        x = x.to(config['device'])
        y = y.to(config['device'])

        output = model(x)
        labels = y.view(-1)

        loss = criterion(output, labels)
        total_loss += loss.item()
        total_tokens += labels.size(0)

avg_loss = total_loss / total_tokens
perplexity = math.exp(avg_loss)

print(f"Test Perplexity: {perplexity:.2f}")