In [10]:
# Imports
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchtext.vocab as vocab
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import GloVe
from tqdm import tqdm

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Device: ', device)

Device:  cpu


In [2]:
# Download the dataset using HuggingFace load_dataset
# (subsequent runs are served from the local HuggingFace cache, as the cell output shows).
# The result is indexed by split name ('train', 'validation', 'test') further down.
ptb = load_dataset('ptb_text_only')

Found cached dataset ptb_text_only (/Users/kushagraseth/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Dataset Split
print('Dataset Split:', ptb)

# Pull the 'sentence' column out of each split, in train / validation / test order
train_data, val_data, test_data = (
    ptb[split_name]['sentence']
    for split_name in ('train', 'validation', 'test')
)

Dataset Split: DatasetDict({
    train: Dataset({
        features: ['sentence'],
        num_rows: 42068
    })
    test: Dataset({
        features: ['sentence'],
        num_rows: 3761
    })
    validation: Dataset({
        features: ['sentence'],
        num_rows: 3370
    })
})


In [5]:
# Tokenize
def tokenize(text):
    """Split ``text`` on runs of whitespace and return the pieces as a new list."""
    tokens = text.split()
    return tokens

# Load Words
def load_words(data):
    """Flatten an iterable of sentences into a single list of tokens.

    Every sentence is prefixed with the ``<start>`` marker and split with
    ``tokenize``; the per-sentence token lists are concatenated in order.

    Args:
        data: Iterable of sentence strings.

    Returns:
        One flat list containing the tokens of all sentences.
    """
    corpus_tokens = []
    for sentence in data:
        # extend() grows the list in place. The earlier sum(list_of_lists, [])
        # rebuilt the whole accumulated list for every sentence (quadratic time).
        corpus_tokens.extend(tokenize('<start> ' + sentence))
    return corpus_tokens

In [6]:
# List of Tokenized Words in the Corpus
# (training split only; each sentence contributes a leading '<start>' token)
words = load_words(train_data)

# Vocab: List of Unique Words
# Counter keyed by token, so len(vocab) is the number of distinct training tokens.
# NOTE(review): this assignment rebinds `vocab`, which the imports cell bound to the
# `torchtext.vocab` module; from here on `vocab` is the Counter, not the module.
vocab = Counter(words)
VOCAB_LEN = len(vocab)
print('Vocab Length: ', VOCAB_LEN)

Vocab Length:  10000


In [7]:
# Dictionary for Vocab
# word -> integer id, numbered in the order the words come out of iterating `vocab`
word2idx = dict((term, idx) for idx, term in enumerate(vocab))

# integer id -> word (inverse of word2idx)
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

In [11]:
# Load GloVe Embeddings
GLOVE_DIM = 300
# Use the `GloVe` class imported by name: in a top-to-bottom run the name `vocab`
# has already been rebound to the word Counter, so `vocab.GloVe` would raise
# AttributeError on a fresh kernel.
glove = GloVe(name = '840B', dim = GLOVE_DIM)

print('Loaded {} words'.format(len(glove.itos)))

# Get Embedding for given word
def get_word_embedding(word):
    """Return the entry of ``glove.vectors`` stored at position ``glove.stoi[word]``.

    A word that is absent from ``glove.stoi`` propagates the lookup error
    raised by that mapping.
    """
    row = glove.stoi[word]
    return glove.vectors[row]

Loaded 2196017 words


In [12]:
# Create Embedding Matrix for Vocab
# Row i of the matrix is the vector for the word whose id is i in word2idx.
start_tensor = torch.zeros(1, GLOVE_DIM) # Word Embedding for <start>
# Draw the shared out-of-GloVe vector from a dedicated, seeded generator so the
# matrix is identical on every run (this cell executes before any global seed is set)
# and the global RNG stream is left untouched.
unk_tensor = torch.rand(1, GLOVE_DIM,
                        generator = torch.Generator().manual_seed(32)) # Word Embedding for <unk>

embeddings = []
for word in word2idx:
    if word in glove.stoi:
        embeddings.append(get_word_embedding(word))
    elif word == '<start>':
        embeddings.append(start_tensor)
    else:
        # Every other word missing from GloVe shares the same random vector
        embeddings.append(unk_tensor)

# Flatten each entry to 1-D (the <start>/<unk> tensors carry a leading axis of size 1)
# and stack once. This replaces the tensor -> numpy -> Python list -> tensor round-trip,
# which materialised every value as a Python float in a scratch `temp_list`.
embeddings_tensor = torch.stack([emb.reshape(-1) for emb in embeddings])

In [13]:
# LangModel Class for DataLoader
class LangModelDataset(Dataset):
    """Sliding-window next-token dataset over a corpus of sentences.

    Every sentence is prefixed with ``<start>``, split on whitespace and mapped
    to integer ids; the corpus is then treated as one long stream of ids.
    Item ``idx`` is the pair ``(x, y)`` where ``x`` holds ``sequence_length``
    ids starting at position ``idx`` and ``y`` is the same window shifted one
    position to the right.

    Args:
        data: Iterable of sentence strings.
        sequence_length: Number of tokens per window (default 30).
        token_to_idx: Optional word -> id mapping. When omitted, the
            notebook-level ``word2idx`` is used.
    """

    def __init__(self, 
                 data: list,
                 sequence_length: int = 30,
                 token_to_idx = None):
        self.data = data
        self.sequence_length = sequence_length
        mapping = word2idx if token_to_idx is None else token_to_idx
        # Out-of-vocabulary words map to the id of '<unk>'. The previous hard-coded
        # id 1 is simply whichever word happened to be second in the vocabulary;
        # it is kept only as a fallback for mappings that have no '<unk>' entry.
        unk_id = mapping.get('<unk>', 1)
        self.words = self.load_words()
        # List of Word ID's in the Corpus
        self.token_list = [mapping.get(word, unk_id) for word in self.words]

    def __len__(self):
        # Each window needs sequence_length inputs plus one extra token for the
        # shifted target; clamp at 0 because len() must not return a negative value.
        return max(0, len(self.token_list) - self.sequence_length)

    # List of Tokenized Words in the Corpus
    def load_words(self):
        """Return all tokens of ``self.data`` as one flat list, each sentence led by ``<start>``."""
        corpus_tokens = []
        for sentence in self.data:
            # extend() avoids the quadratic copying of sum(list_of_lists, [])
            corpus_tokens.extend(self.tokenize('<start> ' + sentence))
        return corpus_tokens

    def __getitem__(self, 
                    idx: int):
        """Return the ``(input, target)`` id tensors for window ``idx``.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``. Without
                this, plain iteration over the dataset would never terminate.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError('window index out of range')
        stop = idx + self.sequence_length
        x = torch.tensor(self.token_list[idx : stop])
        y = torch.tensor(self.token_list[idx + 1 : stop + 1])
        return x, y

    # Tokenize
    def tokenize(self, 
                 text: str):
        """Split ``text`` on whitespace."""
        return text.split()

In [79]:
# Language Model Object for DataLoader
# One sliding-window dataset per split, built in train / validation / test order
train_ds, val_ds, test_ds = (
    LangModelDataset(split_sentences)
    for split_sentences in (train_data, val_data, test_data)
)

# Peek at a single (input, target) pair from the training set
print(train_ds[50])

(tensor([44, 45, 46,  0, 47, 26, 27, 28, 29, 48, 49, 41, 42, 50, 51, 52, 53, 54,
        55, 35, 36, 37, 42, 56, 57, 58, 59,  0, 35, 60]), tensor([45, 46,  0, 47, 26, 27, 28, 29, 48, 49, 41, 42, 50, 51, 52, 53, 54, 55,
        35, 36, 37, 42, 56, 57, 58, 59,  0, 35, 60, 42]))


In [59]:
# Model Hyper-parameters
# Number of windows per batch for the training and validation loaders
BATCH_SIZE = 64
# LSTM input width; tied to the GloVe vector size so the pre-trained matrix fits
EMBEDDING_DIM = GLOVE_DIM
# Width of the LSTM hidden state and of the first dense layer
HIDDEN_DIM = 256
# One output score per vocabulary word
OUTPUT_DIM = VOCAB_LEN
# Number of stacked LSTM layers
NUM_LAYERS = 2
# False -> the LSTM reads the sequence in one direction only
BIDIRECTION = False
# Dropout probability, passed both to nn.LSTM and to the model's nn.Dropout layer
DROPOUT = 0.2
# Step size handed to the AdamW optimizer
LEARNING_RATE = 0.01

In [82]:
# PyTorch Data Loaders
# Training and validation windows are reshuffled on every pass
train_loader = DataLoader(train_ds, batch_size = BATCH_SIZE, shuffle = True)
val_loader = DataLoader(val_ds, batch_size = BATCH_SIZE, shuffle = True)

# Test windows are served one at a time, in corpus order
test_loader = DataLoader(test_ds, batch_size = 1, shuffle = False)

# Print Training Batch
iterator = iter(train_loader)
inputs = next(iterator)
print(inputs)

[tensor([[  32, 1649, 4260,  ...,   46,   99,   98],
        [1367, 3465,    0,  ..., 3748,   32,  127],
        [   0,   32, 6710,  ...,   27,   48,   27],
        ...,
        [   0,  315,  307,  ..., 3269,  160, 7670],
        [ 938,    0, 4550,  ..., 4386,  467,  133],
        [ 315,  374,  874,  ..., 1295,   35, 3854]]), tensor([[1649, 4260, 2668,  ...,   99,   98, 1868],
        [3465,    0,  108,  ...,   32,  127,   40],
        [  32, 6710,  203,  ...,   48,   27,  152],
        ...,
        [ 315,  307, 3257,  ...,  160, 7670, 7671],
        [   0, 4550,   69,  ...,  467,  133,  247],
        [ 374,  874,  251,  ...,   35, 3854,  181]])]


In [61]:
# Bi-LSTM Model
class LSTM(nn.Module):
    """LSTM language model on top of pre-trained word embeddings.

    Pipeline: embedding lookup -> dropout -> (stacked) LSTM -> dense ->
    dropout -> dense. With ``batch_first = True`` the output keeps the
    ``(batch, sequence, output_dim)`` layout, i.e. one score vector per
    input position.

    Args:
        embedding_dim: Width of the embedding vectors / LSTM input size.
        hidden_dim: Width of the LSTM hidden state and of the first dense layer.
        output_dim: Number of output scores per position.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability for the LSTM (between layers) and for
            the standalone dropout layer.
        pretrained_embeddings: Optional 2-D float tensor of shape
            ``(num_words, embedding_dim)``. When omitted, the notebook-level
            ``embeddings_tensor`` is used.

    Raises:
        ValueError: If the embedding matrix width differs from ``embedding_dim``.
    """

    def __init__(self, 
                 embedding_dim, 
                 hidden_dim, 
                 output_dim, 
                 n_layers, 
                 bidirectional, 
                 dropout,
                 pretrained_embeddings = None):
        super().__init__()
        if pretrained_embeddings is None:
            pretrained_embeddings = embeddings_tensor
        # Fail here with a clear message instead of a size error deep inside forward()
        if pretrained_embeddings.shape[1] != embedding_dim:
            raise ValueError(
                'embedding matrix has width {} but embedding_dim is {}'.format(
                    pretrained_embeddings.shape[1], embedding_dim))
        # Initialize Embedding Layer with Pre-Trained Embeddings (Vector Sequences)
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings)
        # Initialize LSTM layer to process the vector sequences.
        # nn.LSTM applies dropout only between stacked layers and warns when it
        # is non-zero for a single layer, so pass 0 in that case.
        self.lstm = nn.LSTM(embedding_dim, 
                            hidden_dim,
                            num_layers = n_layers,
                            bidirectional = bidirectional,
                            dropout = dropout if n_layers > 1 else 0.0,
                            batch_first = True)
        num_directions = 2 if bidirectional else 1
        # Initialize Dense layers to predict
        self.fc1 = nn.Linear(hidden_dim * num_directions, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # Initialize dropout to improve with regularization
        self.dropout = nn.Dropout(dropout)

    def forward(self, 
                x):
        """Map a batch of word-id sequences to per-position output scores."""
        # Embedding Layer, followed by dropout before the LSTM Layer
        embedded = self.dropout(self.embedding(x))
        # LSTM Layer (the final hidden / cell states are not needed here)
        output, _ = self.lstm(embedded)
        # 1st Fully Connected Layer, then dropout before the output layer
        output = self.dropout(self.fc1(output))
        # 2nd Fully Connected Layer
        return self.fc2(output)

In [62]:
# Fix the global RNG seed before the layers are created
torch.manual_seed(32)

# Build the network from the hyper-parameter constants, then move it to the selected device
model = LSTM(embedding_dim = EMBEDDING_DIM,
             hidden_dim = HIDDEN_DIM,
             output_dim = OUTPUT_DIM,
             n_layers = NUM_LAYERS,
             bidirectional = BIDIRECTION,
             dropout = DROPOUT)
model = model.to(device)

print('LSTM Model: ', model)

LSTM Model:  LSTM(
  (embedding): Embedding(10000, 300)
  (lstm): LSTM(300, 256, num_layers=2, batch_first=True, dropout=0.2)
  (fc1): Linear(in_features=256, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=10000, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [75]:
# Model Train Function
def train(loader, 
          model, 
          optimizer, 
          loss_fn):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        loader: Iterable yielding ``(x, y)`` tensor pairs.
        model: Module called as ``model(x)``; its output's last axis holds the
            per-class scores.
        optimizer: Optimizer bound to ``model``'s parameters.
        loss_fn: Callable taking ``(scores_2d, targets_1d)`` and returning a
            scalar tensor.

    Returns:
        Mean of the per-batch loss values, rounded to 4 decimal places.
    """
    model.train()
    # Batches must sit on the same device as the weights; otherwise a model that
    # was moved to the GPU fails on the first forward pass.
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device
    losses = []
    pbar = tqdm(loader)
    for x, y in pbar:
        if target_device is not None:
            x, y = x.to(target_device), y.to(target_device)
        optimizer.zero_grad()
        
        # Calculate y_pred
        y_pred = model(x)
        
        # Convert y_pred to 2D Tensor (one row of scores per token position)
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        # Convert y to 1D Tensor
        y = torch.flatten(y)
        
        # Loss
        loss = loss_fn(y_pred, y)
        pbar.set_postfix({'Loss': loss.item()})
        losses.append(loss.item())
        
        # Calculate gradients for w/b
        loss.backward()  
        # Update weights according to optimizer rules
        optimizer.step()          
    return round((sum(losses) / len(losses)), 4)

# Model Evaluate Function
def evaluate(loader, 
             model, 
             loss_fn):
    """Compute the mean batch loss of ``model`` over ``loader`` without training.

    Args:
        loader: Iterable yielding ``(x, y)`` tensor pairs.
        model: Module called as ``model(x)``; its output's last axis holds the
            per-class scores.
        loss_fn: Callable taking ``(scores_2d, targets_1d)`` and returning a
            scalar tensor.

    Returns:
        Mean of the per-batch loss values, rounded to 4 decimal places.
    """
    model.eval()
    # Keep batches on the same device as the weights
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device
    losses = []
    pbar = tqdm(loader)
    # No gradients are needed for evaluation; skipping autograd bookkeeping
    # saves memory and time.
    with torch.no_grad():
        for x, y in pbar:
            if target_device is not None:
                x, y = x.to(target_device), y.to(target_device)

            # Calculate y_pred
            y_pred = model(x)

            # Convert y_pred to 2D Tensor (one row of scores per token position)
            y_pred = y_pred.reshape(-1, y_pred.shape[-1])
            # Convert y to 1D Tensor
            y = torch.flatten(y)

            # Loss
            loss = loss_fn(y_pred, y)
            pbar.set_postfix({'Loss': loss.item()})
            losses.append(loss.item())
    
    return round((sum(losses) / len(losses)), 4)

In [76]:
# Model Training on Train dataset and Evaluation on Validation dataset
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss().to(device)

train_loss_list = []
val_loss_list = []

n_epochs = 1
PATH = 'best-model.pt'
# Lowest validation loss seen so far; None until the first epoch finishes
best_val_loss = None

for epoch in range(n_epochs):
    # Model Training
    train_loss = train(train_loader, 
                       model, 
                       optimizer, 
                       loss_fn)
    train_loss_list.append(train_loss)
    # Train Perplexity
    train_ppl = torch.exp(torch.tensor(train_loss))
    
    # Model Evaluation
    val_loss = evaluate(val_loader, 
                        model, 
                        loss_fn)
    val_loss_list.append(val_loss)
    # Val Perplexity
    val_ppl = torch.exp(torch.tensor(val_loss))
    
    print("Epoch {0} --> Train Loss: {1} | Train PPL: {2} | Val Loss: {3} | Val PPL: {4}".format(epoch + 1, train_loss, train_ppl, val_loss, val_ppl))
    
    # Save model only when validation loss improves, so the checkpoint really is
    # the best epoch rather than the most recent one. The first epoch always
    # saves, which guarantees the file exists for the loading cell.
    if best_val_loss is None or val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), PATH)

100%|██████████████████████████| 14525/14525 [33:26<00:00,  7.24it/s, Loss=4.17]
100%|████████████████████████████| 1153/1153 [01:04<00:00, 17.76it/s, Loss=4.82]


Epoch 1 --> Train Loss: 4.0741 | Train PPL: 58.79753875732422 | Val Loss: 4.8028 | Val PPL: 121.85114288330078


In [77]:
# Load the saved model
saved_model = LSTM(EMBEDDING_DIM, 
                   HIDDEN_DIM, 
                   OUTPUT_DIM, 
                   NUM_LAYERS, 
                   BIDIRECTION, 
                   DROPOUT).to(device)

# map_location remaps the checkpoint's tensors onto the current device, so weights
# saved from a GPU run still load on a CPU-only machine (and vice versa).
saved_model.load_state_dict(torch.load(PATH, map_location = device))
# Switch to inference mode (disables dropout); as the cell's last expression this
# also displays the module summary.
saved_model.eval()

LSTM(
  (embedding): Embedding(10000, 300)
  (lstm): LSTM(300, 256, num_layers=2, batch_first=True, dropout=0.2)
  (fc1): Linear(in_features=256, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=10000, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [86]:
# Model Predict Function
def predict(loader, 
            model, 
            loss_fn):
    """Compute the mean batch loss of ``model`` over ``loader`` in inference mode.

    Args:
        loader: Iterable yielding ``(x, y)`` tensor pairs.
        model: Module called as ``model(x)``; its output's last axis holds the
            per-class scores.
        loss_fn: Callable taking ``(scores_2d, targets_1d)`` and returning a
            scalar tensor.

    Returns:
        Mean of the per-batch loss values, rounded to 4 decimal places.
    """
    model.eval()
    # Keep batches on the same device as the weights
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device
    losses = []
    pbar = tqdm(loader)
    with torch.no_grad():
        for x, y in pbar:
            if target_device is not None:
                x, y = x.to(target_device), y.to(target_device)

            # Calculate y_pred; call the module itself rather than .forward()
            # so that any registered hooks still run
            y_pred = model(x)

            # Convert y_pred to 2D Tensor (one row of scores per token position)
            y_pred = y_pred.reshape(-1, y_pred.shape[-1])
            # Convert y to 1D Tensor
            y = torch.flatten(y)
            
            # Loss
            loss = loss_fn(y_pred, y)
            pbar.set_postfix({'Loss': loss.item()})
            losses.append(loss.item())
    
    return round((sum(losses) / len(losses)), 4)

# Model Predict
# Mean loss of the reloaded checkpoint on the held-out test windows
predict_loss = predict(test_loader, saved_model, loss_fn)

# Predict Perplexity
# Perplexity is the exponential of the mean loss
predict_loss_tensor = torch.tensor(predict_loss)
predict_ppl = torch.exp(predict_loss_tensor)

print("Predict Loss: {0} | Predict PPL: {1}".format(predict_loss, predict_ppl))

100%|█████████████████████████| 82400/82400 [06:05<00:00, 225.54it/s, Loss=4.83]

Predict Loss: 4.7386 | Predict PPL: 114.27407836914062



