In [10]:
# Import necessary modules
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchtext.vocab as vocab
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import GloVe
from tqdm import tqdm

# Pick the compute device: use the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Device: ', device)

Device:  cpu


In [2]:
# Load the 'ptb_text_only' dataset through HuggingFace `load_dataset`.
# `ptb` is indexed by split name ('train', 'validation', 'test') in the next cell.
ptb = load_dataset('ptb_text_only')

Found cached dataset ptb_text_only (/Users/kushagraseth/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Show which splits the dataset provides and how many rows each one has
print('Dataset Split:', ptb)

# Pull the 'sentence' column out of each split: train, validation, test (in that order)
train_data, val_data, test_data = (
    ptb[split_name]['sentence']
    for split_name in ('train', 'validation', 'test')
)

Dataset Split: DatasetDict({
    train: Dataset({
        features: ['sentence'],
        num_rows: 42068
    })
    test: Dataset({
        features: ['sentence'],
        num_rows: 3761
    })
    validation: Dataset({
        features: ['sentence'],
        num_rows: 3370
    })
})


In [5]:
# Tokenize
def tokenize(text):
    """Split ``text`` on runs of whitespace and return the tokens as a list."""
    return text.split()

# Load Words
def load_words(data):
    """Flatten a collection of sentences into one list of tokens.

    Each sentence is prefixed with the '<start>' marker and tokenized with
    ``tokenize``; the tokens of all sentences are concatenated in order.

    Args:
        data: iterable of sentence strings.

    Returns:
        A single flat list of token strings.
    """
    tokens = []
    for sentence in data:
        # extend() appends in place; the previous sum(list_of_lists, []) copied the
        # growing result for every sentence, which is quadratic in corpus size.
        tokens.extend(tokenize('<start> ' + sentence))
    return tokens

In [6]:
# Flat list of every token in the training split (each sentence prefixed with '<start>')
words = load_words(train_data)

# Vocab: Counter mapping each unique word to its number of occurrences in `words`.
# NOTE(review): this rebinds the name `vocab`, which the import cell bound to the
# `torchtext.vocab` module; after this cell runs, `vocab` no longer refers to that module.
vocab = Counter(words)

# Number of unique words; also used as the model's output dimension (OUTPUT_DIM)
VOCAB_LEN = len(vocab)
print('Vocab Length: ', VOCAB_LEN)

Vocab Length:  10000


In [7]:
# Word -> integer id, numbered in the order the words are iterated from `vocab`
word2idx = dict(zip(vocab, range(len(vocab))))

# Integer id -> word (the inverse lookup of `word2idx`)
idx2word = dict(enumerate(vocab))

In [119]:
# Load GloVe Embeddings
GLOVE_DIM = 300
# Use the `GloVe` class imported at the top of the notebook instead of `vocab.GloVe`:
# an earlier cell rebinds `vocab` to the corpus Counter, so going through the module
# alias raises AttributeError when the notebook is run top to bottom on a fresh kernel.
glove = GloVe(name = '840B', dim = GLOVE_DIM)

print('Loaded {} words present in GloVe'.format(len(glove.itos)))

# Get Embedding for given word
def get_word_embedding(word):
    """Return the row of ``glove.vectors`` that ``glove.stoi`` assigns to ``word``.

    The word must be present in ``glove.stoi``; a missing word fails at that lookup.
    """
    row = glove.stoi[word]
    return glove.vectors[row]

Loaded 2196017 words present in GloVe


In [12]:
start_tensor = torch.zeros(1, GLOVE_DIM) # Word Embedding Tensor for <start>
# Draw the shared <unk> vector from a dedicated, seeded generator. The notebook only
# seeds the global RNG later (just before the model is built), so without this the
# <unk> row of the frozen embedding matrix would differ on every run.
unk_generator = torch.Generator().manual_seed(32)
unk_tensor = torch.rand(1, GLOVE_DIM, generator = unk_generator) # Word Embedding Tensor for <unk>

# Create Embedding Matrix for Vocab: one GLOVE_DIM-sized row per word, in `word2idx` order
embeddings = []
for word in word2idx:
    if word in glove.stoi: # If word present in GloVe
        embeddings.append(get_word_embedding(word))
    elif word == '<start>': # If word is <start>
        embeddings.append(start_tensor.squeeze(0))
    else: # If word is <unk> or not present in GloVe
        embeddings.append(unk_tensor.squeeze(0))

# Tensor of Word Embeddings for each word in vocab, shape [len(word2idx), GLOVE_DIM].
# torch.stack joins the rows directly instead of converting every vector
# tensor -> numpy -> Python list -> tensor.
embeddings_tensor = torch.stack(embeddings)

In [13]:
# LangModel Class for DataLoader
class LangModelDataset(Dataset):
    """Sliding-window next-word-prediction dataset over a list of sentences.

    All sentences are prefixed with '<start>', tokenized on whitespace and
    concatenated into one token stream, which is then mapped to integer ids.
    Item ``idx`` is the pair ``(x, y)`` where ``x`` holds ``sequence_length``
    consecutive ids starting at ``idx`` and ``y`` is the same window shifted
    one position to the right.

    Args:
        data: list of sentence strings.
        sequence_length: number of tokens per window (default 30, the value
            that used to be hard-coded).
        vocab_index: optional word -> id mapping. When omitted, the
            notebook-level ``word2idx`` dictionary is used.
    """

    # Constructor
    def __init__(self,
                 data: list,
                 sequence_length: int = 30,
                 vocab_index: dict = None):
        self.data = data
        self.sequence_length = sequence_length
        self.words = self.load_words()
        lookup = word2idx if vocab_index is None else vocab_index
        # Out-of-vocabulary words map to the id of '<unk>'. The previous hard-coded
        # id 1 is simply whichever word was numbered second, not the unknown token.
        # 1 is kept only as a fallback for a mapping that has no '<unk>' entry.
        unk_id = lookup.get('<unk>', 1)
        # List of token ids for the whole corpus
        self.token_list = [lookup.get(word, unk_id) for word in self.words]

    # Length of Number of Sequences for a Dataset split
    def __len__(self):
        # Clamp at 0: a corpus shorter than one window has no items, and a
        # negative value would make len() raise ValueError.
        return max(0, len(self.token_list) - self.sequence_length)

    # List of Tokenized Words in the Corpus
    def load_words(self):
        """Return the flat list of tokens of every sentence, each prefixed with '<start>'."""
        tokens = []
        for sentence in self.data:
            # extend() is linear; sum(list_of_lists, []) re-copies the result each step
            tokens.extend(self.tokenize('<start> ' + sentence))
        return tokens

    def __getitem__(self,
                    idx: int):
        """Return ``(x, y)``: the id window starting at ``idx`` and the window shifted by one."""
        x = torch.tensor(self.token_list[idx : idx + self.sequence_length])
        y = torch.tensor(self.token_list[idx + 1 : idx + self.sequence_length + 1])
        return x, y

    # Tokenize a sentence using split()
    def tokenize(self,
                 text: str):
        """Split ``text`` on whitespace and return the list of tokens."""
        return text.split()

In [79]:
# Language Model Dataset objects for the DataLoaders:
# one per split, in the order train / validation / test
train_ds, val_ds, test_ds = (
    LangModelDataset(split_sentences)
    for split_sentences in (train_data, val_data, test_data)
)

(tensor([44, 45, 46,  0, 47, 26, 27, 28, 29, 48, 49, 41, 42, 50, 51, 52, 53, 54,
        55, 35, 36, 37, 42, 56, 57, 58, 59,  0, 35, 60]), tensor([45, 46,  0, 47, 26, 27, 28, 29, 48, 49, 41, 42, 50, 51, 52, 53, 54, 55,
        35, 36, 37, 42, 56, 57, 58, 59,  0, 35, 60, 42]))


In [102]:
# Model Hyper-Parameters
# Batch size of the train and validation DataLoaders
BATCH_SIZE = 64
# Width of the embedding vectors; tied to the GloVe vectors loaded above
EMBEDDING_DIM = GLOVE_DIM
# Hidden size of the recurrent layer and of the first dense layer
HIDDEN_DIM = 256
# One output score per vocabulary word
OUTPUT_DIM = VOCAB_LEN
# Number of stacked recurrent layers
NUM_LAYERS = 2
# Whether the recurrent layer is bidirectional
BIDIRECTION = False
# Dropout probability passed to the model
DROPOUT = 0.2
# Learning rate of the Adam optimizer
LEARNING_RATE = 0.01

In [103]:
# Pytorch Data Loaders
# Train Data Loader: reshuffled every epoch
train_loader = DataLoader(train_ds, 
                          batch_size = BATCH_SIZE, 
                          shuffle = True)
# Val Data Loader: no shuffling. Evaluation does not benefit from a random order,
# and a fixed order keeps the batches (and the averaged loss) identical across runs.
val_loader = DataLoader(val_ds, 
                        batch_size = BATCH_SIZE, 
                        shuffle = False)
# Test Data Loader: one sequence per batch, fixed order
test_loader = DataLoader(test_ds, 
                        batch_size = 1, 
                        shuffle = False)

[tensor([[  64, 2043,  133,  ...,   26,  108, 3394],
        [ 495,   64, 3098,  ...,   35, 1319,  189],
        [  27, 2821,  229,  ...,  918, 1607, 1608],
        ...,
        [ 103,   42,  465,  ...,  160,   26,  119],
        [  98,  935,  392,  ...,   32,  663,  790],
        [6755,   95,  108,  ..., 1258, 6689, 2742]]), tensor([[2043,  133,   40,  ...,  108, 3394, 3395],
        [  64, 3098,    0,  ..., 1319,  189, 2479],
        [2821,  229, 7681,  ..., 1607, 1608, 1656],
        ...,
        [  42,  465,   40,  ...,   26,  119, 2590],
        [ 935,  392,  336,  ...,  663,  790, 9406],
        [  95,  108,   35,  ..., 6689, 2742,   30]])]


In [None]:
# Print Training Batch
# Take the first batch produced by the train loader as a sanity check.
# Each dataset item is an (x, y) pair, so a batch presumably holds the stacked
# input windows and the stacked target windows - confirm against the printed output.
iterator = iter(train_loader)
inputs = next(iterator)
print(inputs)

In [107]:
# LSTM Model
class LSTM(nn.Module):
    """LSTM language model on top of frozen pre-trained word embeddings.

    Pipeline: embedding lookup -> dropout -> LSTM -> linear -> dropout -> linear.
    The model returns one score per vocabulary entry for every position of
    the input sequence.

    Args:
        embedding_dim: width of the embedding vectors (LSTM input size).
        hidden_dim: LSTM hidden size and width of the first dense layer.
        output_dim: number of output scores per position.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability used before the LSTM, between stacked
            LSTM layers, and before the output layer.
        pretrained_embeddings: optional 2-D tensor [num_words, embedding_dim].
            When omitted, the notebook-level ``embeddings_tensor`` is used.

    Raises:
        ValueError: if the embedding matrix width differs from ``embedding_dim``.
    """

    # Constructor
    def __init__(self,
                 embedding_dim,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout,
                 pretrained_embeddings = None):
        super().__init__()
        if pretrained_embeddings is None:
            pretrained_embeddings = embeddings_tensor
        # Fail early with a clear message instead of a shape error inside the LSTM
        if pretrained_embeddings.shape[1] != embedding_dim:
            raise ValueError(
                'embedding_dim ({}) does not match the width of the pre-trained '
                'embeddings ({})'.format(embedding_dim, pretrained_embeddings.shape[1]))
        # Initialize Embedding Layer with Pre-Trained GloVe Embeddings
        # (from_pretrained freezes the weights by default)
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings)
        # Initialize LSTM layer to process the vector sequences.
        # nn.LSTM applies dropout only between stacked layers and warns when it is
        # non-zero with a single layer, so it is disabled in that case.
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers = n_layers,
                            bidirectional = bidirectional,
                            dropout = dropout if n_layers > 1 else 0.0,
                            batch_first = True)
        num_directions = 2 if bidirectional else 1
        # Initialize Dense layers to predict
        self.fc1 = nn.Linear(hidden_dim * num_directions, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # Initialize dropout to improve with regularization
        self.dropout = nn.Dropout(dropout)

    # Forward Pass of Model
    def forward(self,
                x):
        """Map token ids ``x`` [batch, seq] to scores [batch, seq, output_dim]."""
        # Embedding Layer followed by Dropout before the LSTM Layer
        embedded = self.dropout(self.embedding(x))
        # LSTM Layer; only the per-position outputs are needed, the final states are dropped
        output, _ = self.lstm(embedded)
        # 1st Fully Connected Layer followed by Dropout before Output
        # NOTE(review): there is no non-linearity between fc1 and fc2 - confirm this is intended
        output = self.dropout(self.fc1(output))
        # 2nd Fully Connected Layer
        return self.fc2(output)

In [108]:
# Fix the global torch seed so the weight initialisation is reproducible
torch.manual_seed(32)

# Build the network from the hyper-parameters, then move it to the selected device
model = LSTM(embedding_dim = EMBEDDING_DIM,
             hidden_dim = HIDDEN_DIM,
             output_dim = OUTPUT_DIM,
             n_layers = NUM_LAYERS,
             bidirectional = BIDIRECTION,
             dropout = DROPOUT)
model = model.to(device)

print('LSTM Model: ', model)

LSTM Model:  LSTM(
  (embedding): Embedding(10000, 300)
  (rnn): RNN(300, 256, num_layers=2, batch_first=True, dropout=0.2)
  (fc1): Linear(in_features=256, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=10000, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [109]:
# Model Train Function
def train(loader, 
          model, 
          optimizer, 
          loss_fn):
    """Run one training epoch and return the mean batch loss.

    Args:
        loader: iterable of ``(x, y)`` batches of token ids.
        model: network called as ``model(x)``; expected to return
            [batch, sequence, output_dim] scores.
        optimizer: optimizer that updates the model's parameters.
        loss_fn: criterion called as ``loss_fn(scores_2d, targets_1d)``.

    Returns:
        Average of the per-batch losses, rounded to 4 decimals.
    """
    # Set model to train mode
    model.train()
    # Batches come off the loader on the CPU; they must live on the same device as
    # the model, otherwise the forward pass fails as soon as the model is on CUDA.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None
    losses = []
    pbar = tqdm(loader)
    for x, y in pbar:
        if model_device is not None:
            x, y = x.to(model_device), y.to(model_device)
        optimizer.zero_grad()
        
        # Calculate y_pred
        y_pred = model(x) # [Batch_Length, Sequence Length, Output Dim]
        
        # Convert y_pred to 2D Tensor (reshape also accepts non-contiguous outputs)
        y_pred = y_pred.reshape(-1, y_pred.shape[-1]) # [Batch_Length * Sequence Length, Output Dim]
        # Convert y to 1D Tensor
        y = torch.flatten(y) # [Batch_Length * Sequence Length]
        
        # Loss
        loss = loss_fn(y_pred, y)
        batch_loss = loss.item()
        pbar.set_postfix({'Loss': batch_loss})
        losses.append(batch_loss)
        
        # Calculate gradients for w/b
        loss.backward()  
        # Update weights according to optimizer rules
        optimizer.step()
    return round((sum(losses) / len(losses)), 4) # Return Average Loss

# Model Evaluate Function
def evaluate(loader, 
             model, 
             loss_fn):
    """Compute the mean batch loss of ``model`` over ``loader`` without training.

    Args:
        loader: iterable of ``(x, y)`` batches of token ids.
        model: network called as ``model(x)``; expected to return
            [batch, sequence, output_dim] scores.
        loss_fn: criterion called as ``loss_fn(scores_2d, targets_1d)``.

    Returns:
        Average of the per-batch losses, rounded to 4 decimals.
    """
    # Set model to eval mode
    model.eval()
    # Batches must be on the same device as the model (needed when it lives on CUDA)
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None
    losses = []
    pbar = tqdm(loader)
    # No gradients are needed for evaluation; skipping the autograd graph saves
    # memory and time.
    with torch.no_grad():
        for x, y in pbar:
            if model_device is not None:
                x, y = x.to(model_device), y.to(model_device)
            # Calculate y_pred
            y_pred = model(x) # [Batch_Length, Sequence Length, Output Dim]

            # Convert y_pred to 2D Tensor (reshape also accepts non-contiguous outputs)
            y_pred = y_pred.reshape(-1, y_pred.shape[-1]) # [Batch_Length * Sequence Length, Output Dim]
            # Convert y to 1D Tensor
            y = torch.flatten(y) # [Batch_Length * Sequence Length]

            # Loss
            loss = loss_fn(y_pred, y)
            batch_loss = loss.item()
            pbar.set_postfix({'Loss': batch_loss})
            losses.append(batch_loss)
    
    return round((sum(losses) / len(losses)), 4) # Return Average Loss

In [116]:
# Model Training on Train dataset and Evaluation on Validation dataset
optimizer = torch.optim.Adam(model.parameters(),
                              lr = LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss().to(device)

train_loss_list = []
val_loss_list = []

n_epochs = 1
PATH = 'best-model.pt'
# Lowest validation loss seen so far; decides when the checkpoint is overwritten
best_val_loss = float('inf')

for epoch in range(n_epochs):
    # Model Training
    train_loss = train(train_loader, 
                       model, 
                       optimizer, 
                       loss_fn)
    train_loss_list.append(train_loss)
    # Train Perplexity = exp(average cross-entropy loss)
    train_ppl = torch.exp(torch.tensor(train_loss))
    
    # Model Evaluation
    val_loss = evaluate(val_loader, 
                        model, 
                        loss_fn)
    val_loss_list.append(val_loss)
    # Val Perplexity
    val_ppl = torch.exp(torch.tensor(val_loss))
    
    print("Epoch {0} --> Train Loss: {1} | Train PPL: {2} | Val Loss: {3} | Val PPL: {4}".format(epoch + 1, train_loss, train_ppl, val_loss, val_ppl))
    
    # Save model only when the validation loss improves, so that PATH really holds
    # the best model rather than whatever the last epoch produced.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), PATH)

100%|██████████████████████████| 14525/14525 [18:22<00:00, 13.17it/s, Loss=5.35]
100%|████████████████████████████| 1153/1153 [00:32<00:00, 35.06it/s, Loss=5.18]

Epoch 1 --> Train Loss: 5.2862 | Train PPL: 197.59115600585938 | Val Loss: 5.3696 | Val PPL: 214.7769012451172





In [117]:
# Load the saved model
# Rebuild the architecture with the same hyper-parameters used for training
saved_model = LSTM(EMBEDDING_DIM, 
                   HIDDEN_DIM, 
                   OUTPUT_DIM, 
                   NUM_LAYERS, 
                   BIDIRECTION, 
                   DROPOUT).to(device)

# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU machine can still be loaded on a CPU-only one (and vice versa).
saved_model.load_state_dict(torch.load(PATH, map_location = device))
# Switch to inference mode (disables dropout)
saved_model.eval()

LSTM(
  (embedding): Embedding(10000, 300)
  (rnn): RNN(300, 256, num_layers=2, batch_first=True, dropout=0.2)
  (fc1): Linear(in_features=256, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=10000, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [118]:
# Model Predict Function
def predict(loader, 
            model, 
            loss_fn):
    """Compute the mean batch loss of ``model`` over ``loader`` with gradients disabled.

    Args:
        loader: iterable of ``(x, y)`` batches of token ids.
        model: network called as ``model(x)``; expected to return
            [batch, sequence, output_dim] scores.
        loss_fn: criterion called as ``loss_fn(scores_2d, targets_1d)``.

    Returns:
        Average of the per-batch losses, rounded to 4 decimals.
    """
    # Set model to eval mode
    model.eval()
    # Batches must be on the same device as the model (needed when it lives on CUDA)
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None
    losses = []
    pbar = tqdm(loader)
    with torch.no_grad():
        for x, y in pbar:
            if model_device is not None:
                x, y = x.to(model_device), y.to(model_device)
            # Calculate y_pred; call the module itself rather than .forward() so that
            # any registered hooks still run
            y_pred = model(x) # [Batch_Length, Sequence Length, Output Dim]

            # Convert y_pred to 2D Tensor (reshape also accepts non-contiguous outputs)
            y_pred = y_pred.reshape(-1, y_pred.shape[-1]) # [Batch_Length * Sequence Length, Output Dim]
            # Convert y to 1D Tensor
            y = torch.flatten(y) # [Batch_Length * Sequence Length]
            
            # Loss
            loss = loss_fn(y_pred, y)
            batch_loss = loss.item()
            pbar.set_postfix({'Loss': batch_loss})
            losses.append(batch_loss)
    
    return round((sum(losses) / len(losses)), 4) # Return Average Loss

# Score the reloaded model on the test split
predict_loss = predict(loader = test_loader,
                       model = saved_model,
                       loss_fn = loss_fn)
# Perplexity is the exponential of the average loss
predict_loss_tensor = torch.tensor(predict_loss)
predict_ppl = torch.exp(predict_loss_tensor)

print("Predict Loss: {0} | Predict PPL: {1}".format(predict_loss, predict_ppl))

100%|█████████████████████████| 82400/82400 [04:06<00:00, 334.12it/s, Loss=4.86]

Predict Loss: 5.2833 | Predict PPL: 197.01895141601562



