# LM demo notebook

In this notebook we will load n-gram and lstm models.
Generate text and count perplexity for both of them.

In [33]:
!pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 -f https://download.pytorch.org/whl/torch_stable.html
!pip install -U torchtext
!pip install -U mosestokenizer

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Requirement already up-to-date: torchtext in /usr/local/lib/python3.6/dist-packages (0.8.1)
Requirement already up-to-date: mosestokenizer in /usr/local/lib/python3.6/dist-packages (1.1.0)


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import sys
sys.path.append('/content/drive/MyDrive/demetre_{pipia, uridia}')
import utils

In [5]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

import torch
import torch.nn as nn
import torchtext
from torchtext.datasets import PennTreebank, LanguageModelingDataset
from mosestokenizer import *
import gensim
from tqdm import tqdm_notebook
from gensim.models import KeyedVectors
import re

# this notebook was tested with PyTorch 1.7.1 and Torchtext 0.8.1
print(torch.__version__, torchtext.__version__) 

1.7.1+cu101 0.8.1


In [6]:
#Embedding layer size
EMBED_SIZE = 100

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [8]:
#define model paths
w2v_model_path = '/content/drive/MyDrive/demetre_{pipia, uridia}/resources/word2vec.model_paragraph_all_only_georgian_shuffled_3M_30it'
df_path = '/content/drive/MyDrive/demetre_{pipia, uridia}/data/paragraph_all_only_georgian_shuffled.csv'
lstm_model_path = '/content/drive/MyDrive/demetre_{pipia, uridia}/resources/lstm_model_300K_tuning'
n_gram_model_path = '/content/drive/MyDrive/demetre_{pipia, uridia}/resources/n-gram-model'

In [9]:
# Get text field and data loaders for lstm model
text_field = utils.TextField(w2v_model_path, df_path)
txt_field = text_field.get_txt_field()
train_dl, dev_dl, test_dl = utils.DataLoader(text_field, device).get_dls()

In [10]:
# load ngram model
n_gram_model = torch.load(n_gram_model_path)

In [11]:
# load lstm model
model = utils.LSTMModel(EMBED_SIZE, 100, len(txt_field.vocab), txt_field, device, num_layers=2).to(device)
model.load_state_dict(torch.load(lstm_model_path, map_location=device))

<All keys matched successfully>

In [12]:
# context lenght, how many previous words should we use for context when generating new word
context_len = 10

In [29]:
def n_gram_generate_text_top_p(model, context, tokenizer, length:int = 10):    
    context = tokenizer(context)
    for i in range(length):

        predictions = model.get_distribution(context)
        prediction_indice = 0
        while prediction_indice == 0: # not to get <unk>
            prediction_indice = utils.top_p(predictions, 0.8)
        context.append(model.get_word_from_index(prediction_indice))

    return ' '.join(context)t   

n_gram_generate_text_top_p(n_gram_model, 'პოლიტიკა dadas', MosesTokenizer())

'პოლიტიკა dadas 1805 თვით წყალსა მოხდა ტაძრებში ქუჩის აღადგინა ნორმალური გადმოსვლას ქვის'

In [28]:
def generate_top_p_text(model, context, length:int = 10):

    model = model.to(device)
    model.eval()
    with torch.no_grad(): # tells Pytorch not to store values of intermediate computations for backward pass because we not gonna need gradients.
        
        vocab = txt_field.vocab

        context = context.split(' ')
        for i in range(length):

            x = [vocab.stoi.get(word, vocab.unk_index) for word in context[-context_len:]]
            x = torch.LongTensor(x).view(1, len(context[-context_len:])).to(device)
            
            predictions = torch.nn.Softmax()(model(x)[0,-1,:])
            prediction_indice = 0
            while prediction_indice in [0,1, 2]: # not to get <unk> <pad> and <eos>
                prediction_indice = utils.top_p(predictions, 0.8)
            
            # if prediction_indice == 2: break
            context.append(vocab.itos[ prediction_indice] )

    return ' '.join(context)

generate_top_p_text(model, 'პოლიტიკა dadas', 10)

'პოლიტიკა dadas საკუთარი პოლიტიკური მიზნებისათვის რომლის და უნდა იყოს იყოს რომელსაც ჩვენი'

In [None]:
პოლიტიკა ვისაც გვაქვს არასწორი 
ან ამის მიუხედავად პრემიერ 
ბრიუსელში პოლიტიკური ინტერესები             

In [None]:
def beam_search(context, model, num_beams=3):
    if num_beams < 0: 
        return 1, -1
    predictions = torch.nn.Softmax()(model(context)[0,-1,:])
    top_k = torch.topk(predictions, dim=-1, k=10)
    indices = top_k.indices
    values = top_k.values
    
    childs = []
    for indice, pr in zip(indices, values):
        l = context.tolist()[0].copy()
        l.append(indice.item())
        l = l[-context_len:]
        new_context = torch.tensor(l).unsqueeze(dim=0).to(device)
        prob, word_index = beam_search(new_context, model,num_beams=num_beams-1)
        prob = prob * pr.item()
        childs.append((prob, indice.item()))
    childs = sorted(childs, key=lambda tup: tup[0])
    
    for prob, word_index in childs:
        if word_index not in [0,1,2]:
            return prob, word_index

def generate_beam_search_text(model, context, length:int = 10):

    model = model.to(device)
    model.eval()
    with torch.no_grad(): # tells Pytorch not to store values of intermediate computations for backward pass because we not gonna need gradients.
        
        vocab = txt_field.vocab

        context = context.split(' ')
        for i in range(length):

            x = [vocab.stoi.get(word, vocab.unk_index) for word in context[-context_len:]]
            x = torch.LongTensor(x).view(1, len(context[-context_len:])).to(device)
            
            _, prediction_indice = beam_search(x, model)
            context.append(vocab.itos[ prediction_indice] )

    return ' '.join(context)

generate_beam_search_text(model, '', 10)

' ამ ყველა სახის უნდა ამ . ყველა ადამიანის შესახებ უნდა'

## Compare lstm and ngram models perplexities

In [None]:
utils.n_gram_compute_perplexity(n_gram_model, n_gram_model.n, text_field.test_df['Paragraph'].tolist()[:200], MosesTokenizer())

100%|██████████| 200/200 [1:55:07<00:00, 34.54s/it]


415551.41100474604

In [None]:
utils.compute_perplexity(model, test_dl, device)

24.11112977997592

მდინარე თრიალეთის სხეულს ქორწილის 
დაცვის გვიანდელი სასუნთქ იხსნა 
ხიფათი სადაც მოპყრობისა                     