In [None]:
import torch
import torch.nn as nn
import torch.autograd as ag
from matplotlib import pyplot as plt

In [None]:

class VanilaRNNLM(nn.Module):
    """Vanilla (Elman-style) recurrent language model.

    At every time step the embedded input word is concatenated with the
    previous hidden state; one linear layer maps that vector to the next
    hidden state and another maps it to log-probabilities over the outputs.
    """

    # Hidden-layer activations selectable by name through ``sigma``.
    _ACTIVATIONS = {
        'sigmoid': torch.sigmoid,
        'tanh': torch.tanh,
        'relu': torch.relu,
    }

    def __init__(self, n_inputs, n_hiddens, n_outputs, vocab, sigma='sigmoid', phi='softmax'):
        """
        Construct a vanilla RNN.

        Params:
        n_inputs: number of input neurons (the embedding dimension)
        n_hiddens: number of hidden neurons
        n_outputs: number of output neurons
        vocab: a dictionary of the form {word: word_id}
        sigma: activation function for the hidden layer; one of 'sigmoid',
            'tanh', 'relu', or any callable applied to a tensor
        phi: output function; kept for interface compatibility. The model
            always emits log-softmax scores.

        Raises:
        ValueError: if ``sigma`` is neither a known name nor a callable.
        """
        super(VanilaRNNLM, self).__init__()
        self.n_inputs = n_inputs
        self.n_hiddens = n_hiddens
        self.n_outputs = n_outputs

        if callable(sigma):
            self.sigma = sigma
        elif sigma in self._ACTIVATIONS:
            self.sigma = self._ACTIVATIONS[sigma]
        else:
            raise ValueError(
                "unknown hidden activation %r; expected one of %s or a callable"
                % (sigma, sorted(self._ACTIVATIONS))
            )

        # The embedding width must equal n_inputs, because the linear layers
        # below are sized for a vector of n_inputs + n_hiddens features.
        self.embedding = torch.nn.Embedding(num_embeddings=len(vocab), embedding_dim=n_inputs)
        self.in2hidden = nn.Linear(n_inputs + n_hiddens, n_hiddens)
        self.in2output = nn.Linear(n_inputs + n_hiddens, n_outputs)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, xs, h0):
        """
        Params:
        xs: the input sequence [start = 0, x_1, x_2, ..., x_n, end = V]. x_i is the id of the i-th word in the sequence.
            For example, xs = [1, 3, 11, 6, 8, 2]
        h0: the initial hidden state, a tensor of shape (1, n_hiddens)

        Returns: (ys, hs) where
        ys: tensor of shape (len(xs), n_outputs); row t holds the log-probabilities
            produced at step t
        hs: tensor of shape (len(xs), n_hiddens); row t holds the hidden state
            after step t
        """
        weight = self.embedding.weight
        tokens = torch.as_tensor(xs, dtype=torch.long, device=weight.device).reshape(-1)
        h = h0.to(device=weight.device, dtype=weight.dtype)

        if tokens.numel() == 0:
            # Nothing to process: return empty sequences of the right width.
            return weight.new_zeros((0, self.n_outputs)), weight.new_zeros((0, self.n_hiddens))

        ys, hs = [], []
        for x_t in self.embedding(tokens):
            # Both heads read the same (input, previous hidden) vector.
            combined = torch.cat((x_t.unsqueeze(0), h), dim=1)
            h = self.sigma(self.in2hidden(combined))
            ys.append(self.softmax(self.in2output(combined)))
            hs.append(h)
        return torch.cat(ys, dim=0), torch.cat(hs, dim=0)

    def init_hidden(self):
        """Return a Kaiming-uniform initial hidden state of shape (1, n_hiddens)."""
        hidden = torch.empty(1, self.n_hiddens, device=self.embedding.weight.device)
        return nn.init.kaiming_uniform_(hidden)




In [None]:
# Probe once for a usable CUDA GPU; the result is a plain bool.
is_cuda = torch.cuda.is_available()

# Pick the device used later in the notebook: the GPU when present, otherwise the CPU.
device = torch.device("cuda" if is_cuda else "cpu")

In [None]:
class FancyRNNLM(nn.Module):
    """Two-layer LSTM language model: embedding -> LSTM -> dropout -> linear -> log-softmax."""

    def __init__(self, n_inputs, n_hiddens, n_outputs, vocab, sigma='sigmoid', phi='softmax'):
        """
        Construct a fancy RNN (here: a stacked LSTM).

        Params:
        n_inputs: number of input neurons (the embedding dimension)
        n_hiddens: number of hidden neurons per LSTM layer
        n_outputs: number of output neurons
        vocab: a dictionary {word: word_id}
        sigma: activation function for hidden layer; kept for interface
            compatibility. The LSTM uses its own built-in gates.
        phi: output function; kept for interface compatibility. The model
            always emits log-softmax scores, like VanilaRNNLM.
        """
        super(FancyRNNLM, self).__init__()
        self.n_inputs = n_inputs
        self.n_hiddens = n_hiddens
        self.n_outputs = n_outputs
        # Number of stacked LSTM layers; also fixes the shape of the initial state.
        self.n_layers = 2
        # The embedding width must equal the LSTM input size.
        self.embedding = torch.nn.Embedding(num_embeddings=len(vocab), embedding_dim=n_inputs)

        self.lstm = nn.LSTM(n_inputs, n_hiddens, self.n_layers, batch_first=True, dropout=0.2)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(n_hiddens, n_outputs)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, xs, h0):
        """
        Params:
        xs: word ids, either a flat sequence (treated as a batch of one) or a
            (batch, seq_len) array/tensor
        h0: initial LSTM state as an (h, c) tuple, each of shape
            (n_layers, batch, n_hiddens). A single tensor is also accepted and
            is used as h with a zero cell state.

        Returns: (ys, hs) where
        ys: tensor of shape (batch * seq_len, n_outputs) holding log-probabilities
        hs: the final LSTM state as an (h, c) tuple
        """
        weight = self.embedding.weight
        tokens = torch.as_tensor(xs, dtype=torch.long, device=weight.device)
        if tokens.dim() == 1:
            # batch_first LSTM wants (batch, seq_len); a flat list is one sequence.
            tokens = tokens.unsqueeze(0)

        if isinstance(h0, torch.Tensor):
            h0 = (h0, torch.zeros_like(h0))
        h0 = tuple(state.to(weight.device) for state in h0)

        if tokens.numel() == 0:
            # An LSTM cannot consume an empty sequence; return the state untouched.
            return weight.new_zeros((0, self.n_outputs)), h0

        embeds = self.embedding(tokens)
        lstm_out, hs = self.lstm(embeds, h0)
        # Flatten (batch, seq_len, n_hiddens) so every position is scored independently.
        flat = lstm_out.contiguous().view(-1, self.n_hiddens)
        logits = self.fc(self.dropout(flat))
        return self.softmax(logits), hs

    def init_hidden(self):
        """Return zero (h, c) states of shape (n_layers, 1, n_hiddens) on the model's device."""
        weight = self.embedding.weight
        h = weight.new_zeros((self.n_layers, 1, self.n_hiddens))
        c = weight.new_zeros((self.n_layers, 1, self.n_hiddens))
        return (h, c)

In [None]:
!git clone https://github.com/NTHoang99/NLP.git

Cloning into 'NLP'...
remote: Enumerating objects: 19, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 19 (delta 4), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (19/19), done.


In [None]:
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html

Looking in links: https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
Collecting torch_nightly
  Downloading https://download.pytorch.org/whl/nightly/cu92/torch_nightly-1.2.0.dev20190805%2Bcu92-cp37-cp37m-linux_x86_64.whl (704.8 MB)
[K     |████████████████████████████████| 704.8 MB 5.1 kB/s 
[?25hInstalling collected packages: torch-nightly
Successfully installed torch-nightly-1.2.0.dev20190805+cu92


In [None]:
!nvidia-smi

Fri Nov 19 12:49:24 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
%cd /content/NLP

/content/NLP


In [None]:
!gdown --id 148kFotM7dTaBfyD_HMkA5BSW-NFXb5Hu

Downloading...
From: https://drive.google.com/uc?id=148kFotM7dTaBfyD_HMkA5BSW-NFXb5Hu
To: /content/NLP/wikitext-103.zip
100% 189M/189M [00:01<00:00, 101MB/s] 


In [None]:
# -p makes the cell safe to re-run: no error when the directory already exists.
!mkdir -p data

mkdir: cannot create directory ‘data’: File exists


In [None]:
!unzip -uq "/content/NLP/wikitext-103.zip" -d "/content/NLP/data"

In [None]:
%%time 
!python -u main.py --cuda --emsize 650 --nhid 650 --dropout 0.2 --epochs 40 --tied  2>&1 | tee train.log  

Number of tokens:
Train:  102590700
Valid:  216347
Test:   244102
| epoch   1 |   200/146558 batches | lr 20.00 | ms/batch 151.98 | loss  9.19 | ppl  9821.10
| epoch   1 |   400/146558 batches | lr 20.00 | ms/batch 151.33 | loss  7.83 | ppl  2523.87
| epoch   1 |   600/146558 batches | lr 20.00 | ms/batch 151.65 | loss  7.33 | ppl  1532.41
| epoch   1 |   800/146558 batches | lr 20.00 | ms/batch 151.68 | loss  7.07 | ppl  1178.19
| epoch   1 |  1000/146558 batches | lr 20.00 | ms/batch 151.69 | loss  6.86 | ppl   950.11
| epoch   1 |  1200/146558 batches | lr 20.00 | ms/batch 151.71 | loss  6.73 | ppl   837.97
| epoch   1 |  1400/146558 batches | lr 20.00 | ms/batch 151.69 | loss  6.73 | ppl   833.02
| epoch   1 |  1600/146558 batches | lr 20.00 | ms/batch 151.69 | loss  6.60 | ppl   733.78
| epoch   1 |  1800/146558 batches | lr 20.00 | ms/batch 151.71 | loss  6.52 | ppl   676.96
| epoch   1 |  2000/146558 batches | lr 20.00 | ms/batch 151.70 | loss  6.39 | ppl   595.74
| epoch   1 | 

In [None]:
%%time 
!python -u main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 20 --tied  2>&1 | tee train.log  

Number of tokens:
Train:  102590700
Valid:  216347
Test:   244102
| epoch   1 |  1200/146558 batches | lr 20.00 | ms/batch 151.74 | loss  7.50 | ppl  1814.00
| epoch   1 |  2400/146558 batches | lr 20.00 | ms/batch 151.75 | loss  6.47 | ppl   643.19
| epoch   1 |  3600/146558 batches | lr 20.00 | ms/batch 151.75 | loss  6.13 | ppl   461.19
| epoch   1 |  4800/146558 batches | lr 20.00 | ms/batch 151.76 | loss  5.92 | ppl   371.79
| epoch   1 |  6000/146558 batches | lr 20.00 | ms/batch 151.75 | loss  5.78 | ppl   325.21
| epoch   1 |  7200/146558 batches | lr 20.00 | ms/batch 151.75 | loss  5.72 | ppl   304.28
| epoch   1 |  8400/146558 batches | lr 20.00 | ms/batch 151.76 | loss  5.67 | ppl   289.28
| epoch   1 |  9600/146558 batches | lr 20.00 | ms/batch 151.75 | loss  5.64 | ppl   281.71
| epoch   1 | 10800/146558 batches | lr 20.00 | ms/batch 151.75 | loss  5.54 | ppl   254.88
| epoch   1 | 12000/146558 batches | lr 20.00 | ms/batch 151.75 | loss  5.51 | ppl   247.83
| epoch   1 | 

In [None]:
import torch
import numpy as np
import pandas as pd

from model import RNNModel
from data import Dictionary, Corpus

In [None]:
# Location of the unzipped WikiText-103 files; Corpus is imported from the cloned repo's data module.
DATA_PATH = "/content/NLP/data/wikitext-103"
corpus = Corpus(DATA_PATH)

# Report how many tokens each split holds, then the vocabulary size.
print("Number of tokens:")
for split_label, split_tokens in (
    ("Train: ", corpus.train),
    ("Valid: ", corpus.valid),
    ("Test:  ", corpus.test),
):
    print(split_label, len(split_tokens))

print("Vocabulary size:", len(corpus.dictionary.idx2word))

Number of tokens:
Train:  102590700
Valid:  216347
Test:   244102
Vocabulary size: 267735


In [None]:
# Single source of truth for where the model and its inputs live.
DEVICE = torch.device("cpu")

# NOTE(security): torch.load unpickles arbitrary Python objects, so only load
# checkpoints from a trusted source (this one is produced by main.py above).
with open("/content/NLP/model.pt", 'rb') as f:
    # Remap storages straight onto DEVICE so a checkpoint saved on a GPU
    # machine still loads when DEVICE is the CPU.
    model = torch.load(f, map_location=DEVICE)
model = model.to(DEVICE)

In [None]:
# Put the loaded model in evaluation mode (disables dropout); as the last
# expression of the cell, the returned module is shown as the cell output.
model.eval()

RNNModel(
  (drop): Dropout(p=0.2, inplace=False)
  (encoder): Embedding(267735, 200)
  (rnn): LSTM(200, 200, num_layers=2, dropout=0.2)
  (decoder): Linear(in_features=200, out_features=267735, bias=True)
)

**Perplexity (+2 bonus points)**

Compute the perplexity of the models. The lower the perplexity, the higher your score.



In [None]:
%%time
# Loss used for scoring: cross-entropy between model scores and next-token ids.
CRITERION = torch.nn.CrossEntropyLoss()
# Maximum number of time steps fed to the model per chunk during evaluation.
BPTT = 50

def batchify(data, bsz):
    """Arrange a token stream into ``bsz`` parallel columns on DEVICE.

    Params:
    data: tensor of token ids (assumed 1-D - TODO confirm against Corpus)
    bsz: number of columns (independent streams) to produce

    Returns: a contiguous tensor of shape (data.size(0) // bsz, bsz); trailing
    tokens that do not fill a complete row are dropped.
    """
    rows = data.size(0) // bsz
    # Keep only the prefix that splits evenly into bsz streams.
    usable = data[: rows * bsz]
    # Each stream becomes one column, so row t holds step t of every stream.
    columns = usable.view(bsz, -1).transpose(0, 1).contiguous()
    return columns.to(DEVICE)

def get_batch(source, i):
    """Slice one evaluation chunk and its next-token targets out of ``source``.

    Params:
    source: batchified data as returned by batchify
    i: index of the first row of the chunk

    Returns: (data, target) where data is rows i..i+seq_len of source and
    target is the same window shifted one row down, flattened to 1-D.
    seq_len is at most BPTT and is shortened at the end of source so the
    shifted window stays in range.
    """
    stop = i + min(BPTT, len(source) - 1 - i)
    return source[i:stop], source[i + 1:stop + 1].view(-1)

def evaluate(data_source):
    """Return the average per-position cross-entropy of ``model`` on ``data_source``.

    Params:
    data_source: batchified token tensor of shape (n_rows, batch_size), as
        produced by batchify

    Returns: the mean loss (a float); ``np.exp`` of it is the perplexity.

    Raises:
    ValueError: if data_source has fewer than two rows, so no next-token
        target exists.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    ntokens = len(corpus.dictionary)
    # Size the hidden state from the data instead of assuming a batch size.
    hidden = model.init_hidden(data_source.size(1))
    total_loss = 0.
    n_positions = 0
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, BPTT):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            # CRITERION averages over the chunk; weight it by the chunk length
            # so short final chunks do not count as much as full ones.
            total_loss += len(data) * CRITERION(output_flat, targets).item()
            n_positions += len(data)
            hidden = repackage_hidden(hidden)
    if n_positions == 0:
        raise ValueError("data_source needs at least two rows to form input/target pairs")
    # Only len(data_source) - 1 rows have a next-token target, so normalise by
    # the number of rows actually scored rather than by len(data_source).
    return total_loss / n_positions

def repackage_hidden(h):
    """Detach hidden state(s) from the autograd graph that produced them.

    Params:
    h: a tensor, or an arbitrarily nested iterable of tensors

    Returns: a detached tensor for tensor input; otherwise a tuple with the
    same nesting whose leaves are detached tensors.
    """
    if not isinstance(h, torch.Tensor):
        # Containers (e.g. an LSTM's (h, c) pair) are rebuilt element by element.
        return tuple(map(repackage_hidden, h))
    return h.detach()
    
# NOTE(review): this scores the *training* split (corpus.train) even though the
# result is stored in `test_data` - confirm that is intended.
# 10 is the number of parallel columns (batch size) passed to batchify.
test_data = batchify(corpus.train, 10)
# Average loss from evaluate(); the next cell exponentiates it.
loss = evaluate(test_data)

In [None]:
loss, np.exp(loss)

(4.999740528464561, 148.37465510786626)

In [None]:

%%time
# Scoring criterion: cross-entropy between model scores and next-token ids.
CRITERION = torch.nn.CrossEntropyLoss()
# Upper bound on the number of rows handed to the model in one chunk.
BPTT = 50

def batchify(data, bsz):
    """Split a token stream into ``bsz`` side-by-side streams and move it to DEVICE.

    Params:
    data: tensor of token ids (assumed 1-D - TODO confirm against Corpus)
    bsz: number of streams, i.e. columns of the result

    Returns: a contiguous (data.size(0) // bsz, bsz) tensor. Tokens left over
    after filling the last complete row are discarded.
    """
    full_rows = data.size(0) // bsz
    # Drop the remainder so the stream divides evenly.
    trimmed = data[: full_rows * bsz]
    # view() lays the streams out as rows; transposing turns them into columns.
    stacked = trimmed.view(bsz, -1).transpose(0, 1).contiguous()
    return stacked.to(DEVICE)

def get_batch(source, i):
    """Return the chunk of ``source`` starting at row ``i`` and its targets.

    Params:
    source: batchified data as returned by batchify
    i: first row of the chunk

    Returns: (data, target). data covers at most BPTT rows starting at i;
    target is the same window moved one row forward and flattened to 1-D.
    The window is clipped so the shifted slice never runs past the last row.
    """
    end = i + min(BPTT, len(source) - 1 - i)
    inputs = source[i:end]
    shifted = source[i + 1:end + 1]
    return inputs, shifted.view(-1)

def evaluate(data_source):
    """Return the average per-position cross-entropy of ``model`` on ``data_source``.

    Params:
    data_source: batchified token tensor of shape (n_rows, batch_size), as
        produced by batchify

    Returns: the mean loss (a float); ``np.exp`` of it is the perplexity.

    Raises:
    ValueError: if data_source has fewer than two rows, so no next-token
        target exists.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    ntokens = len(corpus.dictionary)
    # Size the hidden state from the data instead of assuming a batch size.
    hidden = model.init_hidden(data_source.size(1))
    total_loss = 0.
    n_positions = 0
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, BPTT):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            # CRITERION averages over the chunk; weight it by the chunk length
            # so short final chunks do not count as much as full ones.
            total_loss += len(data) * CRITERION(output_flat, targets).item()
            n_positions += len(data)
            hidden = repackage_hidden(hidden)
    if n_positions == 0:
        raise ValueError("data_source needs at least two rows to form input/target pairs")
    # Only len(data_source) - 1 rows have a next-token target, so normalise by
    # the number of rows actually scored rather than by len(data_source).
    return total_loss / n_positions

def repackage_hidden(h):
    """Cut hidden state(s) loose from their computation history.

    Params:
    h: a tensor, or a (possibly nested) iterable of tensors

    Returns: ``h.detach()`` for a tensor; for anything else, a tuple built by
    applying this function to every element.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()
    
# Arrange the test split into 10 parallel columns (the batch size passed to batchify).
test_data = batchify(corpus.test, 10)
# Average loss from evaluate(); the next cell exponentiates it.
loss = evaluate(test_data)

CPU times: user 8min 53s, sys: 3.08 s, total: 8min 56s
Wall time: 4min 27s


In [None]:
loss, np.exp(loss)

(4.999740528464561, 148.37465510786626)

**Generating new text with RNNLM**

Write code to generate new text segments from the RNNLM. Produce several outputs from both `VanilaRNNLM` and `FancyRNNLM` to compare the quality of the two models.

In [None]:
# Token ids of the test split as a NumPy array.
test_tokens = corpus.test.numpy()
# Indices where the "<eos>" id occurs; consecutive entries bracket one line of text.
eos_pos = np.where(corpus.dictionary.word2idx["<eos>"] == test_tokens)[0]
print("Number of lines in test:", len(eos_pos))

Number of lines in test: 2891


In [None]:
# Show one line of the test split: the tokens strictly between the <eos>
# markers at eos_pos[28] and eos_pos[29], mapped back to words.
print(" ".join([corpus.dictionary.idx2word[c] for c in test_tokens[eos_pos[28]+1:eos_pos[29]]]))

The An Lushan Rebellion began in December 755 , and was not completely suppressed for almost eight years . It caused enormous disruption to Chinese society : the census of 754 recorded 52 @.@ 9 million people , but ten years later , the census counted just 16 @.@ 9 million , the remainder having been displaced or killed . During this time , Du Fu led a largely itinerant life unsettled by wars , associated famines and imperial displeasure . This period of unhappiness was the making of Du Fu as a poet : Even Shan Chou has written that , " What he saw around him — the lives of his family , neighbors , and strangers – what he heard , and what he hoped for or feared from the progress of various campaigns — these became the enduring themes of his poetry " . Even when he learned of the death of his youngest child , he turned to the suffering of others in his poetry instead of dwelling upon his own misfortunes . Du Fu wrote :


In [None]:
def generate_text_from_texts(texts, target_length=20, temperature=1.0):
    """Continue a prompt given as whitespace-separated tokens.

    Params:
    texts: prompt string; tokens are separated by whitespace
    target_length: number of tokens to sample
    temperature: sampling temperature forwarded to generate_text_from_tensor

    Returns: whatever generate_text_from_tensor returns, i.e.
    [context_words, generated_words].

    Raises:
    ValueError: if ``texts`` contains no tokens.
    KeyError: if a word is out of vocabulary and the vocabulary has no "<unk>" entry.
    """
    # split() without an argument ignores repeated, leading and trailing
    # whitespace; split(" ") would yield empty-string "tokens" there.
    words = texts.split()
    if not words:
        raise ValueError("texts must contain at least one token")
    word2idx = corpus.dictionary.word2idx
    # Out-of-vocabulary words fall back to "<unk>" instead of aborting generation.
    token_tensor = torch.LongTensor([
        word2idx[word if word in word2idx else "<unk>"] for word in words
    ]).to(DEVICE)
    return generate_text_from_tensor(token_tensor, target_length, temperature)

In [None]:
def generate_text_from_chunk(start, end, target_length=20, temperature=1.0):
    """Continue a passage taken from the test split.

    Params:
    start, end: indices into eos_pos; the prompt is every test token after the
        <eos> at eos_pos[start] and before the one at eos_pos[end]
    target_length: number of tokens to sample
    temperature: sampling temperature forwarded to generate_text_from_tensor

    Returns: whatever generate_text_from_tensor returns, i.e.
    [context_words, generated_words].
    """
    # Move the prompt to DEVICE so it lives where the model does, matching
    # generate_text_from_texts.
    token_tensor = corpus.test[eos_pos[start]+1:eos_pos[end]].to(DEVICE)
    return generate_text_from_tensor(token_tensor, target_length, temperature)
    

def generate_text_from_tensor(token_tensor, target_length, temperature):
    """Sample a continuation of ``token_tensor`` from the model's softmax distribution.

    Params:
    token_tensor: 1-D tensor of prompt token ids (at least one token)
    target_length: number of tokens to sample
    temperature: positive scaling factor; scores are divided by it before
        sampling, so smaller values concentrate mass on the top-scoring tokens

    Returns: [context_words, generated_words], both lists of strings.

    Raises:
    ValueError: if the prompt is empty or ``temperature`` is not positive.
    """
    if token_tensor.numel() == 0:
        raise ValueError("token_tensor must contain at least one token")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    token_tensor = token_tensor.to(DEVICE)
    idx2word = corpus.dictionary.idx2word
    res = []
    # Nothing here is trained, so keep the whole procedure (including the
    # prompt warm-up) out of the autograd graph.
    with torch.no_grad():
        hidden = model.init_hidden(1)
        if token_tensor.size(0) > 1:
            # Prime the hidden state with every prompt token except the last;
            # skipped for one-token prompts, which would be an empty sequence.
            _, hidden = model(token_tensor[:-1].unsqueeze(1), hidden)
        # The last prompt token is the first input of the sampling loop.
        input_tensor = token_tensor[-1:].long().view(1, 1).clone()
        for _ in range(target_length):
            output, hidden = model(input_tensor, hidden)
            # softmax(x / T) is the normalised form of exp(x / T): the same
            # distribution, without overflow at low temperature.
            word_probs = torch.softmax(output.squeeze().div(temperature), dim=-1)
            word_idx = torch.multinomial(word_probs, 1)[0].item()
            input_tensor.fill_(word_idx)
            res.append(word_idx)
    # tolist() also works for tensors that live on a GPU, unlike .numpy().
    return [
        [idx2word[x] for x in ids]
        for ids in (token_tensor.tolist(), res)
    ]

In [None]:
# Prompt with the test tokens between eos_pos[28] and eos_pos[33], then sample 50 new tokens.
# Sampling uses torch.multinomial and no seed is set in the visible cells, so
# the text will differ between runs.
context, new_texts = generate_text_from_chunk(28, 33, target_length=50)
# Show only the last 10 prompt words for context.
print(" ".join(context[-10:]))
# Print the generated words 10 per line.
for i in range(0, len(new_texts), 10):
    print(" ".join(new_texts[i:i+10]))

bring more papers to pile higher on my desk .
" = indiction parasitoid Griggs was a wager in both
the Liberal theodicy and De Spangled Dawit , a disgrace
to Dien Bouvar in early 1944 . He was appointed
as a soldier on 15 November 1996 by Soviet forces
because he proved responsible , with social institutions ( Corder


In [None]:
def generate_text_from_texts(texts, target_length=20, temperature=1.0):
    """Continue a prompt given as whitespace-separated tokens.

    Params:
    texts: prompt string; tokens are separated by whitespace
    target_length: number of tokens to sample
    temperature: sampling temperature forwarded to generate_text_from_tensor

    Returns: whatever generate_text_from_tensor returns, i.e.
    [context_words, generated_words].

    Raises:
    ValueError: if ``texts`` contains no tokens.
    KeyError: if a word is out of vocabulary and the vocabulary has no "<unk>" entry.
    """
    # split() without an argument ignores repeated, leading and trailing
    # whitespace; split(" ") would yield empty-string "tokens" there.
    words = texts.split()
    if not words:
        raise ValueError("texts must contain at least one token")
    word2idx = corpus.dictionary.word2idx
    # Out-of-vocabulary words fall back to "<unk>" instead of aborting generation.
    token_tensor = torch.LongTensor([
        word2idx[word if word in word2idx else "<unk>"] for word in words
    ]).to(DEVICE)
    return generate_text_from_tensor(token_tensor, target_length, temperature)

In [None]:
# Prompt with a short hand-written phrase and sample 5 new tokens.
# Sampling uses torch.multinomial and no seed is set in the visible cells, so
# the text will differ between runs.
context, new_texts =  generate_text_from_texts("He is a", target_length=5)
# Show (up to) the last 10 prompt words for context.
print(" ".join(context[-10:]))
# Print the generated words 10 per line.
for i in range(0, len(new_texts), 10):
    print(" ".join(new_texts[i:i+10]))

He is a
strong audit . = =
