<a href="https://colab.research.google.com/github/GloriaShuang/NLP-Projects/blob/master/lm_homework_1001_0.3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DS-GA 1011 Homework 2
## N-Gram and Neural Language Modeling

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
!pip install jsonlines



In [0]:
import os
import json
import jsonlines
import numpy as np
from collections import defaultdict

## I. N-Gram Language Modeling

#### Utilities

In [0]:
def load_wikitext(filename='wikitext2-sentencized.json'):
    if not os.path.exists(filename):
        !wget "https://nyu.box.com/shared/static/9kb7l7ci30hb6uahhbssjlq0kctr5ii4.json" -O $filename
    
    datasets = json.load(open(filename, 'r'))
    for name in datasets:
        datasets[name] = [x.split() for x in datasets[name]]
    vocab = list(set([t for ts in datasets['train'] for t in ts]))      
    print("Vocab size: %d" % (len(vocab)))
    return datasets, vocab

def perplexity(model, sequences):
    n_total = 0
    logp_total = 0
    for sequence in sequences:
        logp_total += model.sequence_logp(sequence)
        n_total += len(sequence) + 1  
    ppl = 2 ** (- (1.0 / n_total) * logp_total)  
    return ppl

In [0]:
class NGramAdditive(object):
    def __init__(self, n, delta, vsize):
        self.n = n
        self.delta = delta
        self.count = defaultdict(lambda: defaultdict(float))
        self.total = defaultdict(float)
        self.vsize = vsize
    
    def estimate(self, sequences):
        for sequence in sequences:
            padded_sequence = ['<bos>']*(self.n-1) + sequence + ['<eos>']
            for i in range(len(padded_sequence) - self.n+1):
                ngram = tuple(padded_sequence[i:i+self.n])
                prefix, word = ngram[:-1], ngram[-1]
                self.count[prefix][word] += 1
                self.total[prefix] += 1
                
    def sequence_logp(self, sequence):
        padded_sequence = ['<bos>']*(self.n-1) + sequence + ['<eos>']
        total_logp = 0
        for i in range(len(padded_sequence) - self.n+1):
            ngram = tuple(padded_sequence[i:i+self.n])
            total_logp += np.log2(self.ngram_prob(ngram))
        return total_logp

    def ngram_prob(self, ngram):
        prefix = ngram[:-1]
        word = ngram[-1]
        prob = ((self.delta + self.count[prefix][word]) / 
                (self.total[prefix] + self.delta*self.vsize))
        return prob

In [0]:
datasets, vocab = load_wikitext()

delta = 0.0005
for n in [2, 3, 4]:
    lm = NGramAdditive(n=n, delta=delta, vsize=len(vocab)+1)  # +1 is for <eos>
    lm.estimate(datasets['train'])

    print("Baseline (Additive smoothing, n=%d, delta=%.4f)) Train Perplexity: %.3f" % (n, delta, perplexity(lm, datasets['train'])))
    print("Baseline (Additive smoothing, n=%d, delta=%.4f)) Valid Perplexity: %.3f" % (n, delta, perplexity(lm, datasets['valid'])))

--2019-10-01 21:22:37--  https://nyu.box.com/shared/static/9kb7l7ci30hb6uahhbssjlq0kctr5ii4.json
Resolving nyu.box.com (nyu.box.com)... 103.116.4.197
Connecting to nyu.box.com (nyu.box.com)|103.116.4.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9kb7l7ci30hb6uahhbssjlq0kctr5ii4.json [following]
--2019-10-01 21:22:37--  https://nyu.box.com/public/static/9kb7l7ci30hb6uahhbssjlq0kctr5ii4.json
Reusing existing connection to nyu.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://nyu.app.box.com/public/static/9kb7l7ci30hb6uahhbssjlq0kctr5ii4.json [following]
--2019-10-01 21:22:38--  https://nyu.app.box.com/public/static/9kb7l7ci30hb6uahhbssjlq0kctr5ii4.json
Resolving nyu.app.box.com (nyu.app.box.com)... 103.116.4.199
Connecting to nyu.app.box.com (nyu.app.box.com)|103.116.4.199|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://public.boxcloud.com/d/1/

## II. Neural Language Modeling with a Recurrent Neural Network

In [0]:
import torch
import torch.nn as nn

#### Utilities

(Hint: you can adopt the `Dictionary`, dataset loading, and training code from the lab for use here)

##### Dictionary

In [0]:
import sys

try:
    import jsonlines
except ImportError:
    print('Installing the package, RESTART THIS CELL')
    !{sys.executable} -m pip install jsonlines
  
try:
    from tqdm import tqdm
except ImportError:
    print('Installing the package, RESTART THIS CELL')
    !{sys.executable} -m pip install tqdm
  
import os
    


class Dictionary(object):
    def __init__(self, datasets, include_valid=False):
        self.tokens = []
        self.ids = {}
        self.counts = {}
        
        # add special tokens
        self.add_token('<bos>')
        self.add_token('<eos>')
        self.add_token('<pad>')
        self.add_token('<unk>')
        
        for line in tqdm(datasets['train']):
            for w in line:
                self.add_token(w)
                    
        if include_valid is True:
            for line in tqdm(datasets['valid']):
                for w in line:
                    self.add_token(w)
                            
    def add_token(self, w):
        if w not in self.tokens:
            self.tokens.append(w)
            _w_id = len(self.tokens) - 1
            self.ids[w] = _w_id
            self.counts[w] = 1
        else:
            self.counts[w] += 1

    def get_id(self, w):
        return self.ids[w]
    
    def get_token(self, idx):
        return self.tokens[idx]
    
    def decode_idx_seq(self, l):
        return [self.tokens[i] for i in l]
    
    def encode_token_seq(self, l):
        return [self.ids[i] if i in self.ids else self.ids['<unk>'] for i in l]
    
    def __len__(self):
        return len(self.tokens)

 ##### Dataset loading

In [0]:
def tokenize_dataset(datasets, dictionary, ngram_order=2):
    tokenized_datasets = {}
    for split, dataset in datasets.items():
        _current_dictified = []
        for l in tqdm(dataset):
            l = ['<bos>']*(ngram_order-1) + l + ['<eos>']
            encoded_l = dictionary.encode_token_seq(l)
            _current_dictified.append(encoded_l)
        tokenized_datasets[split] = _current_dictified
        
    return tokenized_datasets

In [0]:
# this function slices the input sequence into ngrams i.e.
# [0,1,2,3,4,5] will be sliced into bigrams
# [0,1], [1,2], [2,3], [3,4], [4,5] etc for bigger orders

def slice_sequences_given_order(tokenized_dataset_with_spec, ngram_order=5):
    sliced_datasets = {}
    for split, dataset in tokenized_dataset_with_spec.items():
        _list_of_sliced_ngrams = []
        for seq in tqdm(dataset):
            ngrams = [seq[i:i+ngram_order] for i in range(len(seq)-ngram_order+1)]
            _list_of_sliced_ngrams.extend(ngrams)
        sliced_datasets[split] = _list_of_sliced_ngrams

    return sliced_datasets

In [0]:
def load_wikitext(filename='wikitext2-sentencized.json'):
    if not os.path.exists(filename):
        !wget "https://nyu.box.com/shared/static/9kb7l7ci30hb6uahhbssjlq0kctr5ii4.json" -O $filename
    
    datasets = json.load(open(filename, 'r'))
    for name in datasets:
        datasets[name] = [x.split() for x in datasets[name]]
    vocab = list(set([t for ts in datasets['train'] for t in ts]))      
    print("Vocab size: %d" % (len(vocab)))
    return datasets, vocab
  
  
# wiki_dataset, vocab = load_wikitext()

# print(wiki_dataset['train'][1])

In [0]:

wiki_dataset, wiki_dict_ori = load_wikitext()

wiki_dict = Dictionary(wiki_dataset, include_valid=True)

# checking some example
print(' '.join(wiki_dataset['train'][3010]))

encoded = wiki_dict.encode_token_seq(wiki_dataset['train'][3010])
print(f'\n encoded - {encoded}')
decoded = wiki_dict.decode_idx_seq(encoded)
print(f'\n decoded - {decoded}')

# personachat_tokenized_datasets_5gram = tokenize_dataset(wiki_dataset, wiki_dict, ngram_order=5)


  1%|          | 458/78274 [00:00<00:17, 4576.98it/s]

Vocab size: 33175


100%|██████████| 78274/78274 [02:17<00:00, 570.25it/s]
100%|██████████| 8464/8464 [00:10<00:00, 779.53it/s]

The Nataraja and Ardhanarishvara sculptures are also attributed to the Rashtrakutas .

 encoded - [75, 8816, 30, 8817, 8732, 70, 91, 2960, 13, 6, 8806, 39]

 decoded - ['The', 'Nataraja', 'and', 'Ardhanarishvara', 'sculptures', 'are', 'also', 'attributed', 'to', 'the', 'Rashtrakutas', '.']





In [0]:
print(' '.join(wiki_dataset['train'][3010]))
encoded = wiki_dict.encode_token_seq(wiki_dataset['train'][3010])
print(f'\n encoded - {encoded}')
decoded = wiki_dict.decode_idx_seq(encoded)
print(f'\n decoded - {decoded}')

print(len(wiki_dict))

The Nataraja and Ardhanarishvara sculptures are also attributed to the Rashtrakutas .

 encoded - [75, 8816, 30, 8817, 8732, 70, 91, 2960, 13, 6, 8806, 39]

 decoded - ['The', 'Nataraja', 'and', 'Ardhanarishvara', 'sculptures', 'are', 'also', 'attributed', 'to', 'the', 'Rashtrakutas', '.']
33181


### II.1 LSTM and Hyper-Parameters

#### 1. Define the dataset and dataloader

##### 1.1 Define the dataset

In [0]:
import torch
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader

class TensoredDataset(Dataset):
    def __init__(self, list_of_lists_of_tokens):
        self.input_tensors = []
        self.target_tensors = []
        
        for sample in list_of_lists_of_tokens:
            self.input_tensors.append(torch.tensor([sample[:-1]], dtype=torch.long))
            self.target_tensors.append(torch.tensor([sample[1:]], dtype=torch.long))
    
    def __len__(self):
        return len(self.input_tensors)
    
    def __getitem__(self, idx):
        # return a (input, target) tuple
        return (self.input_tensors[idx], self.target_tensors[idx])

In [0]:
def pad_list_of_tensors(list_of_tensors, pad_token):
    max_length = max([t.size(-1) for t in list_of_tensors])
    padded_list = []
    
    for t in list_of_tensors:
        padded_tensor = torch.cat([t, torch.tensor([[pad_token]*(max_length - t.size(-1))], dtype=torch.long)], dim = -1)
        padded_list.append(padded_tensor)
        
    padded_tensor = torch.cat(padded_list, dim=0)
    
    return padded_tensor

def pad_collate_fn(batch):
    # batch is a list of sample tuples
    input_list = [s[0] for s in batch]
    target_list = [s[1] for s in batch]
    
    #pad_token = persona_dict.get_id('<pad>')
    pad_token = 2
    
    input_tensor = pad_list_of_tensors(input_list, pad_token)
    target_tensor = pad_list_of_tensors(target_list, pad_token)
    
    return input_tensor, target_tensor

##### 1.2 define the tokenized datasets

In [0]:
print(wiki_dataset['train'][1])

encoded = wiki_dict.encode_token_seq(wiki_dataset['train'][1])
print(wiki_dict.decode_idx_seq(encoded))

['Released', 'in', 'January', '2011', 'in', 'Japan', ',', 'it', 'is', 'the', 'third', 'game', 'in', 'the', 'Valkyria', 'series', '.']
['Released', 'in', 'January', '2011', 'in', 'Japan', ',', 'it', 'is', 'the', 'third', 'game', 'in', 'the', 'Valkyria', 'series', '.']


In [0]:
wiki_tokenized_datasets = tokenize_dataset(wiki_dataset, wiki_dict)
wiki_tensor_dataset = {}

for split, listoflists in wiki_tokenized_datasets.items():
    wiki_tensor_dataset[split] = TensoredDataset(listoflists)
    
# check the first example
wiki_tensor_dataset['train'][13]

100%|██████████| 78274/78274 [00:00<00:00, 120058.08it/s]
100%|██████████| 8464/8464 [00:00<00:00, 125508.97it/s]
100%|██████████| 9708/9708 [00:00<00:00, 128737.65it/s]


(tensor([[  0, 171,  70, 172, 173, 174, 175,  23, 176, 177, 111, 178, 179, 180,
           10, 111, 181, 182, 183, 173, 184, 185, 186,  30, 183, 173,   3, 187,
           39]]),
 tensor([[171,  70, 172, 173, 174, 175,  23, 176, 177, 111, 178, 179, 180,  10,
          111, 181, 182, 183, 173, 184, 185, 186,  30, 183, 173,   3, 187,  39,
            1]]))

##### Define the dataloader

In [0]:
wiki_loaders = {}

batch_size = 64

for split, wiki_dataset in wiki_tensor_dataset.items():
    wiki_loaders[split] = DataLoader(wiki_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate_fn)

#### 2. Define the model and initialize the model

##### 1. define the model



In [0]:
import torch.nn as nn

class LSTMLanguageModel(nn.Module):
    """
    This model combines embedding, rnn and projection layer into a single model
    """
    def __init__(self, options):
        super().__init__()
        
        # create each LM part here 
        self.lookup = nn.Embedding(num_embeddings=options['num_embeddings'], embedding_dim=options['embedding_dim'], padding_idx=options['padding_idx'])
        self.LSTM = nn.LSTM(options['input_size'], options['hidden_size'], options['num_layers'], dropout=options['rnn_dropout'], batch_first=True)
        self.projection = nn.Linear(options['hidden_size'], options['num_embeddings'])
        
    def forward(self, encoded_input_sequence):
        """
        Forward method process the input from token ids to logits
        """
        embeddings = self.lookup(encoded_input_sequence)
        rnn_outputs = self.LSTM(embeddings)
        logits = self.projection(rnn_outputs[0])
        
        return logits

In [0]:
import os 

if not os.path.exists('/content/drive/My Drive/nlp2_save_model'):
  os.mkdir('/content/drive/My Drive/nlp2_save_model')

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import pickle
import os

# creating a LSTM model, criterion and optimizer


# create the model folder
epoch_num = 0
load_pretrained = False


num_gpus = torch.cuda.device_count()
if num_gpus > 0:
    current_device = 'cuda'
else:
    current_device = 'cpu'
    
    
    

if load_pretrained:
    # input the epoch num you want to continue based on
    epoch_num = 0
    
    embedding_size = 256
    hidden_size = 300
    num_layers = 1
    rnn_dropout = 0.3

    model_predix = model_predix = '1001_LSTM_'+'dim_'+str(embedding_size)+'hidden_size_'+str(hidden_size)+'rnn_dropout_'+str(rnn_dropout)+ 'num_layers_' +str(num_layers)
    path_root = '/content/drive/My Drive/nlp2_save_model/' + model_predix

    option_path = os.path.join(path_root,'options.pickle')
    f = open(option_path, 'rb')
    options = pickle.load(f)

  
    model_name = 'lstm' + str(epoch_num)+'.pth'
    model_path = os.path.join(path_root,model_name)
    if not os.path.exists(model_path):
        raise EOFError('Download pretrained model!')
    model_dict = torch.load(model_path)
    model = LSTMLanguageModel(options).to(current_device)
    model.load_state_dict(model_dict)
    
else:
  # setting the new model parametter
#     embedding_size = 256
#     hidden_size = 512
#     num_layers = 3
#     rnn_dropout = 0.3
    embedding_size = 256
    hidden_size = 300
    num_layers = 1
    rnn_dropout = 0.3
    
    model_predix = model_predix = '1001_LSTM_'+'dim_'+str(embedding_size)+'hidden_size_'+str(hidden_size)+'rnn_dropout_'+str(rnn_dropout)+ 'num_layers_' + str(num_layers)
    path_root = '/content/drive/My Drive/nlp2_save_model/' + model_predix

    if not os.path.exists(path_root):
      os.mkdir(path_root)
    
    options = {
        'num_embeddings': len(wiki_dict),
        'embedding_dim': embedding_size,
        'padding_idx': wiki_dict.get_id('<pad>'),
        'input_size': embedding_size,
        'hidden_size': hidden_size,
        'num_layers': num_layers,
        'rnn_dropout': rnn_dropout,
    }
    
    model_para = options
    
    with open(os.path.join(path_root,'options.pickle'), 'wb') as handle:
      pickle.dump(model_para, handle, protocol=pickle.HIGHEST_PROTOCOL)

    model = LSTMLanguageModel(options).to(current_device)

    
    
criterion = nn.CrossEntropyLoss(ignore_index=wiki_dict.get_id('<pad>'))

model_parameters = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.SGD(model_parameters, lr=0.001, momentum=0.999)

  "num_layers={}".format(dropout, num_layers))


In [0]:
model

LSTMLanguageModel(
  (lookup): Embedding(33181, 256, padding_idx=2)
  (LSTM): LSTM(256, 300, batch_first=True, dropout=0.3)
  (projection): Linear(in_features=300, out_features=33181, bias=True)
)

In [0]:
# now we make same training loop, now with dataset and the model
import torch 
import pickle

model_para = options


# define the epoch number of start training

if load_pretrained == True:
  epoch_now = epoch_num +1
else:
  epoch_now = 0
  
# if using pretrained model, the load the previous plot_cache 
if load_pretrained:
  plot_cache = torch.load(os.path.join(path_root,'plot_acc'))
else:
  plot_cache = []
    
    
# start training
for epoch_number in range(epoch_now,100):
    avg_loss=0
    if not load_pretrained:
        # do train
        model.train()
        train_log_cache = []
        for i, (inp, target) in enumerate(wiki_loaders['train']):
            optimizer.zero_grad()
            inp = inp.to(current_device)
            target = target.to(current_device)
            logits = model(inp)
            
            loss = criterion(logits.view(-1, logits.size(-1)), target.view(-1))
            
            loss.backward()
            optimizer.step()
            
            train_log_cache.append(loss.item())
            
            if i % 100 == 0:
                avg_loss = sum(train_log_cache)/len(train_log_cache)
                print('Step {} avg train loss = {:.{prec}f}'.format(i, avg_loss, prec=4))
                train_log_cache = []
                
    model_path = os.path.join(path_root,'lstm'+ str(epoch_number)+".pth")            
    torch.save(model.state_dict(), model_path)           
                
            
    #do valid
    valid_losses = []
    model.eval()
    with torch.no_grad():
        for i, (inp, target) in enumerate(wiki_loaders['valid']):
            inp = inp.to(current_device)
            target = target.to(current_device)
            logits = model(inp)

            loss = criterion(logits.view(-1, logits.size(-1)), target.view(-1))
            valid_losses.append(loss.item())
        avg_val_loss = sum(valid_losses) / len(valid_losses)
        print('Validation loss after {} epoch = {:.{prec}f}'.format(epoch_number, avg_val_loss, prec=4))
        
    plot_cache.append((avg_loss, avg_val_loss))

#     if load_pretrained:
#         break
        
torch.save(plot_cache, os.path.join(path_root,'plot_acc'))  

Step 0 avg train loss = 10.4201
Step 100 avg train loss = 10.3716
Step 200 avg train loss = 10.0798
Step 300 avg train loss = 9.3163
Step 400 avg train loss = 8.5175
Step 500 avg train loss = 8.2100
Step 600 avg train loss = 7.9637
Step 700 avg train loss = 7.7632
Step 800 avg train loss = 7.7155
Step 900 avg train loss = 7.5711
Step 1000 avg train loss = 7.4836
Step 1100 avg train loss = 7.4365
Step 1200 avg train loss = 7.3373
Validation loss after 0 epoch = 7.1523
Step 0 avg train loss = 7.2859
Step 100 avg train loss = 7.2802
Step 200 avg train loss = 7.2160
Step 300 avg train loss = 7.1749
Step 400 avg train loss = 7.1063
Step 500 avg train loss = 7.0531
Step 600 avg train loss = 7.0193
Step 700 avg train loss = 6.9520
Step 800 avg train loss = 6.9078
Step 900 avg train loss = 6.8756
Step 1000 avg train loss = 6.8456
Step 1100 avg train loss = 6.8129
Step 1200 avg train loss = 6.7719
Validation loss after 1 epoch = 6.5664
Step 0 avg train loss = 6.7693
Step 100 avg train loss = 6.

#### Results (LSTM vs. Baseline)

In [0]:
  plot_cache = torch.load(os.path.join(path_root,'plot_acc'))

In [0]:
import matplotlib.pyplot as plt
import numpy

epochs = numpy.array(list(range(len(plot_cache))))
plt.plot(epochs, [i[0] for i in plot_cache], label='Train loss')
plt.plot(epochs, [i[1] for i in plot_cache], label='Valid loss')

plt.legend()
plt.title('Loss curves')
plt.show()

In [0]:
import matplotlib.pyplot as plt
import numpy

epochs = numpy.array(list(range(len(plot_cache))))
plt.plot(epochs, [2**(i[0]/numpy.log(2)) for i in plot_cache], label='Train ppl')
plt.plot(epochs, [2**(i[1]/numpy.log(2)) for i in plot_cache], label='Valid ppl')

plt.legend()
plt.title('PPL curves')
plt.show()

#### Performance Variation Based on Hyperparameter Values

### II.2 Learned Embeddings

#### Utilities

Below is code to use [UMAP](https://umap-learn.readthedocs.io/en/latest/) to find a 2-dimensional representation of a weight matrix, and plot the resulting 2-dimensional points that correspond to certain words.

Use `!pip install umap-learn` to install UMAP.

In [0]:
%pylab inline 
import umap
import matplotlib.pyplot as plt

def umap_plot(weight_matrix, word_ids, words):
    """Run UMAP on the entire Vxd `weight_matrix` (e.g. model.lookup.weight or model.projection.weight),
    And plot the points corresponding to the given `word_ids`. """
    reduced = umap.UMAP(min_dist=0.0001).fit_transform(weight_matrix.detach().cpu().numpy())
    plt.figure(figsize=(20,20))

    to_plot = reduced[word_ids, :]
    plt.scatter(to_plot[:, 0], to_plot[:, 1])
    for i, word_id in enumerate(word_ids):
        current_point = to_plot[i]
        plt.annotate(words[i], (current_point[0], current_point[1]))

    plt.grid()
    plt.show()

In [0]:
Vsize = 100                                 # e.g. len(dictionary)
d = 32                                      # e.g. model.lookup.weight.size(1) 
fake_weight_matrix = torch.randn(Vsize, d)  # e.g. model.lookup.weight

words = ['the', 'dog', 'ran']
word_ids = [4, 54, 20]                  # e.g. use dictionary.get_id on a list of words

umap_plot(fake_weight_matrix, word_ids, words)

#### II.2.1 Word Similarity

#### II.2.2 Embedding Layer

#### II.2.3 Projection Layer

### II.3 Scoring

#### II.3.2 Highest and Lowest scoring sequences

#### II.3.3 Modified sequences

### II.4 Sampling

#### II.4.3 Number of unique tokens and sequence length 

(1,000 samples vs. 1,000 randomly selected validation-set sequences)

#### II.4.4 Example Samples