## Natural language processing

### (17 points) Word embedding (Skip-gram)

NLTK includes a small selection of texts from the Project Gutenberg electronic text archive, which contains some 25,000 free electronic books, hosted at http://www.gutenberg.org/. We begin by getting the Python interpreter to load the NLTK package

In [2]:
import random
from collections import Counter

import nltk
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list."""
    return [item for sublist in l for item in sublist]


# Seed Python's RNG so that random operations (such as the shuffle done while
# batching) are repeatable from run to run.
random.seed(1024)

In [3]:
# Fetch the two NLTK resources this notebook relies on: the Gutenberg corpus
# and the Punkt tokenizer data.
for resource in ('gutenberg', 'punkt'):
    nltk.download(resource)

# Last expression of the cell: show the file ids available in the corpus.
nltk.corpus.gutenberg.fileids()

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Shawn\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shawn\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

Then we will use the skip-gram algorithm to analyze the first 100 sentences of "melville-moby_dick.txt"

In [49]:
# Keep only the first 100 tokenized sentences of Moby Dick.
corpus_ = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:100]

# Lower-case every token so that counting is case-insensitive.
corpus = []
for sent in corpus_:
    corpus.append([word.lower() for word in sent])

# Tally how often each distinct token occurs, sentence by sentence.
word_count = Counter()
for sent in corpus:
    word_count.update(sent)

print(len(word_count))

592


**(1) Define the first 1% of the most common words and the first 1% of the least common words as stop words. (2 points)**
(Hint: please refer to the `most_common()` method of the `Counter` class.)

In [21]:
# Rank every distinct token once, from most to least frequent.
n = len(word_count)
ranked = word_count.most_common(n)

# Number of tokens that make up 1% of the distinct tokens. The same count is
# taken from both ends of the ranking so the two stop-word groups always have
# equal size (for n == 592 this is 5 tokens from each end).
max_idx = int(0.01 * n)
min_idx = n - max_idx

most_common = ranked[:max_idx]
least_common = ranked[min_idx:]

# Each ranked entry is a (token, count) pair; only the token is kept.
stopwords = [token for token, _ in most_common + least_common]

stopwords

[',', '.', 'the', 'of', 'and', 'state', '--(', 'civitas', 'artificial', 'man']

Check the answer: [',', '.', 'the', 'of', 'and', 'man', 'artificial', 'civitas', '--(', 'state']

**(2) Construct the vocabulary (1 point)**

**Note**:
- You should remove the stop words from the corpus
- You should add \<'UNK'\> to the vocabulary

In [52]:
# Remove every occurrence of every stop word in a single pass. Filtering with
# a set avoids repeated list.remove() calls and does not depend on knowing the
# resulting vocabulary size in advance.
stopword_set = set(stopwords)
corpus = [[word for word in sent if word not in stopword_set] for sent in corpus]

# Sort the distinct words so the vocabulary order (and therefore every word
# index derived from it) is the same on every run.
vocab = sorted(set(flatten(corpus)))
vocab.append('<UNK>')

print(len(set(flatten(corpus))), len(vocab))

582 583


In [53]:
# Sanity check: the vocabulary holds the 582 distinct corpus words printed by the previous cell plus '<UNK>'.
assert len(vocab) == 583

**(3) Building the training data  (8 points)**

**Note**:
- Take all pairs containing the central word as training data
- Any pair that contains '\<DUMMY\>' should be ignored

**Example**:

For window:

('\<DUMMY\>', '[', 'moby', 'dick', 'by', 'herman', 'melville')

We know 'dick' is the central word, so we should capture the following pairs:
    
('dick', '['), ('dick', 'moby'), ('dick', 'by'), ('dick', 'herman'), ('dick', 'melville')
    


In [61]:
from itertools import combinations

In [132]:
# Number of context positions taken on each side of the central word.
WINDOW_SIZE = 3

# Pad every sentence with WINDOW_SIZE '<DUMMY>' tokens on both ends and slide
# a window of 2 * WINDOW_SIZE + 1 tokens over it.
windows = []
for c in corpus:
    padded = ['<DUMMY>'] * WINDOW_SIZE + c + ['<DUMMY>'] * WINDOW_SIZE
    windows.extend(nltk.ngrams(padded, WINDOW_SIZE * 2 + 1))

train_data = []
def get_training_data(windows, win_size):
    """Build skip-gram training records from fixed-size context windows.

    Args:
        windows: iterable of token sequences; the central word of each window
            is the token at position ``win_size``.
        win_size: number of context positions on each side of the central word.

    Returns:
        A new list of ``[central, pair]`` records, one per context position
        that does not hold ``'<DUMMY>'``. ``pair`` is a 2-tuple holding the
        central word and the context word in their window order.
    """
    records = []
    for window in windows:
        central = window[win_size]
        if central == '<DUMMY>':
            # A padding token is never a valid central word.
            continue
        for position, word in enumerate(window):
            # The centre is identified by position, not by value, so a word
            # that merely repeats the central word elsewhere in the window is
            # treated as ordinary context and creates no extra pairs.
            if position == win_size or word == '<DUMMY>':
                continue
            pair = (word, central) if position < win_size else (central, word)
            records.append([central, pair])
    return records
    
# Build the training records from the padded windows.
train_data = get_training_data(windows, WINDOW_SIZE)
# NOTE(review): 7606 is presumably the reference pair count supplied with the exercise — confirm.
assert len(train_data) == 7606

AssertionError: 

Now there are many words stored in the training data, and we need to convert them into their indices in the dictionary.

In [133]:
# Index 0 is reserved for the unknown-word token; every other vocabulary entry
# receives the next free index (entries already present keep their index).
word2index = {'<UNK>' : 0}
for vo in vocab:
    word2index.setdefault(vo, len(word2index))

# Inverse lookup: index -> word.
index2word = dict(zip(word2index.values(), word2index.keys()))

def convert_training_data_into_idx(train_data, vocab_index=None):
    """Translate word-level training records into two parallel index lists.

    Args:
        train_data: iterable of records whose first element is the central
            word and whose second element is either a single context word or
            a collection of words (for example a ``(context, central)`` pair)
            that contains the central word once next to its context word(s).
        vocab_index: optional word -> index mapping. When omitted, the
            module-level ``word2index`` is used.

    Returns:
        ``(central_word, window_word)``: two lists of equal length in which
        position ``i`` holds the indices of one (central, context) pair.

    Raises:
        KeyError: if a word is missing from the mapping.
    """
    mapping = word2index if vocab_index is None else vocab_index

    central_word = []
    window_word = []
    for data in train_data:
        central, context = data[0], data[1]
        words = [context] if isinstance(context, str) else list(context)
        if not isinstance(context, str) and central in words:
            # Drop exactly one occurrence: the central word itself. A context
            # word that is equal to the central word is kept, so both output
            # lists always grow together and stay aligned.
            words.remove(central)
        for word in words:
            central_word.append(mapping[central])
            window_word.append(mapping[word])

    return central_word, window_word
# Convert the word-level records to index lists, then zip the two lists back into per-example tuples.
central_word, window_word = convert_training_data_into_idx(train_data)
train_data = list(zip(central_word, window_word))

**(4) Build skip-gram model (3 points)**

In [None]:
class Skipgram(nn.Module):
    """Skip-gram word-embedding model with a negative-sampling loss.

    ``embedding_v`` stores the vectors of central words (these are what
    ``prediction`` returns); ``embedding_u`` stores the vectors used for
    context words, both observed and negatively sampled.
    """

    def __init__(self, vocab_size, projection_dim):
        """Create both embedding tables.

        Args:
            vocab_size: number of rows in each embedding table.
            projection_dim: dimensionality of every word vector.
        """
        super(Skipgram, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)
        # Central vectors start uniformly in [-1, 1]; context vectors start at zero.
        self.embedding_v.weight.data.uniform_(-1, 1)
        self.embedding_u.weight.data.uniform_(0, 0)

    def forward(self, center_words, target_words, outer_words):
        """Return the negative log likelihood of the batch under negative sampling.

        Args:
            center_words: LongTensor with B central-word indices (B or B x 1).
            target_words: LongTensor with B observed context indices (B or B x 1).
            outer_words: LongTensor with K negative-sample indices per example (B x K).

        Returns:
            Scalar tensor: the loss summed over the batch.
        """
        # Flatten the index tensors so both B and B x 1 inputs are accepted.
        center = self.embedding_v(center_words.reshape(-1))            # B x D
        target = self.embedding_u(target_words.reshape(-1))            # B x D
        negatives = self.embedding_u(
            outer_words.reshape(center.size(0), -1))                   # B x K x D

        # Observed pairs: log sigmoid(u_target . v_center).
        score = torch.sum(center * target, dim=1)                      # B
        log_target = F.logsigmoid(score)

        # Negative samples: log sigmoid(-u_k . v_center) is computed per
        # sample and only then summed. Summing the raw scores first would
        # yield a different quantity.
        neg_score = torch.bmm(negatives, center.unsqueeze(2)).squeeze(2)  # B x K
        sum_log_sampled = F.logsigmoid(-neg_score).sum(dim=1)          # B

        loss = log_target + sum_log_sampled

        return -loss.sum()

    def prediction(self, inputs):
        """Return the central-word embedding of every index in ``inputs``."""
        return self.embedding_v(inputs)

**(5) Train skip-gram model (3 points)**

In [None]:
def prepare_sequence(seq, word2index):
    """Convert a sequence of words into a 1-D LongTensor of vocabulary indices.

    Args:
        seq: iterable of words.
        word2index: word -> index mapping; words it does not contain are
            mapped to the index stored under ``"<UNK>"``.

    Returns:
        A ``torch.long`` tensor with one index per word of ``seq``.
    """
    idxs = []
    for w in seq:
        idx = word2index.get(w)
        idxs.append(word2index["<UNK>"] if idx is None else idx)
    # torch.tensor replaces the deprecated Variable(LongTensor(...)) wrapper.
    return torch.tensor(idxs, dtype=torch.long)

def getBatch(batch_size, train_data):
    """Shuffle ``train_data`` in place and yield it in consecutive batches.

    Args:
        batch_size: maximum number of examples per batch; must be positive.
        train_data: list of training examples (reordered in place).

    Yields:
        Slices of ``train_data`` holding ``batch_size`` examples each; the last
        batch may be shorter. Nothing is yielded for empty data.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A non-positive step would never advance through the data.
        raise ValueError("batch_size must be a positive integer")
    random.shuffle(train_data)
    for start in range(0, len(train_data), batch_size):
        yield train_data[start:start + batch_size]

In [None]:
def _as_index_column(items, device):
    """Stack a batch of word indices (ints or tensors) into a B x 1 LongTensor."""
    if torch.is_tensor(items[0]):
        return torch.cat([item.reshape(1, -1) for item in items]).long().to(device)
    return torch.tensor(items, dtype=torch.long, device=device).view(-1, 1)


def train_model(n_epochs, model, batch_size, train_data, opt_fn, num_negatives=5):
    """
    Trains the model on a dataset.

    Args:
        n_epochs: number of epochs
        model: module called as ``model(inputs, targets, outers)`` that returns
            a scalar loss and exposes an ``embedding_u`` embedding table
        batch_size: number of (central, context) pairs per batch
        train_data: list of (central index, context index) pairs; it is
            shuffled in place by ``getBatch`` at the start of every epoch
        opt_fn: the optimizer that updates the model's parameters
        num_negatives: number of negative samples drawn per training pair

    Returns:
        A tuple ``(model, epoch_losses)`` where ``epoch_losses`` holds the mean
        batch loss of every epoch.
    """
    vocab_size = model.embedding_u.num_embeddings
    device = model.embedding_u.weight.device
    epoch_losses = []
    if not train_data:
        # Nothing to train on.
        return model, epoch_losses

    for epoch in range(n_epochs):
        batch_losses = []
        for batch in getBatch(batch_size, train_data):
            inputs, targets = zip(*batch)
            inputs = _as_index_column(inputs, device)  # B x 1
            targets = _as_index_column(targets, device)  # B x 1
            # Negative samples are drawn uniformly from the vocabulary. This is
            # a simplification: a draw may coincide with the true context word.
            outers = torch.randint(
                0, vocab_size, (inputs.size(0), num_negatives), device=device)  # B x K
            model.zero_grad()

            loss = model(inputs, targets, outers)

            loss.backward()
            opt_fn.step()
            batch_losses.append(loss.item())

        epoch_losses.append(float(np.mean(batch_losses)))
        if epoch % 10 == 0:
            print("Epoch : %d, mean_loss : %.02f" % (epoch, epoch_losses[-1]))

    return model, epoch_losses


num_epochs = 1  # Max number of training epochs
batch_size = 128  # Set the batch size
embed_size = 10  # Dimensionality of every word vector
# Size the embedding tables from the index mapping instead of a hard-coded
# number, so the model always matches the vocabulary actually built.
model = Skipgram(len(word2index), embed_size)
lr = 0.1 # Set the learning rate
opt_fn = optim.SGD(model.parameters(),lr=lr)  # Select an optimizer
# Keep the returned values so the trained model and its losses can be inspected.
model, losses = train_model(num_epochs, model, batch_size, train_data, opt_fn)