In [1]:
import numpy as np
import pandas as pd
import re
from collections import Counter

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words = set(stop_words)

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
df = pd.read_csv('training/training-data.1m', sep='\t', header=None)

In [4]:
# df2 = pd.read_csv('drive/MyDrive/nlp/training-data.1m.conll', sep='\t', header=None)

In [5]:
df.columns = ['text']

In [6]:
df.text

0         The U.S. Centers for Disease Control and Preve...
1         When Ms. Winfrey invited Suzanne Somers to sha...
2         Elk calling -- a skill that hunters perfected ...
3                                                  Don 't !
4         Fish , ranked 98th in the world , fired 22 ace...
                                ...                        
897686    He cited " inefficient production " and " cost...
897687    Gary Jenkins , head of fixed income research a...
897688    The Wildcats had 15 turnovers and shot 39 perc...
897689    What high-quality research ( or logic ) suppor...
897690    When afflicted during a game , he would make e...
Name: text, Length: 897691, dtype: object

In [7]:
RE_STRIP_SPECIAL_CHARS = r'[^a-zA-Z0-9\s]'
RE_WHITESPACE = r'[\s]+'
RE_NUMBER = r'[0-9]+'

In [8]:
def extract_token(corpus):
    print('tokenizing sentences')
    result = []
    for sentence in corpus:
        s = sentence.lower()
        s = re.sub(RE_STRIP_SPECIAL_CHARS, '', s)
        # s = re.sub(RE_NUMBER, '<NUMBER>', s)
        s = re.sub(RE_NUMBER, ' ', s)
        tokenized = nltk.word_tokenize(s)
        filtered_sentence = [w for w in tokenized if not w in stop_words]
        result.append(filtered_sentence)
    return result

In [9]:
res = extract_token(df.text)

tokenizing sentences


In [10]:
def get_vocab(tokens):
    vocab = []
    for sentence in tokens:
        for token in sentence:
            vocab.append(token)

    word_counter = Counter(vocab)
    avg_doc_length = len(vocab) / len(tokens)
    vocab = set(vocab)
    return vocab, word_counter, avg_doc_length

In [11]:
vocab, word_counter, avg_doc_length = get_vocab(res)
vocab_size = len(vocab)
print(f'vocab size is {vocab_size}, average doc length is {avg_doc_length}')

vocab size is 262416, average doc length is 13.993544549293688


In [12]:
remove_words = []
remove_below = 4

for word, count in word_counter.items():
    if count < remove_below:
        remove_words.append(word)

remove_words = set(remove_words)
print(f'{len(remove_words)} words that appeared less than {remove_below} times will be substituted by <UNK>')

183020 words that appeared less than 4 times will be substituted by <UNK>


In [13]:
processed_tokens = []
print('substituting words')

for sentence in res:
    temp = []
    for word in sentence:
        if word in remove_words:
            temp.append('<UNK>')
        else:
            temp.append(word)
    processed_tokens.append(temp)

substituting words


In [32]:
vocab, word_counter, avg_doc_length = get_vocab(processed_tokens)
vocab_size = len(vocab)
print(f'vocab size is {vocab_size}, new average doc length is {avg_doc_length}')

vocab size is 79397, new average doc length is 13.993544549293688


In [15]:
# sorted(word_counter.items(), key=lambda x:x[1])

In [33]:
word_to_idx = {w: idx for idx, w in enumerate(vocab)}
idx_to_word = {idx: w for idx, w in enumerate(vocab)}

In [36]:
def generate_skipgram(tokens, window):
    result = []
    print(f'generating skipgrams with window size {window}')
    for idx, sentence in enumerate(tokens):
        if idx % 100000 == 0 and idx > 0:
            print(f'processed {idx} sentences')
        for idx, token in enumerate(sentence):
            for i in range(idx - window, idx, 1):
                if i >= 0:
                    skipgram = [token, sentence[i]]
                    result.append(skipgram)
            for j in range(idx + 1, idx + window + 1, 1):
                if j < len(sentence):
                    skipgram = [token, sentence[j]]
                    result.append(skipgram)
    return result

In [37]:
def skipgram_to_idx(skipgrams, idx_dict):
    print('creating skipgram words <-> index dictionary')
    result = []
    for skipgram in skipgrams:
        result.append([idx_dict[skipgram[0]], idx_dict[skipgram[1]]])
    return result

In [38]:
def generate_batches(skipgrams, batch_size):
    n_batches = len(skipgrams) // batch_size
    skipgrams = skipgrams[:n_batches*batch_size]
    for i in range(0, len(skipgrams), batch_size):
        context = []
        target = []
        batch = skipgrams[i:i+batch_size]
        for j in range(len(batch)):
            context.append(batch[j][0])
            target.append(batch[j][1])
        yield context, target    

In [39]:
w = 2
skipgrams = generate_skipgram(processed_tokens, window=w)
print(f'got {len(skipgrams)} skipgrams')
skipgrams_idx = skipgram_to_idx(skipgrams, word_to_idx)

generating skipgrams with window size 2
processed 100000 sentences
processed 200000 sentences
processed 300000 sentences
processed 400000 sentences
processed 500000 sentences
processed 600000 sentences
processed 700000 sentences
processed 800000 sentences
got 44877554 skipgrams
creating skipgram words <-> index dictionary


In [40]:
import torch
from torch import nn
import torch.optim as optim

In [41]:
class SkipgramModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, word_dist):
        super().__init__()
        
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.word_dist = word_dist
        
        # ("orange", "juice", observed=1), ("orange", "king", observed=0) 
        # => "orange" is context word, "juice" & "king" are target words
        self.context_embed = nn.Embedding(vocab_size, embed_dim)
        self.target_embed = nn.Embedding(vocab_size, embed_dim)
        
        self.context_embed.weight.data.uniform_(-1, 1)
        self.target_embed.weight.data.uniform_(-1, 1)
    
    def get_context_row(self, word):
        return self.context_embed(word)
    
    def get_target_row(self, word):
        return self.target_embed(word)
    
    def get_negative_samples(self, batch_size, k):
        negative_samples = torch.multinomial(self.word_dist, batch_size * k, replacement=True)
        device = "cuda" if self.target_embed.weight.is_cuda else "cpu"
        negative_samples = negative_samples.to(device)
        return self.target_embed(negative_samples).view(batch_size, k, self.embed_dim)  

In [42]:
class SkipgramLoss(nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self, context_vectors, target_vectors, negative_vectors):
        batch_size, embed_dim = context_vectors.shape
        context_vectors = context_vectors.view(batch_size, embed_dim, 1)
        target_vectors = target_vectors.view(batch_size, 1, embed_dim)
        
        observed_sample_loss = torch.bmm(target_vectors, context_vectors).sigmoid().log()
        observed_sample_loss = observed_sample_loss.squeeze()
        
        negative_sample_loss = torch.bmm(negative_vectors.neg(), context_vectors).sigmoid().log()
        negative_sample_loss = negative_sample_loss.squeeze().sum(1)
        
        return -(observed_sample_loss + negative_sample_loss).mean()

In [43]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
word_freq = np.asarray(sorted(word_counter.values(), reverse=True))
unigram_dist = word_freq / word_freq.sum()
negative_sample_dist = torch.from_numpy(unigram_dist**(0.75) / np.sum(unigram_dist**(0.75)))

embed_dim = 100
model = SkipgramModel(vocab_size, embed_dim, negative_sample_dist).to(device)
criterion = SkipgramLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0015)

print_every = 1
epochs = 8
k = 4
batch_size = 512

print('training started')
# train for some number of epochs
for e in range(epochs):

    counter=0
    
    # get our input, target batches
    for context_words, target_words in generate_batches(skipgrams_idx, batch_size):
        context, targets = torch.LongTensor(context_words), torch.LongTensor(target_words)
        context, targets = context.to(device), targets.to(device)

        # input, outpt, and noise vectors
        context_vectors = model.get_context_row(context)
        target_vectors = model.get_target_row(targets)
        negative_vectors = model.get_negative_samples(batch_size, k)

        # negative sampling loss
        loss = criterion(context_vectors, target_vectors, negative_vectors)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        counter+=1
        if counter % 10000 == 0:
            print(counter)
        

    # loss stats
    if e % print_every == 0:
        print(f"Epoch: {e}/{epochs}")
        print("Loss: ", loss.item()) # avg batch loss at this point in training
    

training started
10000
20000
30000
40000
50000
60000
70000
80000
Epoch: 0/8
Loss:  0.8400945663452148
10000
20000
30000
40000
50000
60000
70000
80000
Epoch: 1/8
Loss:  0.8830307126045227
10000
20000
30000
40000
50000
60000
70000
80000
Epoch: 2/8
Loss:  0.7795015573501587
10000
20000
30000
40000
50000
60000
70000
80000
Epoch: 3/8
Loss:  0.8539145588874817
10000
20000
30000
40000
50000
60000
70000
80000
Epoch: 4/8
Loss:  0.8403704166412354
10000
20000
30000
40000
50000
60000
70000
80000
Epoch: 5/8
Loss:  0.7971633076667786
10000
20000
30000
40000
50000
60000
70000
80000
Epoch: 6/8
Loss:  0.8513500690460205
10000
20000
30000
40000
50000
60000
70000
80000
Epoch: 7/8
Loss:  0.855947732925415


In [44]:
def output_embed(embed):
    print('writing embedding to output file')
    f = open('embedding.txt', 'w')
    for idx, word_embed in enumerate(embed):
        word = idx_to_word[idx]
        temp = word + ' '
        for value in word_embed:
            temp = temp + str(value) + ' '
        temp += '\n'
        f.write(temp)
    f.close()
    print('completed')

In [45]:
embeddings = model.context_embed.weight.to('cpu').data.numpy()
output_embed(embeddings)

writing embedding to output file
completed


In [64]:
a='<UNK> 123 456'

In [67]:
a[6:]

'123 456'

In [47]:
from numpy import dot
from numpy.linalg import norm

In [58]:
a=embeddings[word_to_idx['boston']]
b=embeddings[word_to_idx['irvine']]

In [59]:
cos_sim = dot(a, b)/(norm(a)*norm(b))

In [60]:
cos_sim

0.289817

In [62]:
word_to_idx['<UNK>']

9675

In [63]:
embeddings[9675]

array([-0.05414752, -0.0376316 , -0.02607704, -0.06123179,  0.00534598,
        0.7662117 ,  0.03429497,  0.11548834,  0.04311212,  0.02372784,
       -0.07481746,  0.10039123, -0.15158641, -0.0273816 ,  0.28386605,
       -0.00294422, -0.11725447,  0.09890588,  0.06209876, -0.1405471 ,
       -0.20329618, -0.06901078, -0.03143039,  0.11849811,  0.02379608,
       -0.04710384,  0.09432679, -0.13477474,  0.00616562,  0.05474778,
        0.01303839,  0.05359967, -0.06368975,  0.04925928, -0.02631722,
        0.12471944,  0.11074349,  0.38512275,  0.07833233, -0.07567672,
       -0.16856281,  0.01495748,  0.07998393,  0.05758069, -0.03740498,
       -0.18101503,  0.13332672, -0.30309963,  0.0449719 , -0.12572257,
       -0.10403565,  0.10048373, -0.04887088,  0.19241288,  0.0100302 ,
       -0.02922419,  0.03214812, -0.11194465, -0.08714218, -0.04433919,
        0.18598603,  0.0391029 ,  0.12263826, -0.04211675, -0.06916683,
        0.14960961, -0.01032088,  0.00958018,  0.05241602, -0.02

In [51]:
def generate_input_layer(word_idx):
    input_layer = torch.zeros(vocab_size).float()
    input_layer[word_idx] = 1.0
    return input_layer

In [53]:
print('initializing NN')
embedding_dim = 50
W2 = Variable(torch.randn(vocab_size, embedding_dim).float(), requires_grad = True)
W1 = Variable(torch.randn(embedding_dim, vocab_size).float(), requires_grad = True)

num_epochs = 1000
learning_rate = 0.01

print('training started')
for epo in range(num_epochs):
    loss_val = 0
    for data, target in skipgrams_idx:
        x = Variable(generate_input_layer(data)).float()
        y_true = Variable(torch.from_numpy(np.array([target])).long())

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)
    
        log_softmax = F.log_softmax(z2, dim=0)

        loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        loss_val += loss.item()
        loss.backward()
        with torch.no_grad():
            W1.data -= learning_rate * W1.grad.data
            W2.data -= learning_rate * W2.grad.data

            W1.grad.data.zero_()
            W2.grad.data.zero_()
    if epo % 50 == 0:    
        print(f'Loss at epo {epo}: {loss_val/len(skipgrams_idx)}')
    else:
        print(epo)

initializing NN
training started
Loss at epo 0: 18.454513640520123
1


KeyboardInterrupt: 