In [90]:
import re
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import plotly.express as px
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ID\AppData\Roaming\nltk_data...


True

In [2]:
# read text
text = open('parsed_text.txt').read()

In [3]:
def tokenize(text, lemmatization_mode=False):
    sents = sent_tokenize(text)
    words = [word_tokenize(sent) for sent in sents]
    processed = []
    if lemmatization_mode == True:
        lemmatizer = WordNetLemmatizer()
        for sent in words:
            processed.append([lemmatizer.lemmatize(word).lower() for word in sent if word.isalpha()])      # lemmatize non-numeric
    else:
        for sent in words:
            processed.append([word.lower() for word in sent if word.isalpha()])
    
    return processed

In [4]:
def bag_w(w):
    vocab = set()
    for sent in w:
        for word in sent:
            vocab.add(word)
    
    word_id, id_word = {}, {}  

    for i, word in enumerate(vocab):
        word_id[word] = i
        id_word[i] = word

    return word_id, id_word

In [5]:
def one_hot_encode(id, vocab):
    encoding = [0] * len(vocab)
    encoding[id] = 1

    return encoding

In [6]:
def generate_pairs(w, window):
    pairs = []
    for sent in w:
        sent_len = len(sent)
        for index, word in enumerate(sent):
            behind = list(range(max(0, index-window), index))
            ahead = list(range(index+1, min(sent_len, index+1+window)))
            for i in behind:
                pairs.append((word, sent[i]))
            for j in ahead:
                pairs.append((word, sent[j]))
    
    return pairs

In [7]:
def create_train_set(pairs, w_id):
    x = []
    y = []
    for x_word, y_word in pairs:
        x.append(one_hot_encode(w_id[x_word], w_id))
        y.append(one_hot_encode(w_id[y_word], w_id))

    return np.array(x), np.array(y)

In [8]:
def preprocess(t, l_mode, window):
    w = tokenize(t, lemmatization_mode=l_mode)
    w_id, id_w = bag_w(w)
    pairs = generate_pairs(w, window=window)
    x, y = create_train_set(pairs, w_id)

    return w, w_id, id_w, pairs, torch.tensor(x, dtype=torch.float), torch.tensor(y, dtype=torch.float)

In [36]:
# run
w, w_id, id_w, pairs, x, y = preprocess(t=text, l_mode=False, window=2)

In [46]:
class W2V(nn.Module):
    def __init__(self, len_vocab, len_embedding):
        super().__init__()
        self.len_vocab = len_vocab
        self.len_embedding = len_embedding
        self.layer1 = nn.Parameter(data=torch.randn(self.len_vocab, self.len_embedding), requires_grad=True)     # vocab_len * emb_len
        self.layer2 = nn.Parameter(data=torch.randn(self.len_embedding, self.len_vocab), requires_grad=True)     # emb_len * vocab_len 
 
    def forward(self, x):                                                                                         
        x = x @ self.layer1                                                                                      # num_of_pairs * vocab_len -> num_of_pairs * emb_len
        x = x @ self.layer2                                                                                      # num_of_pairs * emb_len -> num_of_pairs * vocab_len
        return x

In [47]:
len_vocab = len(w_id)
len_embedding = 10

learning_rate = 0.1
epochs = 100

model = W2V(len_vocab, len_embedding)
loss_fn = nn.CrossEntropyLoss()  
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

losses = []

for epoch in tqdm(range(epochs)):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    losses.append(loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

100%|██████████| 100/100 [00:01<00:00, 86.94it/s]


In [48]:
fig = px.line(x=range(epochs), y=losses, markers=True)
fig.update_layout(xaxis_title='epoch',yaxis_title='loss')
fig.show()

In [69]:
def similarity(vec1, vec2):
    return (vec1 @ vec2) / (torch.norm(vec1) * torch.norm(vec2))

In [70]:
def find_similar(word1, word_vec, num):
    if word1 not in word_vec:
        print('Word is not in the dictionary!')
        return []

    vec1 = word_vec[word1]
    sim = {}

    for word2, vec2 in word_vec.items():
        if word1 != word2:
            sim[word2] = similarity(vec1, vec2)

    sim_sort = sorted(sim.items(), key=lambda item: item[1], reverse=True)
    similar = sim_sort[:num]

    return similar

In [71]:
vectors = list(model.parameters())[0].detach()

In [72]:
w_vec = {word: vectors[index] for word, index in w_id.items()}

In [73]:
find_similar('machine', w_vec, num=5)

[('learning', tensor(0.6812)),
 ('invented', tensor(0.4507)),
 ('dealing', tensor(0.4400)),
 ('employee', tensor(0.4339)),
 ('when', tensor(0.4294))]

### Optional

In [74]:
w, w_id, id_w, pairs, x, y = preprocess(t=text, l_mode=False, window=4)

In [75]:
len_vocab = len(w_id)
len_embedding = 10

learning_rate = 0.1
epochs = 100

model = W2V(len_vocab, len_embedding)
loss_fn = nn.CrossEntropyLoss()  
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

losses = []

for epoch in tqdm(range(epochs)):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    losses.append(loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

100%|██████████| 100/100 [00:02<00:00, 45.12it/s]


In [76]:
vectors = list(model.parameters())[0].detach()

In [77]:
w_vec = {word: vectors[index] for word, index in w_id.items()}

In [78]:
find_similar('machine', w_vec, num=5)

[('learning', tensor(0.9599)),
 ('not', tensor(0.8703)),
 ('objectives', tensor(0.8621)),
 ('all', tensor(0.8493)),
 ('although', tensor(0.8321))]

In [79]:
w, w_id, id_w, pairs, x, y = preprocess(t=text, l_mode=False, window=2)

In [80]:
len_vocab = len(w_id)
len_embedding = 40

learning_rate = 0.1
epochs = 100

model = W2V(len_vocab, len_embedding)
loss_fn = nn.CrossEntropyLoss()  
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

losses = []

for epoch in tqdm(range(epochs)):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    losses.append(loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

100%|██████████| 100/100 [00:01<00:00, 70.93it/s]


In [81]:
vectors = list(model.parameters())[0].detach()

In [82]:
w_vec = {word: vectors[index] for word, index in w_id.items()}

In [83]:
find_similar('machine', w_vec, num=5)

[('statistically', tensor(0.4583)),
 ('learning', tensor(0.4391)),
 ('behavior', tensor(0.4192)),
 ('book', tensor(0.3855)),
 ('introduced', tensor(0.3838))]

In [84]:
w, w_id, id_w, pairs, x, y = preprocess(t=text, l_mode=False, window=4)

In [85]:
len_vocab = len(w_id)
len_embedding = 40

learning_rate = 0.1
epochs = 100

model = W2V(len_vocab, len_embedding)
loss_fn = nn.CrossEntropyLoss()  
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

losses = []

for epoch in tqdm(range(epochs)):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    losses.append(loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

100%|██████████| 100/100 [00:02<00:00, 38.69it/s]


In [86]:
vectors = list(model.parameters())[0].detach()

In [87]:
w_vec = {word: vectors[index] for word, index in w_id.items()}

In [88]:
find_similar('machine', w_vec, num=5)

[('learning', tensor(0.6488)),
 ('provides', tensor(0.5528)),
 ('set', tensor(0.4821)),
 ('describing', tensor(0.4617)),
 ('history', tensor(0.4514))]

In [91]:
w, w_id, id_w, pairs, x, y = preprocess(t=text, l_mode=True, window=2)

In [92]:
len_vocab = len(w_id)
len_embedding = 10

learning_rate = 0.1
epochs = 100

model = W2V(len_vocab, len_embedding)
loss_fn = nn.CrossEntropyLoss()  
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

losses = []

for epoch in tqdm(range(epochs)):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    losses.append(loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

100%|██████████| 100/100 [00:01<00:00, 83.32it/s]


In [93]:
vectors = list(model.parameters())[0].detach()

In [94]:
w_vec = {word: vectors[index] for word, index in w_id.items()}

In [95]:
find_similar('machine', w_vec, num=5)

[('machines', tensor(0.9255)),
 ('which', tensor(0.8805)),
 ('learning', tensor(0.8747)),
 ('perceptrons', tensor(0.8319)),
 ('he', tensor(0.8235))]