In [164]:
import string
from pyexpat import model

from lightgbm import train
from unidecode import unidecode

import numpy as np
import torch
from torch import nn

In [165]:
class Word2Vec(nn.Module):
    
    def __init__(self, vocab_size, batch_size):
        super(Word2Vec, self).__init__()
        
        self.cbow = nn.Linear(vocab_size, batch_size)
        self.skip_gram = nn.Linear(batch_size, vocab_size, bias=False)
    
    def forward(self, input):
        latent_space = self.cbow(input)
        return self.skip_gram(latent_space)

In [166]:
def preprocess_text(text):
    illegal_chars = string.punctuation + "-" + "\n"
    
    for ch in illegal_chars:
        text = text.replace(ch, '')
        
    
    return text.lower()

text_lines = open("training_text", 'r').readlines()
text = ""

for line in text_lines:
    text = text + preprocess_text(line);

text = text.split(' ')

for i in range(len(text)):
    text[i] = unidecode(text[i])
    
print(len(text))


36959


In [167]:
## One Hot Encoding

map_vf = {}

word_order = [];

it = 0;

for word in text:
    if word not in map_vf:
        map_vf[word] = it
        it += 1

In [168]:
## Adding all One Hot Encoded data to training set

training_data = []

one_hot = np.zeros(it + 1)

context_size = 3

for i in range(context_size, len(text) - context_size, 1):
    X = one_hot.copy()
    for offset in range(-context_size + 1, context_size, 1):
        X[map_vf[text[i - offset]]] = 1
    training_data.append(X)

In [169]:
batch_size = 5000
device = torch.device("mps")

model = Word2Vec(len(map_vf) + 1, batch_size).to(device)

opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss = nn.MSELoss()

def train(epoch):
    print("Epoch: ", epoch)
    
    loss_total = 0;
    
    lenn = len(training_data) - (len(training_data) % batch_size)
    
    for i in range(0, lenn, batch_size):
        X_train = torch.Tensor(training_data[i:i+batch_size]).reshape(batch_size, -1).to(device)
        # print(X_train.shape)
        
        pred = model(X_train)
        loss_item = loss(X_train, pred)
        loss_total = loss_total + loss_item.item()
        
        opt.zero_grad()
        loss_item.backward()
        opt.step()
        
    print("Avg loss: ", loss_total / len(training_data) * batch_size)
    print("Total loss: ", loss_total)
        
for i in range(10):
    train(i)


Epoch:  0
Avg loss:  0.0007541246672362017
Total loss:  0.0055734337656758726
Epoch:  1
Avg loss:  0.0004725805963936605
Total loss:  0.003492654155706987
Epoch:  2
Avg loss:  0.0003195118618164244
Total loss:  0.0023613843659404665
Epoch:  3
Avg loss:  0.00026208530162262174
Total loss:  0.0019369676301721483
Epoch:  4
Avg loss:  0.00021821545537651304
Total loss:  0.0016127431445056573
Epoch:  5
Avg loss:  0.0001858723716615477
Total loss:  0.0013737083500018343
Epoch:  6
Avg loss:  0.00016501370173841742
Total loss:  0.0012195502640679479
Epoch:  7
Avg loss:  0.00014835251488878388
Total loss:  0.0010964140965370461
Epoch:  8
Avg loss:  0.00013583539672730414
Total loss:  0.001003905083052814
Epoch:  9
Avg loss:  0.00012537821355138127
Total loss:  0.0009266202250728384


In [170]:
def get_word_embedding(model, word):
    word = word.lower()
    embeddings = model.skip_gram.weight.detach().cpu()
    id = map_vf[word]
    return embeddings[id]

In [178]:
print(get_word_embedding(model, "samadaul"))

tensor([-0.0115,  0.0045,  0.0211,  ...,  0.0064, -0.0295,  0.0047])
