In [2]:
import torch
import pandas as pd
import string
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# HyperParameter

In [4]:
EMBEDDING_DIM = 300
EPOCH = 20

In [5]:
class Word2Vec(torch.nn.Module):
    def __init__(self, voc_size, embedding_dim) -> None:
        super(Word2Vec, self).__init__()
        # Encoder
        self.w1 = torch.nn.Parameter(torch.randn(size=(embedding_dim, voc_size), requires_grad = True))
        self.b1 = torch.nn.Parameter(torch.randn(size = (embedding_dim,), requires_grad= True))
        
        # Decoder
        self.w2 = torch.nn.Parameter(torch.randn(size=(voc_size, embedding_dim), requires_grad = True))
        self.b2 = torch.nn.Parameter(torch.randn(size=(voc_size,), requires_grad = True))
    def forward(self, x):
        x = self.w1 @ x + self.b1
        x = self.w2 @ x + self.b2
        return x

In [10]:
stop_words = []
with open('./NLPUtils/english.txt', encoding='utf-8') as f:
    stop_words = [stop_word.replace('\n','') for stop_word in f.readlines()]

In [11]:
file_path = './basic_dataset/IMDB/IMDB Dataset.csv'

df = pd.read_csv(file_path, encoding='utf-8')

In [12]:
sentences = [sentence.lower() for sentence in df['review'][:200]]

In [13]:
for i in tqdm(range(len(sentences))):
    sentences[i] = sentences[i].translate(str.maketrans('', '', string.punctuation))

100%|██████████| 200/200 [00:00<00:00, 66655.61it/s]


In [14]:
def tokenize(sentences) -> list:
    tokens = [sentence.split() for sentence in sentences]
    return tokens

In [15]:
tokens = tokenize(sentences)

In [16]:
for i in tqdm(range(len(tokens))):
    for stop_word in stop_words:
        tokens[i] = list(filter(stop_word.__ne__, tokens[i]))

100%|██████████| 200/200 [00:00<00:00, 240.33it/s]


In [17]:
voc = []

In [18]:
for sentence in tqdm(sentences):
     for word in sentence:
        if word not in voc:
            voc.append(word)

100%|██████████| 200/200 [00:00<00:00, 8693.94it/s]


In [19]:
word2idx = {w:idx for (idx, w) in enumerate(voc)}
inx2word = {idx:w for (idx, w) in enumerate(voc)}

In [13]:
window_size = 3
idx_pair = []
for sentence in tqdm(sentences):
    indices = [word2idx[word] for word in sentence]
    for idx in range(len(indices)):
        if idx + window_size - 1 < len(indices):
            window = indices[idx : idx + window_size]
            for x_word in window:
                for y_word in window:
                    if x_word != y_word:
                        idx_pair.append([x_word, y_word])

100%|██████████| 200/200 [00:00<00:00, 219.01it/s]


In [14]:
DEVICE = 'cuda' if torch.cuda.is_available() else "cpu"

In [15]:
one_hot_vector = torch.zeros(size=(len(voc), len(voc)),requires_grad= False).to(DEVICE)

In [16]:
for idx, vector in enumerate(one_hot_vector):
    vector[idx] = 1.

In [17]:
word2vec = Word2Vec(voc_size=len(voc), embedding_dim=EMBEDDING_DIM).to(DEVICE)

In [18]:
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(word2vec.parameters(), lr=1e-3)
for epch in range(20):
    # correct = 0.
    for x,y in tqdm(idx_pair):
        optimizer.zero_grad()
        y = one_hot_vector[y]
        x = one_hot_vector[x]
        pred = word2vec(x)
        loss = loss_fn(pred, y)
        loss.backward()
        # correct += (pred.argmax(0) == y.argmax(0)).float()
        optimizer.step()
    # print(f'Acc = {correct/len(idx_pair)}')
    print(f'Current loss {loss.item()}')
        

100%|██████████| 1517320/1517320 [26:19<00:00, 960.64it/s]


Current loss 1.999158501625061


100%|██████████| 1517320/1517320 [26:52<00:00, 940.75it/s]


Current loss 2.0371532440185547


100%|██████████| 1517320/1517320 [26:32<00:00, 952.66it/s]


Current loss 2.018064260482788


100%|██████████| 1517320/1517320 [26:32<00:00, 952.87it/s]


Current loss 2.001497268676758


100%|██████████| 1517320/1517320 [26:30<00:00, 954.15it/s]


Current loss 2.0146384239196777


100%|██████████| 1517320/1517320 [26:29<00:00, 954.45it/s]


Current loss 2.016083240509033


 19%|█▊        | 281292/1517320 [04:55<21:38, 951.63it/s]


KeyboardInterrupt: 

In [20]:

torch.save(word2vec, 'word2vec.ph')

# Load Model

In [20]:
model = Word2Vec(len(voc), embedding_dim=EMBEDDING_DIM)

In [22]:
model.state_dict()

OrderedDict([('w1',
              tensor([[-0.7570, -1.1619,  1.5999,  ..., -0.1433, -0.8304,  1.0122],
                      [-1.4809, -0.9033, -0.3666,  ..., -3.3922, -1.0577,  0.7802],
                      [-0.4891, -2.2955, -1.0598,  ...,  1.7060,  0.2266,  0.2450],
                      ...,
                      [-0.9563, -0.9016, -0.1940,  ...,  0.1345, -0.8428,  0.5069],
                      [ 0.7356,  0.8071,  0.0396,  ..., -0.2778, -0.9067, -0.8001],
                      [-1.2157, -0.4392, -0.1110,  ...,  0.3787,  1.3319,  0.1884]])),
             ('b1',
              tensor([ 1.3634e+00, -1.5701e+00,  4.2080e-01, -1.5758e-01,  2.6060e-01,
                       1.9533e+00, -7.5437e-01,  2.2399e-01, -7.0086e-01,  9.5323e-01,
                      -1.1076e+00, -1.5141e+00, -7.6550e-01, -6.7069e-01,  4.9201e-01,
                       5.2997e-02,  8.0617e-01, -5.5404e-01,  9.9680e-01,  5.5406e-01,
                       6.5434e-01, -3.8817e-01, -8.7488e-02,  2.8059e-01,  6.1

In [6]:
temp = torch.load(f='./trained_models/word2vec.ph')