In [2]:
import torch
import pickle
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = "cpu"

# Number of samples the DataLoader groups into one mini-batch.
BATCH_SIZE = 512

# Width of the hidden (embedding) layer of the Word2Vec model.
EMBEDDING_SIZE = 300

In [3]:
# Report which device was selected by the configuration cell.
print(f'Current Device is {DEVICE}')

Current Device is cuda


In [4]:
import os

class IMDBDataset(Dataset):
    """Dataset of index pairs served as vocabulary-sized indicator vectors.

    Two pickled files are read from ``root``:

    * ``voc_file`` -- the vocabulary, a sized iterable of words. The position
      of each word in iteration order becomes its integer index.
    * ``idx_pair`` -- an indexable collection whose items unpack into
      ``(idx_x, idx_y)``. Each side is used to index a 1-D tensor, so it is
      presumably an int or a list of ints -- TODO confirm against the code
      that produced the file.

    Args:
        root: Directory containing both pickle files.
        voc_file: File name of the pickled vocabulary inside ``root``.
        idx_pair: File name of the pickled index pairs inside ``root``.
        DEVICE: Device the returned tensors are moved to.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, root = '../basic_dataset/IMDB', voc_file = 'voc.data', idx_pair = 'XY_list.data', DEVICE = 'cpu') -> None:
        super().__init__()
        # Keep the device on the instance so __getitem__ honours the argument
        # instead of silently reading a module-level global.
        self.device = DEVICE

        # SECURITY NOTE: pickle.load can execute arbitrary code. Only point
        # this class at files you created yourself or otherwise trust.
        with open(os.path.join(root, voc_file), 'rb') as f:
            self.voc = pickle.load(f)
        # Forward and reverse lookup tables between words and their indices.
        self.word2idx = {word: i for i, word in enumerate(self.voc)}
        self.idx2word = {i: word for i, word in enumerate(self.voc)}

        with open(os.path.join(root, idx_pair), 'rb') as f:
            self.idx_pair = pickle.load(f)

    def __len__(self):
        """Return the number of stored (idx_x, idx_y) pairs."""
        return len(self.idx_pair)

    def __getitem__(self, index):
        """Return the ``index``-th pair as two float vectors of vocabulary length.

        Each vector is all zeros except for the position(s) named by the
        corresponding side of the pair, which are set to 1.
        """
        idx_x, idx_y = self.idx_pair[index]
        voc_size = len(self.voc)
        x = torch.zeros(voc_size)
        y = torch.zeros(voc_size)
        x[idx_x] = 1.
        y[idx_y] = 1.
        return x.to(self.device), y.to(self.device)

In [5]:
class Word2Vec(torch.nn.Module):
    """Two stacked affine layers: vocabulary -> embedding -> vocabulary.

    Args:
        voc_size: Length of the input and output vectors.
        embedding_dim: Width of the intermediate (embedding) layer.

    All four parameters are drawn from a standard normal distribution.
    No non-linearity or softmax is applied; ``forward`` returns raw scores.
    """

    def __init__(self, voc_size, embedding_dim) -> None:
        super().__init__()
        as_param = torch.nn.Parameter  # Parameter tensors require grad by default

        # Encoder: projects a voc_size vector down to embedding_dim.
        self.w1 = as_param(torch.randn(embedding_dim, voc_size))
        self.b1 = as_param(torch.randn(embedding_dim))

        # Decoder: projects the embedding back up to voc_size scores.
        self.w2 = as_param(torch.randn(voc_size, embedding_dim))
        self.b2 = as_param(torch.randn(voc_size))

    def forward(self, x):
        """Map ``x`` (last dimension ``voc_size``) to ``voc_size`` raw scores."""
        embedded = torch.matmul(x, self.w1.t()) + self.b1
        scores = torch.matmul(embedded, self.w2.t()) + self.b2
        return scores

In [6]:
# Build the dataset from its default file locations, passing along the detected device.
imdbDataset = IMDBDataset(DEVICE = DEVICE)

In [7]:
# Shuffle the pairs every epoch: this loader feeds the training loop, and
# iterating in stored order would present the same correlated sequence of
# mini-batches on each pass.
imdbDataloader = DataLoader(imdbDataset, batch_size=BATCH_SIZE, shuffle=True)

In [8]:
# The model's input/output width is the vocabulary size held by the dataset.
word2vec = Word2Vec(
    voc_size=len(imdbDataloader.dataset.voc),
    embedding_dim=EMBEDDING_SIZE,
).to(DEVICE)

# Word2Vec.forward returns raw scores, which is what CrossEntropyLoss expects.
loss_fn = torch.nn.CrossEntropyLoss()

# Adam over all four model parameters.
optimizer = torch.optim.Adam(word2vec.parameters(), lr=1e-3)

In [9]:
# Train for five passes over the data, logging the loss every 10000 mini-batches.
for epoch in range(5):
    for step, (inputs, targets) in enumerate(imdbDataloader):
        # Forward pass and loss on the current mini-batch.
        logits = word2vec(inputs)
        loss = loss_fn(logits, targets)

        # Clear stale gradients, back-propagate, then update the parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Fires on step 0, 10000, 20000, ... of each epoch.
        if step % 10000 == 0:
            print(f'Current loss {loss.item()}')

Current loss 116.90101623535156
Current loss 21.411209106445312
Current loss 19.832914352416992
Current loss 11.248250007629395
Current loss 12.667123794555664
Current loss 8.951885223388672
Current loss 10.29633903503418
Current loss 8.24020004272461
Current loss 9.217126846313477
Current loss 7.952296257019043


In [1]:
# Create the output directory first so a missing folder cannot cost us the
# trained model after a long training run.
os.makedirs('../trained_models', exist_ok=True)
# NOTE: this pickles the whole module object, so loading the file later
# requires the Word2Vec class definition to be available.
torch.save(word2vec, '../trained_models/word2vec.pth')

NameError: name 'torch' is not defined

In [11]:
# Sanity check: show the vocabulary word stored at index 0.
imdbDataset.idx2word[0]

'reviewers'

In [12]:
def getWord(word = ''):
    """Return the one-hot vector for ``word`` on ``DEVICE``.

    The vector has one entry per vocabulary word of ``imdbDataset``; the entry
    at the word's index is 1 and every other entry is 0.

    Raises:
        KeyError: if ``word`` is not in ``imdbDataset.word2idx``.
    """
    position = imdbDataset.word2idx[word]
    one_hot = torch.zeros(len(imdbDataset.voc))
    one_hot[position] = 1.
    return one_hot.to(DEVICE)

In [18]:
# Feed the one-hot vector for 'function' through the model; the result holds
# one raw score per vocabulary entry.
# NOTE(review): this runs outside torch.no_grad(), so the output carries autograd history.
word2vec(getWord('function'))

tensor([ -0.9231,   1.1283,   3.1197,  ..., -18.2967, -29.3959, -16.3873],
       device='cuda:0', grad_fn=<AddBackward0>)