In [3]:
from pathlib import Path
import sys
sys.path.append(str(Path('src').resolve()))
import torch
from torch import nn
import numpy as np
from src.dataloader import SG_Dataset
from src.models import Skipgram_SM
from torch.utils.data import DataLoader, ConcatDataset, BatchSampler, Sampler
from collections import Counter

In [4]:
data = open('data/shakespeare_normalized.txt', 'r').read()
words = set(data.split())
vocab_size = len(words)
# When working with sents I can use this itertools.chain.from_iterable(sents) to trasnform to words 
word2id = {word: i for i, word in enumerate(words)}
id2word = {i: word for i, word in enumerate(words)} # for future use
print('tokens loaded')
freq = Counter(data.split())
min_freq = 1 # I implemented this for the future but right now I don't need it because my data is pretty small
id_sents = [[word2id[word] for word in sent.split(' ') if freq[word] >= min_freq] for sent in data.split('\n')]
window_size = 2

tokens loaded


In [5]:
len(max(id_sents, key=len))

216

In [6]:
centers, contexts = [], []
for ids in id_sents[:]:
    len_ids = len(ids)
    center, context = ids, [[] for _ in range(len_ids)]
    for i in range(len_ids):
        begin = max(0, i - window_size)
        for j in range(begin, i):
            context[i].append(ids[j])
            context[j].append(ids[i])
    # print(center, context)
    centers.extend(center)
    contexts.extend(context)

In [7]:
centers_contexts = list(zip(centers, contexts))

In [8]:
d = [[[], []] for _ in range(window_size * 2)]

In [9]:
for center, context in centers_contexts:
    if context:
        d[len(context) - 1][0].append(center)
        d[len(context) - 1][1].append(context)

In [10]:
datasets = [SG_Dataset(centers, np.stack(contexts)) for centers, contexts in d]

In [11]:
cd = ConcatDataset(datasets)

In [12]:
class SG_Softmax_Batch_Sampler(Sampler):
    """A Sequential BatchSampler for Skipgram_SM CumulativeDataset
    For more information: https://pytorch.org/docs/stable/_modules/torch/utils/data/sampler.html#BatchSampler
    """
    def calculate_len(self):
        # Below is code to count the length of the DataLoader
        # I put it here because I don't want to recalculate it constantly
        cum_len = 0
        prev_size = 0
        for cum_size, batch_size in zip(self.cumulative_sizes, self.batch_sizes):
            curr_size = cum_size - prev_size
            cum_len = (curr_size + batch_size - 1) // batch_size
            prev_size = cum_size
        self.cum_len = cum_len
    def __init__(self, cumulative_sizes, batch_size):
        self.cumulative_sizes = cumulative_sizes
        print(self.cumulative_sizes)
        self.batch_sizes = [0] * len(cumulative_sizes)
        for i in range(len(cumulative_sizes)):
            self.batch_sizes[i] = max(1, round(batch_size/(i + 1)))
        self.calculate_len()
    def __len__(self):
        return self.cum_len
    def __iter__(self):
        prev_size = 0
        for size, batch_size in zip(self.cumulative_sizes, self.batch_sizes):
            batch = [0] * batch_size
            idx_in_batch = 0
            # Begin sequential sampling
            for i in range(prev_size, size):
                batch[idx_in_batch] = i
                idx_in_batch += 1
                if idx_in_batch == batch_size:
                    yield batch
                    idx_in_batch = 0
                    # The line below I found in the pytorch implementation of BatchSampler. It seems to me that it's not necessary
                    # batch = [0] * self.batch_size
            if idx_in_batch > 0:
                yield batch[:idx_in_batch]
            prev_size = size

In [13]:
batch_sampler = SG_Softmax_Batch_Sampler(cd.cumulative_sizes, 64)

[1806, 12724, 21164, 66934]


In [72]:
1869 / 64 + (12724 - 1806) / 32 +  21228 / (round(64/3) * 3) + 66934 / 64# 2861

1610.811755952381

In [14]:
D = DataLoader(cd, batch_sampler=SG_Softmax_Batch_Sampler(cd.cumulative_sizes, 64))

[1806, 12724, 21164, 66934]


In [68]:
counter = 0
from torch.nn.functional import softmax
from torch import optim
model = Skipgram_SM(vocab_size, 10)
loss_fn = nn.CrossEntropyLoss(reduction='sum')
j = 0
times = 5
lr = 0.025
optimizer = optim.SGD(model.parameters(), lr=lr)
for X, y in D:
    optimizer.zero_grad()
    batch_size, context_len = y.shape
    if context_len == 2:
        continue
    preds = model(X)
    
    loss = 0
    for i in range(context_len):
        loss += loss_fn(preds, y[:, i])
        # print(softmax(preds[0]).mean())
    loss /= (batch_size * context_len)
    j += 1
    if j % 400 == 0:
        print(loss)
    loss.backward()
    optimizer.step()

tensor(9.4146, grad_fn=<DivBackward0>)
tensor(9.2174, grad_fn=<DivBackward0>)
tensor(9.3472, grad_fn=<DivBackward0>)
tensor(9.3097, grad_fn=<DivBackward0>)
tensor(9.1806, grad_fn=<DivBackward0>)
tensor(9.3777, grad_fn=<DivBackward0>)
tensor(9.2503, grad_fn=<DivBackward0>)
tensor(9.4274, grad_fn=<DivBackward0>)


In [366]:
print(len(id_sents))

6780
