<a href="https://colab.research.google.com/github/MarioGzSl/word2vec/blob/main/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [80]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from tqdm import tqdm

def preprocess(text):
    return text.lower().split()

dataset_wikipedia = load_dataset("wikipedia", "20220301.en", split='train')

N = 1000
text = []
for i in range(N):
    article = dataset_wikipedia[i]['text']
    text.extend(preprocess(article))


In [81]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def create_vocabulary(text):
    word_counts = Counter(text)
    vocabulary = {word: idx for idx, word in enumerate(word_counts)}
    return vocabulary, word_counts

In [82]:
class SkipGramDataset(Dataset):
    def __init__(self, text, window_size=2):
        self.text = text
        self.window_size = window_size
        self.vocabulary, self.word_counts = create_vocabulary(text)

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
      target_word = self.text[idx]
      target_idx = self.vocabulary[target_word]

      start = idx - self.window_size
      end = idx + self.window_size + 1

      context_indices = []
      for i in range(start, end):
          if i != idx and 0 <= i < len(self.text):
              context_indices.append(self.vocabulary[self.text[i]])
          elif i != idx:
              context_indices.append(0)

      context_tensor = torch.tensor(context_indices, dtype=torch.long)
      target_tensor = torch.tensor(target_idx, dtype=torch.long)

      return context_tensor, target_tensor

class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target_idxs):
        target_embeds = self.embeddings(target_idxs)
        out = self.output_layer(target_embeds)
        log_probs = torch.log_softmax(out, dim=1)
        return log_probs

In [83]:
dataset = SkipGramDataset(text)
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

vocab_size = len(dataset.vocabulary)
embedding_dim = 128

model = SkipGramModel(vocab_size, embedding_dim).to(device)

In [84]:
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(100):
    total_loss = 0
    for context, target in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
        target, context = target.to(device), context.to(device)

        optimizer.zero_grad()

        log_probs = model(target)

        log_probs = log_probs.unsqueeze(1).repeat(1, context.size(1), 1)
        log_probs = log_probs.view(-1, log_probs.size(-1))

        context = context.view(-1)
        loss = loss_function(log_probs, context)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch}, Total loss: {total_loss}")
    if(epoch%10 == 0):
      torch.save(model.embeddings.state_dict(), f"/content/drive/MyDrive/word2vec/word_embeddings_chk_'{epoch/10}'.pth")



torch.save(model.embeddings.state_dict(), '/content/drive/MyDrive/word2vec/word_embeddings.pth')

Epoch 1: 100%|██████████| 27693/27693 [21:34<00:00, 21.39it/s]


Epoch 0, Total loss: 315992.89405441284


Epoch 2: 100%|██████████| 27693/27693 [21:35<00:00, 21.37it/s]


Epoch 1, Total loss: 289394.6640806198


Epoch 3:  50%|████▉     | 13812/27693 [10:46<10:49, 21.37it/s]


KeyboardInterrupt: ignored

In [127]:
def create_inverse_vocabulary(vocabulary):
    inverse_vocabulary = {idx: word for word, idx in vocabulary.items()}
    return inverse_vocabulary

vocabulary, inverse_vocabulary = dataset.vocabulary, create_inverse_vocabulary(dataset.vocabulary)

def evaluate_word(model, word, vocabulary, inverse_vocabulary, device='cpu'):
    model.to(device).eval()

    if word in vocabulary:
        word_idx = torch.tensor([vocabulary[word]], dtype=torch.long).to(device)

        with torch.no_grad():
            log_probs = model(word_idx)

        probs = torch.exp(log_probs).squeeze(0)

        topk_values, topk_indices = probs.topk(100, largest=True, sorted=True)

        top_words = [inverse_vocabulary[idx.item()] for idx in topk_indices]

        return top_words, topk_values
    else:
        return f"La palabra '{word}' no se encontró en el vocabulario.", None

def get_embedding(model, word, device='cpu'):
    model.to(device).eval()

    if word in vocabulary:
      word_idx = torch.tensor([vocabulary[word]], dtype=torch.long).to(device)

      with torch.no_grad():
        word_embedding = model.embeddings(word_idx)
    return word_embedding

def get_words(word_embedding, vocabulary, inverse_vocabulary, device='cpu'):
    word_embedding = word_embedding.to(device)

    all_embeddings = torch.stack([model.embeddings(torch.tensor([idx])) for idx in range(len(vocabulary))]).squeeze(1)

    similarities = torch.nn.functional.cosine_similarity(word_embedding, all_embeddings)

    topk_values, topk_indices = similarities.topk(120, largest=True, sorted=True)

    closest_words = [inverse_vocabulary[idx.item()] for idx in topk_indices]

    return closest_words, topk_values


In [132]:
embedding = get_embedding(model, "morning")

In [133]:
get_words(embedding, vocabulary, inverse_vocabulary, device='cpu')

(['morning',
  'assam,',
  'schatz.',
  '"mentality"',
  'chatti.',
  'andranik',
  'nobilis,',
  'over-zealous',
  'one-off',
  'irrespective',
  'loses',
  'tegmen',
  'djurdjevic',
  'somatic',
  'saško)',
  'army."',
  'apple-1',
  '1851',
  '1–83",',
  'howerd,',
  '(leu,',
  'gulag,',
  'sarayı',
  'quietly.',
  'fältskog,',
  'heythrop',
  'poulin,',
  'relinquishing',
  'mouseion,',
  'conceptacles',
  'refuge.',
  'remittance',
  'chapter)',
  'fireproof',
  'hilal',
  'non-combatants',
  'sommerville.',
  'analog/hybrid',
  'catulus.',
  'scandalized',
  'mayock',
  'yellow-orange',
  'tone),',
  'suitors.',
  'lombardia',
  'mersenne',
  'given,',
  "mouth'.",
  'eliyahu,',
  'whatever',
  'balasko,',
  '597.',
  '5,610',
  'estate".',
  'p2000',
  'philosophers".',
  'achillia,',
  'taronites,',
  'lyon).',
  'coucy',
  "d'encouragement",
  'hardening,',
  'heat,',
  'expo,',
  'domagoj',
  'relationship."',
  'monahan,',
  'poisoning',
  'quaraouiyine',
  'juárez',
  'ap.'

In [96]:
word_to_test = "woman"
top_words, top_values = evaluate_word(model, word_to_test, vocabulary, inverse_vocabulary, device='cpu')
print(f"Las palabras más probables para '{word_to_test}' son:")
for word, value in zip(top_words, top_values):
    print(f"{word}: {value:.4f}")

Las palabras más probables para 'woman' son:
the: 0.0792
a: 0.0318
in: 0.0316
of: 0.0311
and: 0.0275
to: 0.0174
as: 0.0112
is: 0.0028
–: 0.0012
that: 0.0009
by: 0.0008
was: 0.0007
for: 0.0006
with: 0.0005
on: 0.0003
from: 0.0003
he: 0.0001
an: 0.0001
it: 0.0001
his: 0.0001
at: 0.0001
has: 0.0001
had: 0.0001
have: 0.0001
not: 0.0001
or: 0.0000
this: 0.0000
also: 0.0000
which: 0.0000
are: 0.0000
cinema.": 0.0000
but: 0.0000
kraivichien,: 0.0000
husayn: 0.0000
bee's: 0.0000
arnbitter,: 0.0000
spyrka: 0.0000
be: 0.0000
one: 0.0000
(b.: 0.0000
cristo: 0.0000
insisted: 0.0000
tunnels: 0.0000
expression.": 0.0000
dormancy.: 0.0000
mirabili: 0.0000
vesuviana,: 0.0000
rubber,: 0.0000
(bell's: 0.0000
mechanic.: 0.0000
¥2.15: 0.0000
viscosity.: 0.0000
1239): 0.0000
(d.: 0.0000
theaters: 0.0000
mavroneri: 0.0000
chiklis,: 0.0000
resolved?".: 0.0000
afford.: 0.0000
physik: 0.0000
antigonia.: 0.0000
oistrophe,: 0.0000
thoughts,: 0.0000
capo: 0.0000
iiis: 0.0000
realtoo: 0.0000
conduction.: 0.0000
ad