<a href="https://colab.research.google.com/github/MarioGzSl/word2vec/blob/main/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from tqdm import tqdm

def preprocess(text):
    return text.lower().split()

dataset_wikipedia = load_dataset("wikipedia", "20220301.en", split='train')

N = 100
text = []
for i in range(N):
    article = dataset_wikipedia[i]['text']
    text.extend(preprocess(article))


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def create_vocabulary(text):
    word_counts = Counter(text)
    vocabulary = {word: idx for idx, word in enumerate(word_counts)}
    return vocabulary, word_counts

In [None]:
class SkipGramDataset(Dataset):
    def __init__(self, text, window_size=2):
        self.text = text
        self.window_size = window_size
        self.vocabulary, self.word_counts = create_vocabulary(text)

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
      target_word = self.text[idx]
      target_idx = self.vocabulary[target_word]

      start = idx - self.window_size
      end = idx + self.window_size + 1

      context_indices = []
      for i in range(start, end):
          if i != idx and 0 <= i < len(self.text):
              context_indices.append(self.vocabulary[self.text[i]])
          elif i != idx:
              context_indices.append(0)

      context_tensor = torch.tensor(context_indices, dtype=torch.long)
      target_tensor = torch.tensor(target_idx, dtype=torch.long)

      return context_tensor, target_tensor

class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target_idxs):
        target_embeds = self.embeddings(target_idxs)
        out = self.output_layer(target_embeds)
        log_probs = torch.log_softmax(out, dim=1)
        return log_probs

In [None]:
dataset = SkipGramDataset(text)
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

vocab_size = len(dataset.vocabulary)
embedding_dim = 128

model = SkipGramModel(vocab_size, embedding_dim).to(device)

In [None]:
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(10):
    total_loss = 0
    for context, target in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
        target, context = target.to(device), context.to(device)

        optimizer.zero_grad()

        log_probs = model(target)

        log_probs = log_probs.unsqueeze(1).repeat(1, context.size(1), 1)
        log_probs = log_probs.view(-1, log_probs.size(-1))

        context = context.view(-1)
        loss = loss_function(log_probs, context)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch}, Total loss: {total_loss}")


torch.save(model.embeddings.state_dict(), 'word_embeddings.pth')

Epoch 1: 100%|██████████| 3810/3810 [00:50<00:00, 75.05it/s]


Epoch 0, Total loss: 41500.32925224304


Epoch 2: 100%|██████████| 3810/3810 [00:50<00:00, 75.46it/s]


Epoch 1, Total loss: 39893.88434123993


Epoch 3: 100%|██████████| 3810/3810 [00:50<00:00, 75.82it/s]


Epoch 2, Total loss: 38725.03659439087


Epoch 4: 100%|██████████| 3810/3810 [00:50<00:00, 75.67it/s]


Epoch 3, Total loss: 37820.6716632843


Epoch 5: 100%|██████████| 3810/3810 [00:50<00:00, 74.99it/s]


Epoch 4, Total loss: 37117.140500068665


Epoch 6: 100%|██████████| 3810/3810 [00:50<00:00, 75.69it/s]


Epoch 5, Total loss: 36586.55619430542


Epoch 7: 100%|██████████| 3810/3810 [00:50<00:00, 75.78it/s]


Epoch 6, Total loss: 36170.13304901123


Epoch 8: 100%|██████████| 3810/3810 [00:50<00:00, 75.72it/s]


Epoch 7, Total loss: 35827.666774749756


Epoch 9: 100%|██████████| 3810/3810 [00:50<00:00, 75.11it/s]


Epoch 8, Total loss: 35544.81055355072


Epoch 10: 100%|██████████| 3810/3810 [00:50<00:00, 75.58it/s]

Epoch 9, Total loss: 35309.880621910095





In [73]:
def create_inverse_vocabulary(vocabulary):
    inverse_vocabulary = {idx: word for word, idx in vocabulary.items()}
    return inverse_vocabulary

vocabulary, inverse_vocabulary = dataset.vocabulary, create_inverse_vocabulary(dataset.vocabulary)

def evaluate_word(model, word, vocabulary, inverse_vocabulary, device='cpu'):
    model.to(device).eval()

    if word in vocabulary:
        word_idx = torch.tensor([vocabulary[word]], dtype=torch.long).to(device)

        with torch.no_grad():
            log_probs = model(word_idx)

        probs = torch.exp(log_probs).squeeze(0)

        topk_values, topk_indices = probs.topk(10, largest=True, sorted=True)

        top_words = [inverse_vocabulary[idx.item()] for idx in topk_indices]

        return top_words, topk_values
    else:
        return f"La palabra '{word}' no se encontró en el vocabulario.", None



In [78]:
word_to_test = "king"
top_words, top_values = evaluate_word(model, word_to_test, vocabulary, inverse_vocabulary, device='cpu')
print(f"Las palabras más probables para '{word_to_test}' son:")
for word, value in zip(top_words, top_values):
    print(f"{word}: {value:.4f}")

Las palabras más probables para 'king' son:
the: 0.1016
and: 0.0320
of: 0.0295
in: 0.0140
to: 0.0068
a: 0.0018
that: 0.0002
by: 0.0002
hectocotyl: 0.0001
vector: 0.0001
