<a href="https://colab.research.google.com/github/HeshamEL-Shreif/word2vec-from-scratch/blob/main/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install dependencies

In [None]:
! pip install datasets

# Imports

In [2]:
import torch
import torch.nn as nn
import transformers
from huggingface_hub import notebook_login
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import random
from tqdm import tqdm
import torch.nn.functional as F

In [3]:
# Log in to the Hugging Face Hub (optional when only public datasets are used).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Load Dataset

In [4]:
# Download the AG News dataset from the Hugging Face Hub.
dataset = load_dataset("ag_news")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [5]:
# Inspect the available splits, their features, and their row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


## Get a subset as the corpus

In [6]:
# Use the first 5,000 training articles as the corpus.
# Slice the rows first and then select the column, so only those rows are
# read instead of materialising the entire text column before slicing.
corpus = dataset["train"][:5000]["text"]

# Build Tokenizer

In [7]:
# Word-level model: each distinct token is one vocabulary entry, and anything
# outside the vocabulary is encoded as "[UNK]".
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
# Pre-split raw text into words and punctuation marks before the model sees it.
tokenizer.pre_tokenizer = Whitespace()
# Cap the learned vocabulary at 5000 entries and register the special tokens.
trainer = WordLevelTrainer(vocab_size=5000, special_tokens=["[UNK]", "[PAD]"])

In [8]:
# Learn the vocabulary from the corpus using the trainer configured above.
tokenizer.train_from_iterator(corpus, trainer)

In [9]:
# Sanity check: encode the first article and show its tokens and their ids.
output = tokenizer.encode(corpus[0])
print(output.tokens)
print(output.ids)

['Wall', 'St', '.', 'Bears', '[UNK]', 'Back', 'Into', 'the', 'Black', '(', 'Reuters', ')', 'Reuters', '-', '[UNK]', '-', 'sellers', ',', 'Wall', 'Street', "'", 's', '[UNK]', '\\', 'band', 'of', '[UNK]', '-', '[UNK]', ',', 'are', 'seeing', 'green', 'again', '.']
[360, 1073, 2, 4896, 0, 1160, 1582, 3, 3778, 13, 20, 15, 20, 5, 0, 5, 4775, 4, 360, 397, 16, 11, 0, 32, 3545, 8, 0, 5, 0, 4, 50, 3690, 1476, 531, 2]


In [10]:
# Encode every article in the corpus into a list of vocabulary ids.
tokenized_text = [tokenizer.encode(sentence).ids for sentence in corpus]

In [11]:
# Show the id sequence stored for the first article.
print(tokenized_text[0])

[360, 1073, 2, 4896, 0, 1160, 1582, 3, 3778, 13, 20, 15, 20, 5, 0, 5, 4775, 4, 360, 397, 16, 11, 0, 32, 3545, 8, 0, 5, 0, 4, 50, 3690, 1476, 531, 2]


In [12]:
# Trained vocabulary; its length is used later as the model's vocabulary size.
vocab = tokenizer.get_vocab()

# Get Skip-Gram Pairs

In [13]:
def get_skip_gram_pairs(tokenized_text, window_size):
    """Build (center, context) training pairs for a skip-gram model.

    Args:
        tokenized_text: Iterable of sentences, each an indexable sequence of
            token ids.
        window_size: Maximum distance, in tokens, between a center token and
            a context token on either side.

    Returns:
        A list of ``(center_id, context_id)`` tuples. For every position in
        every sentence, the token is paired with each other token that lies
        within ``window_size`` positions of it; windows never cross sentence
        boundaries and a position is never paired with itself.
    """
    pairs = []
    for sentence in tokenized_text:
        n_tokens = len(sentence)
        for center_pos, center_id in enumerate(sentence):
            # Clamp the window to the sentence boundaries.
            window_start = max(0, center_pos - window_size)
            window_stop = min(n_tokens, center_pos + window_size + 1)
            pairs.extend(
                (center_id, sentence[context_pos])
                for context_pos in range(window_start, window_stop)
                if context_pos != center_pos
            )
    return pairs

In [14]:
# Build the training pairs with a context window of 2 tokens on each side.
pairs = get_skip_gram_pairs(tokenized_text, 2)

In [15]:
# Peek at the first ten (center, context) id pairs.
print(pairs[0:10])

[(360, 1073), (360, 2), (1073, 360), (1073, 2), (1073, 4896), (2, 360), (2, 1073), (2, 4896), (2, 0), (4896, 1073)]


# Skip-gram dataset

In [16]:
class SkipGram_Data(Dataset):
    """Map-style dataset over precomputed (center, context) id pairs."""

    def __init__(self, pairs):
        """Store the sequence of ``(center_id, context_id)`` pairs as given."""
        self.pairs = pairs

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as two scalar tensors: center, context."""
        pair = self.pairs[idx]
        center_id, context_id = pair[0], pair[1]
        return torch.tensor(center_id), torch.tensor(context_id)

## Get negative samples

In [17]:
def sample_negative_words(batch_size, vocab_size, num_negatives, device, true_context):
    """Draw random negative context ids for every example in a batch.

    For each of the ``batch_size`` examples, ``num_negatives`` ids are drawn
    uniformly from ``0 .. vocab_size - 1`` with Python's ``random`` module;
    a draw equal to that example's true context id is rejected and redrawn.

    Args:
        batch_size: Number of examples to sample negatives for.
        vocab_size: Vocabulary size; ids are drawn from ``0`` to
            ``vocab_size - 1`` inclusive.
        num_negatives: Number of negative ids to draw per example.
        device: Device on which the returned tensor is created.
        true_context: Tensor holding the true context id of each example
            (one id per example, at least ``batch_size`` of them).

    Returns:
        A 1-D ``torch.long`` tensor of length ``batch_size * num_negatives``.
        The negatives of example ``i`` occupy positions
        ``i * num_negatives`` to ``(i + 1) * num_negatives - 1``.

    Raises:
        ValueError: If negatives are requested but ``vocab_size`` is smaller
            than 2, in which case rejection sampling could never terminate.
    """
    if batch_size > 0 and num_negatives > 0 and vocab_size < 2:
        raise ValueError(
            "vocab_size must be at least 2 to sample a negative that differs "
            f"from the true context word (got vocab_size={vocab_size})"
        )

    # Copy the ids to the host once; calling .item() per example would force
    # a separate device synchronisation for every element of the batch.
    true_ids = true_context.reshape(-1).tolist()

    negative_samples = []
    for example_idx in range(batch_size):
        drawn = 0
        while drawn < num_negatives:
            rand_word = random.randint(0, vocab_size - 1)
            if rand_word != true_ids[example_idx]:
                negative_samples.append(rand_word)
                drawn += 1

    # An explicit dtype keeps the result usable as embedding indices even when
    # the list is empty (torch.tensor([]) would otherwise default to float32).
    return torch.tensor(negative_samples, dtype=torch.long, device=device)

# Model Class

In [18]:
class Word2Vec(nn.Module):
    """Skip-gram model with separate center ("in") and context ("out") embeddings."""

    def __init__(self, vocab_size, embedding_dim):
        """Create two ``vocab_size x embedding_dim`` embedding tables."""
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center_words, context_words):
        """Score each (center, context) pair by the dot product of its vectors.

        Center ids are looked up in ``in_embed`` and context ids in
        ``out_embed``; the element-wise product is summed over dimension 1,
        giving one raw score (logit) per pair.
        """
        center_vecs = self.in_embed(center_words)
        context_vecs = self.out_embed(context_words)
        return (center_vecs * context_vecs).sum(dim=1)

    def get_embeddings(self):
        """Return the center-word embedding matrix as a raw weight tensor."""
        in_weights = self.in_embed.weight
        return in_weights.data

# Initiate dataset

In [19]:
# NOTE(review): this rebinds `dataset`, which earlier held the AG News
# DatasetDict; re-running the corpus cell after this one will fail, so
# consider a distinct name here.
dataset = SkipGram_Data(pairs)
# Reshuffle the pairs every epoch and serve them in batches of 4096.
dataloader = DataLoader(dataset, batch_size=4096, shuffle=True)

# Set device

In [20]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training loop

In [22]:
# Hyperparameters.
embedding_dim = 100  # dimensionality of each word vector
num_negatives = 5  # negative context words sampled per positive pair
vocab_size = len(vocab)  # number of entries in the trained tokenizer vocabulary

# Fresh model and optimizer; the training loop below reads both.
model = Word2Vec(vocab_size, embedding_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [21]:
# Skip-gram training with negative sampling, run for 5 epochs.
for epoch in range(5):
    # Running sum of the per-batch losses for this epoch.
    total_loss = 0
    dataloader_loop = tqdm(dataloader, desc=f"Epoch {epoch+1}", leave=False)

    for center, pos_context in dataloader_loop:
        center = center.to(device)
        pos_context = pos_context.to(device)
        # The last batch of an epoch may be smaller than the configured size.
        batch_size = center.size(0)

        # Observed (center, context) pairs are the positive class (label 1).
        pos_scores = model(center, pos_context)
        pos_labels = torch.ones_like(pos_scores)

        # Randomly drawn context words are the negative class (label 0).
        neg_context = sample_negative_words(batch_size, vocab_size, num_negatives, device, pos_context)
        # Repeat every center id num_negatives times so it lines up with the
        # flat, example-by-example layout of neg_context.
        neg_center = center.repeat_interleave(num_negatives)
        neg_scores = model(neg_center, neg_context)
        neg_labels = torch.zeros_like(neg_scores)

        # One binary cross-entropy over positives and negatives together;
        # the scores are raw logits.
        scores = torch.cat([pos_scores, neg_scores])
        labels = torch.cat([pos_labels, neg_labels])
        loss = F.binary_cross_entropy_with_logits(scores, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        dataloader_loop.set_postfix(loss=loss.item())

    # Note: this reports the sum of the batch losses, not their average.
    print(f"Epoch {epoch+1}: Loss = {total_loss:.4f}")



Epoch 1: Loss = 418.2861




Epoch 2: Loss = 118.0602




Epoch 3: Loss = 77.8220




Epoch 4: Loss = 63.3786


                                                                      

Epoch 5: Loss = 55.4226


