In [2]:
import shutil
# Delete the local Hugging Face cache directory so the downloads below start
# from a clean state; ignore_errors=True keeps this from failing when the
# directory cannot be removed (e.g. it does not exist yet).
shutil.rmtree("/root/.cache/huggingface", ignore_errors=True)

In [None]:
%%capture

!pip install --upgrade --quiet datasets transformers fsspec

In [3]:
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tabulate import tabulate
from datasets import load_dataset
import torch.optim as optim

from tqdm.notebook import tqdm
from transformers import BertTokenizer

The first cells are the same as the ones in the text convolution notebook.

# Data loading


In [4]:
# Load the "train" split of the scikit-learn/imdb dataset from the Hugging Face Hub
# (columns: 'review' and 'sentiment', as shown in the printed summary).
dataset = load_dataset("scikit-learn/imdb", split="train")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

IMDB%20Dataset.csv:   0%|          | 0.00/66.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset({
    features: ['review', 'sentiment'],
    num_rows: 50000
})


In [5]:
# Reuse the pretrained "bert-base-uncased" tokenizer (lower-casing enabled) to map text to ids.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
# Number of entries in the tokenizer vocabulary; used later to size the embedding
# tables and as the upper bound when sampling negative context ids.
VOCSIZE = len(tokenizer.vocab)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
def preprocessing_fn(x, tokenizer):
    """Tokenize one review and encode its sentiment as a binary label.

    Two keys are written into ``x`` (the example is modified in place and
    also returned):

    * ``"review_ids"``: the ``"input_ids"`` returned by ``tokenizer`` for
      ``x["review"]``, requested without special tokens, without padding,
      without attention mask, and truncated to at most 256 tokens.
    * ``"label"``: ``0`` when ``x["sentiment"]`` is ``"negative"``, ``1`` otherwise.
    """
    tokenizer_options = dict(
        add_special_tokens=False,
        truncation=True,
        max_length=256,
        padding=False,
        return_attention_mask=False,
    )
    encoding = tokenizer(x["review"], **tokenizer_options)
    x["review_ids"] = encoding["input_ids"]
    # Anything that is not explicitly "negative" is treated as positive.
    x["label"] = int(x["sentiment"] != "negative")
    return x

In [7]:
n_samples = 15000  # number of reviews kept (before the train/validation split)
SEED = 42  # fixed seed so that the shuffle and the split are reproducible across runs

# Shuffle first so that the subset selected below is a random sample of the corpus.
dataset = dataset.shuffle(seed=SEED)

# Keep the first n_samples reviews of the shuffled dataset.
split_dataset = dataset.select(range(n_samples))

# Tokenize the reviews and compute the binary labels.
tok_dataset = split_dataset.map(preprocessing_fn, fn_kwargs={"tokenizer": tokenizer})


# Keep only the columns produced by preprocessing_fn.
tok_dataset = tok_dataset.select_columns(["review_ids", "label"])

# Hold out 20% of the selected reviews for validation.
tok_dataset = tok_dataset.train_test_split(test_size=0.2, seed=SEED)

document_train_set = tok_dataset["train"]
document_valid_set = tok_dataset["test"]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [8]:

# Each word gets a context of exactly 2 * R ids. Away from the borders this is
# the R ids on each side of the word. Near the left (resp. right) border the
# window is shifted so that it starts at the first id (resp. ends at the last
# id): the ids missing on one side are replaced by extra ids on the other side.
import random


def extract_words_contexts(list_of_ids, R):
    """Build (word, positive context) pairs for one tokenized document.

    Args:
        list_of_ids: sequence of token ids of the document.
        R: context radius; every returned context holds exactly ``2 * R`` ids.

    Returns:
        A tuple ``(w, Cp)`` of two lists of equal length: ``w[i]`` is a target
        id and ``Cp[i]`` its list of ``2 * R`` context ids.

    Documents with at most ``2 * R`` ids cannot fill a full window: the
    context is then every other id of the document, padded with randomly
    drawn ids from that same set. A document with fewer than two ids has no
    context at all in that case and yields no pair.
    """
    w = []
    Cp = []
    n = len(list_of_ids)
    window = 2 * R

    if n <= window:
        # The sliding window below needs 2 * R + 1 ids. ``n == 2 * R`` used to
        # fall through to it and index past the end of the list.
        if n < 2:
            return w, Cp  # no other id to use as context
        for i in range(n):
            context_words = [list_of_ids[j] for j in range(n) if j != i]
            missing = window - len(context_words)
            if missing <= len(context_words):
                padding = random.sample(context_words, k=missing)
            else:
                # Not enough distinct ids to sample without replacement
                # (happens when n <= R): draw with replacement instead.
                padding = random.choices(context_words, k=missing)
            w.append(list_of_ids[i])
            Cp.append(context_words + padding)
        return w, Cp

    for i in range(n):
        # Centre the window on i, then clamp it inside [0, n - 1].
        start = min(max(i - R, 0), n - 1 - window)
        w.append(list_of_ids[i])
        Cp.append(
            [list_of_ids[j] for j in range(start, start + window + 1) if j != i]
        )
    return w, Cp

In [9]:
def flatten_dataset_to_list(dataset, R):
    """Concatenate the (word, context) pairs of every document of ``dataset``.

    Each example's ``"review_ids"`` is passed to ``extract_words_contexts``
    with radius ``R``; the per-document results are appended, in dataset
    order, to two flat lists which are returned as ``(words, contexts)``.
    """
    all_words, all_contexts = [], []
    for row in dataset:
        row_words, row_contexts = extract_words_contexts(row["review_ids"], R)
        all_words += row_words
        all_contexts += row_contexts
    return all_words, all_contexts

# Context radius: every word is paired with 2 * R context ids.
R = 3
# Flatten both document sets into word-level (word, context) lists.
train_words_list, train_contexts_list = flatten_dataset_to_list(document_train_set, R)
valid_words_list, valid_contexts_list = flatten_dataset_to_list(document_valid_set, R)

In [10]:
class WordContextDataset(Dataset):
    """Map-style dataset pairing each target word with its positive context.

    ``words[i]`` and ``contexts[i]`` describe the same sample; item ``i`` is
    returned as ``{"word": words[i], "context": contexts[i]}``.
    """

    def __init__(self, words, contexts):
        self.words, self.contexts = words, contexts

    def __len__(self):
        # One sample per target word.
        n_items = len(self.words)
        return n_items

    def __getitem__(self, index):
        word = self.words[index]
        context = self.contexts[index]
        return dict(word=word, context=context)


# Wrap the flattened (word, context) lists so that a DataLoader can batch them.
train_set = WordContextDataset(train_words_list, train_contexts_list)
valid_set = WordContextDataset(valid_words_list, valid_contexts_list)

In [11]:
def collate_fn(K, batch, vocab_size=None):
    """Collate (word, context) samples and draw random negative contexts.

    Args:
        K: number of negative ids drawn per positive context id.
        batch: list of ``{"word": int, "context": list[int]}`` samples whose
            contexts all have the same length ``2 * R``.
        vocab_size: exclusive upper bound of the ids sampled as negatives.
            Defaults to the module-level ``VOCSIZE``.

    Returns:
        A dict of tensors:
        ``"word_id"`` of shape ``(len(batch),)``,
        ``"positive_context_id"`` of shape ``(len(batch), 2 * R)`` and
        ``"negative_context_id"`` of shape ``(len(batch), 2 * K * R)``.

    Negatives are drawn uniformly from ``[0, vocab_size)``; nothing prevents
    a negative id from coinciding with the target word or one of its
    positive context ids.
    """
    if vocab_size is None:
        vocab_size = VOCSIZE

    word_ids = torch.tensor([item["word"] for item in batch])
    positive_context_ids = torch.tensor([item["context"] for item in batch])

    # Each positive context holds 2 * R ids.
    R = positive_context_ids.size(1) // 2

    # torch.randint already returns an int64 tensor: no need to go through a
    # Python list and back.
    negative_context_ids = torch.randint(vocab_size, (len(batch), 2 * K * R))

    return {
        "word_id": word_ids,
        "positive_context_id": positive_context_ids,
        "negative_context_id": negative_context_ids,
    }

In [12]:
K = 3  # number of negative context ids drawn per positive context id

batch_size = 64

train_dataloader = DataLoader(
    train_set, batch_size=batch_size, collate_fn=lambda batch: collate_fn(K, batch)
)

# Bind only valid_dataloader here: the former chained assignment
# (`valid_dataloader = train_dataloader = ...`) also rebound train_dataloader
# to the validation data.
valid_dataloader = DataLoader(
    valid_set, batch_size=batch_size, collate_fn=lambda batch: collate_fn(K, batch)
)

In [13]:
class Word2Vec(nn.Module):
    """Two-table embedding model scoring (word, context) pairs.

    A pair's score is the sigmoid of the dot product between the word's
    vector (from ``word_embeddings``) and the context id's vector (from
    ``context_embeddings``).
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Separate tables: one for target words, one for context words.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, batch_word, batch_context):
        """Return the sigmoid scores of every context id against its word.

        In the training loop ``batch_word`` has shape ``(B,)`` and
        ``batch_context`` has shape ``(B, C)``; the result then has shape
        ``(B, C)``.
        """
        # Insert a broadcast axis so each word vector is multiplied with all
        # of its context vectors.
        target_vectors = self.word_embeddings(batch_word).unsqueeze(1)
        context_vectors = self.context_embeddings(batch_context)
        dot_products = torch.sum(target_vectors * context_vectors, dim=2)
        return torch.sigmoid(dot_products)

In [14]:
# Instantiate the Word2Vec model.

# Same value as computed right after loading the tokenizer; recomputed here
# so the hyperparameters are visible in one place.
VOCSIZE = len(tokenizer.vocab)
# Hyperparameters
d = 50  # the embedding dim
R = 3  # context radius, same value as the one used to build the contexts
K = 3  # negative-to-positive ratio, same value as the one used by the dataloaders


model = Word2Vec(VOCSIZE, d)

In [15]:
def loss_function(positive_scores, negative_scores, eps=1e-8):
    """Negative-sampling loss averaged over every scored pair.

    Args:
        positive_scores: tensor of shape ``(B, P)`` with the sigmoid scores
            of the true (word, context) pairs; pushed towards 1.
        negative_scores: tensor of shape ``(B, N)`` with the sigmoid scores
            of the sampled negative pairs; pushed towards 0.
        eps: small constant added inside both logarithms so that a score
            saturated at 0 (positives) or 1 (negatives) gives a large finite
            loss instead of ``inf``.

    Returns:
        A scalar tensor: minus the mean of ``log(p)`` over positive scores
        and ``log(1 - q)`` over negative scores.
    """
    batch_size, n_positive = positive_scores.shape
    _, n_negative = negative_scores.shape
    # The positive term previously had no epsilon, unlike the negative one.
    positive_log_likelihood = (positive_scores + eps).log().sum()
    negative_log_likelihood = (1 - negative_scores + eps).log().sum()
    total = positive_log_likelihood + negative_log_likelihood
    return -1 * total / (batch_size * (n_positive + n_negative))


# Move the model to its device first, then build the optimizer from the
# parameters it will actually update.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.01)


def training(B, E):
    """Train the global ``model`` on ``train_set`` and print the loss per epoch.

    Args:
        B: batch size.
        E: number of epochs.

    Relies on the module-level ``model``, ``optimizer``, ``device``,
    ``train_set``, ``K``, ``collate_fn`` and ``loss_function``.
    """
    train_dataloader = DataLoader(
        train_set,
        batch_size=B,
        # Samples are stored review by review: without shuffling, a batch
        # only contains consecutive windows of the same text.
        shuffle=True,
        collate_fn=lambda batch: collate_fn(K, batch),
    )

    # The validation cell switches the model to eval mode; make sure a later
    # call to training() runs in training mode again.
    model.train()

    for epoch in range(E):
        total_loss = 0.0
        for batch in train_dataloader:
            word = batch["word_id"].to(device)
            positive_context = batch["positive_context_id"].to(device)
            negative_context = batch["negative_context_id"].to(device)

            optimizer.zero_grad()

            # Score the true contexts and the sampled negatives.
            positive_scores = model(word, positive_context)
            negative_scores = model(word, negative_context)
            loss = loss_function(positive_scores, negative_scores)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        average_loss = total_loss / len(train_dataloader)
        print(f"epoch {epoch + 1}, training loss : {average_loss:.4f}")

In [16]:
# Train the model.

B = 64  # batch size
E = 10  # number of epochs

training(B, E)

epoch 1, training loss : 0.2661
epoch 2, training loss : 0.2141
epoch 3, training loss : 0.2130
epoch 4, training loss : 0.2127
epoch 5, training loss : 0.2128
epoch 6, training loss : 0.2127
epoch 7, training loss : 0.2126
epoch 8, training loss : 0.2125
epoch 9, training loss : 0.2125
epoch 10, training loss : 0.2125


In [17]:
# Validation step: a positive pair counts as correct when its score is >= 0.7,
# a negative pair when its score is <= 0.3. Scores in between count as errors.

correct_predictions = 0
total_examples = 0

model.eval()

valid_dataloader = DataLoader(
    valid_set, batch_size=B, collate_fn=lambda batch: collate_fn(K, batch)
)

with torch.no_grad():
    for batch in valid_dataloader:
        word = batch["word_id"].to(device)
        positive_context = batch["positive_context_id"].to(device)
        negative_context = batch["negative_context_id"].to(device)

        # Forward pass
        positive_scores = model(word, positive_context)
        negative_scores = model(word, negative_context)

        # .item() keeps the running counts as plain Python ints instead of
        # device tensors.
        positive_predictions = (positive_scores >= 0.7).sum().item()
        negative_predictions = (negative_scores <= 0.3).sum().item()
        correct_predictions += positive_predictions + negative_predictions

        # One prediction per scored (word, context) pair. Counting the scores
        # directly avoids overwriting the global `batch_size` setting.
        total_examples += positive_scores.numel() + negative_scores.numel()

accuracy = correct_predictions / total_examples
print(f"Validation accuracy: {100*accuracy:.2f}%")

Validation accuracy: 86.25%


Let's test the model on common words.

In [18]:
def context_score(word_token, context_token):
    """Return the model's score for ``context_token`` as a context of ``word_token``.

    Both tokens are looked up in ``tokenizer.vocab``; the forward pass runs
    without gradient tracking since the score is only inspected.
    """
    word = torch.tensor([tokenizer.vocab[word_token]]).to(device)
    context = torch.tensor([tokenizer.vocab[context_token]]).to(device)
    with torch.no_grad():
        return model(word, context)


model.eval()
print(context_score("movie", "frog"))  # it should be a low score
print(context_score("movie", "actor"))  # it should be a high score
print(context_score("movie", "the"))  # it should be higher than the previous one

tensor([[0.0444]], device='cuda:0', grad_fn=<SigmoidBackward0>)
tensor([[0.6402]], device='cuda:0', grad_fn=<SigmoidBackward0>)
tensor([[0.9982]], device='cuda:0', grad_fn=<SigmoidBackward0>)


In [19]:
# Save the model's embeddings
def save_model(
    word2vec_model,
    embedding_dim,
    radius,
    ratio,
    batch_size,
    epoch,
    output_dir="/content",
):
    """Save both embedding tables of ``word2vec_model`` in one checkpoint file.

    The checkpoint is a dict with the keys ``"word_embedding"`` and
    ``"context_embedding"`` holding ``word2vec_model.word_embeddings`` and
    ``word2vec_model.context_embeddings`` as they are.

    Args:
        word2vec_model: object exposing ``word_embeddings`` and
            ``context_embeddings``.
        embedding_dim, radius, ratio, batch_size, epoch: hyperparameters,
            only used to build the file name.
        output_dir: directory receiving the checkpoint. Defaults to the
            Colab working directory ``/content``; it is created if missing.
    """
    import os  # local import: keeps this cell self-contained

    checkpoint = {
        "word_embedding": word2vec_model.word_embeddings,
        "context_embedding": word2vec_model.context_embeddings,
    }
    filename = f"model_dim-{embedding_dim}_radius-{radius}_ratio-{ratio}-batch-{batch_size}-epoch-{epoch}.ckpt"
    os.makedirs(output_dir, exist_ok=True)
    torch.save(checkpoint, os.path.join(output_dir, filename))


# Persist the trained embeddings; the hyperparameters end up in the checkpoint file name.
save_model(model, d, R, K, B, E)
