In [40]:
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
from torch.utils.data import DataLoader, TensorDataset
from tabulate import tabulate
from transformers import BertTokenizer
from datasets import load_dataset

from tqdm import tqdm
import random


This is a template of the notebook that you should complete and enrich with your own code.

The first cells are the same as those of the lab on text convolution.

# Data loading


In [2]:
# Load the "train" split of the scikit-learn/imdb dataset.
dataset = load_dataset("scikit-learn/imdb", split="train")
# Print the dataset summary (feature names and number of rows).
print(dataset)

Dataset({
    features: ['review', 'sentiment'],
    num_rows: 50000
})


# Pre-processing / Tokenization

This is a very important step. It may be boring, but it is very important. In this session we will be lazy, but in real life, the time spent on inspecting and cleaning data is never wasted. This is true for text, but also for everything else.



In PyTorch, everything is a tensor. Words are replaced by indices. A sentence is therefore a sequence of indices (long integers). In the first HW, you constructed a `WhiteSpaceTokenizer`. Here we will use an already built tokenizer. It is more appropriate for transformers. It relies on sub-word units and converts everything to lower case. This is not always the best choice, but here it will be sufficient. To quote the documentation, this tokenizer allows you to:
- Tokenize (splitting strings into sub-word token strings), convert token strings to ids and back, and encode/decode (i.e., tokenizing and converting to integers).
- Add new tokens to the vocabulary in a way that is independent of the underlying structure (BPE, SentencePiece…).
- Manage special tokens (like mask, beginning-of-sentence, etc.): adding them, assigning them to attributes in the tokenizer for easy access and making sure they are not split during tokenization.

Here we are going to use the tokenizer from the well known Bert model, that we can directly download.

In [3]:
# Load the pretrained "bert-base-uncased" sub-word tokenizer; do_lower_case=True
# asks it to lower-case the text before tokenizing.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)




In [4]:
def preprocessing_fn(x, tokenizer):
    """Tokenize one review and attach its binary sentiment label.

    Args:
        x: mapping with a "review" text entry and a "sentiment" entry.
        tokenizer: callable applied to the review text; its result must
            expose the token ids under the "input_ids" key.

    Returns:
        The same mapping ``x``, updated in place with:
        - "review_ids": token ids of the review, without special tokens,
          truncated to at most 256 tokens and without padding;
        - "label": 0 when the sentiment is "negative", 1 otherwise.
    """
    tokenizer_options = {
        "add_special_tokens": False,
        "truncation": True,
        "max_length": 256,
        "padding": False,
        "return_attention_mask": False,
    }
    encoded = tokenizer(x["review"], **tokenizer_options)
    x["review_ids"] = encoded["input_ids"]

    if x["sentiment"] == "negative":
        x["label"] = 0
    else:
        x["label"] = 1
    return x


Same cell as in the lab session.

🚧 **TODO** 🚧

Read the documentation about HuggingFace dataset and complete the code below.
You should:
- Shuffle the dataset
- For computational reasons, use only a total of **5000 samples**.
- Tokenize the dataset with the `preprocessing_fn`. (*Hint: use the `Dataset.map` method from HuggingFace*).
- Keep only columns `review_ids` and `label`.
- Make a train/validation split (**80% / 20%**). Call these datasets `train_set` and `valid_set`.


In [5]:
n_samples = 5000  # total number of reviews kept (train + validation together)
seed = 42  # fixed seed so the selected subset and the split are reproducible

# Shuffle into a new variable instead of rebinding `dataset`: with a fixed seed,
# re-running this cell then always yields the same subset.
shuffled_dataset = dataset.shuffle(seed=seed)

# Keep only the first `n_samples` shuffled reviews
splitted_dataset = shuffled_dataset.select(range(n_samples))

# Tokenize every review and build its binary label
splitted_dataset = splitted_dataset.map(
    preprocessing_fn, fn_kwargs={"tokenizer": tokenizer}
)

# Drop the raw text columns; only token ids and labels are needed downstream
splitted_dataset = splitted_dataset.select_columns(["review_ids", "label"])

# 80% / 20% train / validation split (seeded as well)
splitted_dataset = splitted_dataset.train_test_split(test_size=0.2, seed=seed)

document_train_set = splitted_dataset["train"]
document_valid_set = splitted_dataset["test"]

Map: 100%|██████████| 5000/5000 [00:19<00:00, 252.03 examples/s]


In [6]:
def extract_words_contexts(ids_list, radius):
    """Pair every token id of a document with its surrounding context window.

    Args:
        ids_list: list of token ids of one document.
        radius: number of context positions taken on each side of a token.

    Returns:
        A tuple ``(words, contexts)`` of equal length. ``words`` holds the token
        ids in document order and ``contexts[i]`` holds the ``2 * radius`` ids
        around ``words[i]`` (left neighbours first, then right neighbours).
        Positions falling outside the document are filled with 0.
    """
    border = [0] * radius
    padded = border + ids_list + border

    # In `padded`, the token originally at position i sits at index i + radius.
    centers = range(radius, radius + len(ids_list))
    contexts = [
        padded[center - radius:center] + padded[center + 1:center + radius + 1]
        for center in centers
    ]
    words = list(ids_list)
    return words, contexts

In [7]:
def flatten_dataset_to_list(dataset, radius):
    """Collect the (word, context) pairs of every document into two flat lists.

    Args:
        dataset: iterable of examples, each exposing its token ids under
            the "review_ids" key.
        radius: context radius forwarded to ``extract_words_contexts``.

    Returns:
        A tuple ``(word_list, context_list)`` where entry ``i`` of both lists
        describes the same token occurrence; documents are concatenated in
        iteration order.
    """
    all_words, all_contexts = [], []

    for sample in dataset:
        doc_words, doc_contexts = extract_words_contexts(sample["review_ids"], radius)
        all_words += doc_words
        all_contexts += doc_contexts

    return all_words, all_contexts

In [49]:
# Context radius: each word is paired with `radius` ids on its left and on its right.
radius = 5
# Flatten both document sets into aligned lists of word ids and context windows.
word_list_train, context_list_train = flatten_dataset_to_list(document_train_set, radius)
word_list_valid, context_list_valid = flatten_dataset_to_list(document_valid_set, radius)

In [52]:
# Convert the flat Python lists into tensors (one row per word occurrence).
word_list_train_tensor = torch.tensor(word_list_train)
context_list_train_tensor = torch.tensor(context_list_train)
word_list_valid_tensor = torch.tensor(word_list_valid)
context_list_valid_tensor = torch.tensor(context_list_valid)

# Each dataset item is a (word id, context ids) pair.
train_set = TensorDataset(word_list_train_tensor, context_list_train_tensor)
valid_set = TensorDataset(word_list_valid_tensor, context_list_valid_tensor)

In [53]:
# Quick look at the flattened training word ids.
word_list_train_tensor

tensor([ 3100,  1010,  1045,  ...,  1028, 12362,  5553])

In [54]:
def collate_fn(batch, K, vocab_size=None):
    """Collate (word, positive context) pairs and draw random negative contexts.

    Args:
        batch: sequence of ``(word_id, positive_context_ids)`` tensor pairs,
            as produced by the ``TensorDataset`` items.
        K: number of negative ids drawn per positive context id.
        vocab_size: negative ids are drawn from ``range(vocab_size)``. Defaults
            to ``len(tokenizer.vocab)`` of the notebook-level tokenizer.

    Returns:
        dict with:
        - 'word_id': stacked word ids, one per batch item;
        - 'positive_context_ids': stacked context windows;
        - 'negative_context_ids': integer tensor with ``2 * R * K`` ids per
          batch item, where ``R`` is half the context window width.

    Raises:
        ValueError: from ``random.sample`` when ``2 * R * K`` exceeds the
            vocabulary size.
    """
    word_ids, positive_context_ids = zip(*batch)
    word_ids_tensor = torch.stack(word_ids)
    positive_context_ids_tensor = torch.stack(positive_context_ids)
    R = positive_context_ids_tensor.shape[1] // 2

    if vocab_size is None:
        vocab_size = len(tokenizer.vocab)

    # Draw token ids directly: this avoids rebuilding the full vocabulary list
    # and encoding sampled token strings back to ids for every batch.
    # NOTE(review): negatives are uniform over the whole vocabulary, so they may
    # coincide with the target word or one of its true context ids.
    n_negatives = 2 * R * K
    candidate_ids = range(vocab_size)
    negative_context_ids = [
        # sampling without replacement: ids are distinct within one row
        random.sample(candidate_ids, n_negatives)
        for _ in range(len(batch))
    ]
    negative_context_ids_tensor = torch.tensor(negative_context_ids, dtype=torch.long)

    return {
        'word_id': word_ids_tensor,
        'positive_context_ids': positive_context_ids_tensor,
        'negative_context_ids': negative_context_ids_tensor
    }


In [55]:
import functools

batch_size = 128
K = 10  # number of negative ids drawn per positive context id

# The training pairs are stored in document order, so shuffle them: otherwise
# every batch would contain consecutive words of the same review.
train_dataloader = DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=functools.partial(collate_fn, K=K),
)
# Validation order does not matter, so it is left unshuffled.
valid_dataloader = DataLoader(
    valid_set, batch_size=batch_size, collate_fn=functools.partial(collate_fn, K=K)
)
n_valid = len(valid_set)
n_train = len(train_set)

In [56]:
# Assuming you have already defined train_dataloader and valid_dataloader

def print_batch_info(dataloader, max_batches=3):
    """Print R, K and the tensor shapes of the first batches of a dataloader.

    Args:
        dataloader: iterable of batches; each batch is a dict with the keys
            'word_id', 'positive_context_ids' and 'negative_context_ids'.
        max_batches: maximum number of batches to inspect (default 3).

    ``R`` is recovered as half the width of the positive contexts and ``K`` as
    the number of negative ids per positive context id.
    """
    # zip() stops as soon as range() is exhausted, so no extra batch is built
    # only to be thrown away.
    for i, batch in zip(range(max_batches), dataloader):
        R = batch['positive_context_ids'].shape[1]//2
        K = batch['negative_context_ids'].shape[1]//(2*R)
        print(f"Batch {i+1}:")
        print(f"  R: {R}")
        print(f"  K: {K}")
        print(f"  Shape of word_id: {batch['word_id'].shape}")
        print(f"  Shape of positive_context_ids: {batch['positive_context_ids'].shape}")
        print(f"  Shape of negative_context_ids: {batch['negative_context_ids'].shape}")
        print()

# Sanity check: show R, K and the tensor shapes of the first training batches.
print("Train DataLoader:")
print_batch_info(train_dataloader)

Train DataLoader:
Batch 1:
  R: 5
  K: 10
  Shape of word_id: torch.Size([128])
  Shape of positive_context_ids: torch.Size([128, 10])
  Shape of negative_context_ids: torch.Size([128, 100])

Batch 2:
  R: 5
  K: 10
  Shape of word_id: torch.Size([128])
  Shape of positive_context_ids: torch.Size([128, 10])
  Shape of negative_context_ids: torch.Size([128, 100])

Batch 3:
  R: 5
  K: 10
  Shape of word_id: torch.Size([128])
  Shape of positive_context_ids: torch.Size([128, 10])
  Shape of negative_context_ids: torch.Size([128, 100])



# Model

In [27]:
class Word2Vec(nn.Module):
    """Skip-gram style model scoring (word, context) pairs.

    Two separate embedding tables are learned, one for target words and one
    for context words. The score of a pair is the sigmoid of the dot product
    of the two embeddings.
    """

    def __init__(self, vocab_size, embedding_dim):
        """
        Args:
            vocab_size: number of rows of both embedding tables.
            embedding_dim: size of each embedding vector.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.target_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, word_ids, context_ids):
        """Return the probability that each context id is a true context.

        Args:
            word_ids: integer tensor of target word ids, shape ``(B,)``.
            context_ids: integer tensor of context ids, either ``(B,)``
                (one context per word) or ``(B, C)`` (C contexts per word).

        Returns:
            Tensor of sigmoid scores with the same shape as ``context_ids``.
        """
        word_vectors = self.target_embeddings(word_ids)
        context_vectors = self.context_embeddings(context_ids)

        # With several contexts per word, add a singleton axis so each word
        # vector broadcasts against all of its context vectors.
        if context_vectors.dim() == word_vectors.dim() + 1:
            word_vectors = word_vectors.unsqueeze(-2)

        # Reduce over the embedding axis (always the last one).
        dot_products = torch.sum(word_vectors * context_vectors, dim=-1)
        outputs = torch.sigmoid(dot_products)
        return outputs

In [32]:
def _context_loss(model, word_ids, context_ids, target):
    """Binary cross-entropy of every (word, context) pair against ``target``.

    The pairs are flattened so the model is called once with 1-D inputs. The
    result equals the sum, over context columns, of the batch-mean BCE.
    """
    batch_len, n_contexts = context_ids.shape
    # Row-major flattening: each word id is repeated once per context column.
    flat_words = word_ids.repeat_interleave(n_contexts)
    flat_contexts = context_ids.reshape(-1)
    predictions = model(flat_words, flat_contexts)
    labels = torch.full_like(predictions, target)
    return F.binary_cross_entropy(predictions, labels, reduction='sum') / batch_len


def train(model, data_loader, optimizer, device, epochs):
    """Train ``model`` with positive and negative (word, context) pairs.

    Args:
        model: module called as ``model(word_ids, context_ids)`` with 1-D id
            tensors and returning one probability per pair.
        data_loader: iterable of batches; each batch is a dict with the keys
            'word_id', 'positive_context_ids' and 'negative_context_ids'.
        optimizer: optimizer over the parameters of ``model``.
        device: device on which the model and the batches are placed.
        epochs: number of passes over ``data_loader``.

    Positive contexts are trained towards 1 and negative contexts towards 0.
    The average loss per batch is printed after every epoch.
    """
    # The batches are moved to `device`, so the model must live there too.
    # Module.to() modifies the module in place, so an optimizer built earlier
    # keeps pointing at the same parameters.
    model.to(device)
    model.train()
    for epoch in tqdm(range(epochs)):
        total_loss = 0.0
        n_batches = 0
        for batch in data_loader:
            word_ids = batch['word_id'].to(device)
            pos_context_ids = batch['positive_context_ids'].to(device)
            neg_context_ids = batch['negative_context_ids'].to(device)

            optimizer.zero_grad()

            # One forward pass for all positives and one for all negatives,
            # instead of one pass per context column.
            loss = _context_loss(model, word_ids, pos_context_ids, 1.0)
            loss = loss + _context_loss(model, word_ids, neg_context_ids, 0.0)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            n_batches += 1

        # Guard against an empty loader instead of dividing by zero.
        average_loss = total_loss / n_batches if n_batches else float('nan')
        print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {average_loss:.4f}")

In [47]:
# Sanity check: print the number of words in the first training batch only.
for i in train_dataloader:
    print(len(i['word_id']))
    break

128


In [None]:
d = 128  # embedding dimension
vocab_size = len(tokenizer.vocab)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Put the model on the training device before building the optimizer: the
# batches are moved to `device` during training, so the weights must be there too.
model = Word2Vec(vocab_size, d).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = 2
train(model, train_dataloader, optimizer, device, epochs)

In [None]:
# Save the model and optimizer states. The file name must be an f-string,
# otherwise the hyper-parameter placeholders are written literally.
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, f'model_dim-{d}_radius-{radius}_ratio-{K}-batch-{batch_size}-epoch-{epochs}.ckpt')