In [None]:
%%capture
!pip install transformers datasets

In [None]:
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tabulate import tabulate
from datasets import load_dataset
import random
from tqdm.notebook import tqdm
from transformers import BertTokenizer
import pandas as pd
import os
from functools import partial

# Data loading


In [None]:
dataset = load_dataset("scikit-learn/imdb", split="train")
print(dataset)

Dataset({
    features: ['review', 'sentiment'],
    num_rows: 50000
})


In [None]:
print(dataset.shape) # we have 50000 texts/sentences and the associated score
print(dataset[1]["review"], "\n", dataset[1]["sentiment"]) # we can have either positive and negative sentiment

(50000, 2)
A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master's of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional 'dream' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell's murals decorating every surface) are terribly w

# Pre-processing / Tokenization

Here we use the tokenizer of the well-known BERT model, which can be downloaded directly.

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)


In [None]:
VOCSIZE = tokenizer.vocab_size # size of the vocabulary

In [None]:
def preprocessing_fn(x, tokenizer):
    """Tokenize one review and attach a binary sentiment label.

    Args:
        x: mapping with at least the keys "review" (text) and "sentiment".
        tokenizer: callable invoked with the review text and keyword options;
            its result must expose the token ids under "input_ids".

    Returns:
        The same mapping `x`, modified in place with two extra keys:
        "review_ids" (the token ids) and "label" (0 when the sentiment is
        exactly "negative", 1 otherwise).
    """
    # Ask for raw token ids only: no special tokens, no padding, no attention
    # mask, and truncation to 256 tokens.
    encoding = tokenizer(
        x["review"],
        add_special_tokens=False,
        truncation=True,
        max_length=256,
        padding=False,
        return_attention_mask=False,
    )
    x["review_ids"] = encoding["input_ids"]
    x["label"] = int(x["sentiment"] != "negative")
    return x


In [None]:
print(preprocessing_fn(dataset[10], tokenizer)) # we are just interested in the columns ("input_ids", "label")
# Basically, each word/token is associated to a certain id inside the vocabulary


{'review': 'Phil the Alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlines.<br /><br />At first it was very odd and pretty funny but as the movie progressed I didn\'t find the jokes or oddness funny anymore.<br /><br />Its a low budget film (thats never a problem in itself), there were some pretty interesting characters, but eventually I just lost interest.<br /><br />I imagine this film would appeal to a stoner who is currently partaking.<br /><br />For something similar but better try "Brother from another planet"', 'sentiment': 'negative', 'review_ids': [6316, 1996, 7344, 2003, 2028, 1997, 2216, 21864, 15952, 3152, 2073, 1996, 17211, 2003, 2241, 2105, 1996, 5976, 2791, 1997, 2673, 2738, 2084, 5025, 8595, 12735, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2012, 2034, 2009, 2001, 2200, 5976, 1998, 3492, 6057, 2021, 2004, 1996, 3185, 12506, 1045, 2134, 1005, 1056, 2424, 1996, 13198, 2030, 5976, 2791, 6057, 

# DATA PREPROCESSING


Same as in the lab session.


- Shuffle the dataset
- For computational reasons, use only a total of **5000 samples**.
- Tokenize the dataset with the `preprocessing_fn`. (*Hint: use the `Dataset.map` method from HuggingFace*).
- Keep only columns `review_ids` and `label`.
- Make a train/validation split (**80% / 20%**). Call these datasets `train_set` and `valid_set`.


- Apply the same preprocessing to get a dataset (with the same tokenizer)
with a train and a validation split, with two columns review_ids (list
of int) and label (int).

In [None]:
n_samples = 100  # total number of reviews kept (train + validation). NOTE(review): the task text above asks for 5000 samples - confirm which value is intended

# We first shuffle the data (fixed seed so the run is reproducible)
dataset_mod = dataset.shuffle(seed = 123) # it is very important to shuffle the dataset in this type of application

# Select n_samples reviews: draw n_samples distinct row indices with a fixed seed
random.seed(12345)
dataset_mod = dataset_mod.select(random.sample(range(1, dataset.num_rows), n_samples)) # NOTE(review): the range starts at 1, so row 0 of the shuffled dataset can never be selected

# Tokenize the dataset (adds the "review_ids" and "label" columns)
dataset_mod = dataset_mod.map(preprocessing_fn, fn_kwargs = {"tokenizer": tokenizer})

# Remove the raw text columns, keeping only what preprocessing_fn added
dataset_mod = dataset_mod.remove_columns(["review", "sentiment"])

# 80% / 20% train / validation split (fixed seed)
dataset_mod = dataset_mod.train_test_split(test_size = 0.2, seed = 123)

document_train_set = dataset_mod["train"]
document_valid_set = dataset_mod["test"]



In [None]:
document_train_set[0]["review_ids"]

[2019,
 4895,
 11263,
 10695,
 2100,
 1010,
 4895,
 13966,
 3861,
 2029,
 2003,
 2019,
 6151,
 6810,
 2099,
 6455,
 2203,
 2000,
 2848,
 19041,
 1005,
 2476,
 1012,
 2009,
 2003,
 1037,
 12063,
 2023,
 3185,
 2001,
 2412,
 2081,
 1012]

In [None]:
# Check that id 0 never appears inside any tokenized review, so that it can be
# used as the padding id. The column is a list of token-id lists, so the
# membership test has to be done per review (testing `0 in <column>` would
# compare the integer 0 against whole lists and always be False).
print(any(0 in ids for ids in document_train_set["review_ids"])) # we can use 0 for padding

False


In [None]:
n_train = len(document_train_set)
n_test = len(document_valid_set)
print("\n", f"The length of the training set is {n_train}","\n", f"The length of the testing set is {n_test}")


 The length of the training set is 80 
 The length of the testing set is 20


In [None]:
print(type(document_train_set[1]["review_ids"]))
print(type(document_train_set[1]["review_ids"][0]))
print(type(document_train_set[1]["label"]))
# We have checked that the columns review_ids is made of lists of int while the label column is made of int.


<class 'list'>
<class 'int'>
<class 'int'>


- Write a *function extract_words_contexts*. It should retrieve all pairs of valid $\left(w, C+\right)$ from a list of ids representing a text document.
It takes the radius *R* as an argument. Its output is therefore two lists:
 - The first one contains the ids of w.
 - The second one contains the list of ids of $C+$, within the local window corresponding to w.

Make sure that every $C+$ has the same size (i.e., contains the same number
of ids).

One simple idea is to add zeros (a sort of padding) each time we are working with a word that does not allow us to take into account all the $2R$ positive context words. For example, if we are working with the first word of a sentence with a radius equal to $R$, we keep the $R$ real ids that follow it and then fill the remaining $R$ positions with zeros. We understand that this is not the most powerful idea.

In [None]:
def extract_words_contexts(R, ids):
  """Build (center word, positive context) pairs for one tokenized document.

  For every position i, the positive context is made of the (up to) R ids
  before position i followed by the (up to) R ids after it. Contexts that are
  shorter than 2*R (near the document borders, or in short documents) are
  right-padded with 0 so that every context has exactly 2*R entries.

  Args:
    R: radius of the local window (non-negative int).
    ids: list of token ids representing one document.

  Returns:
    A pair (w, positive_list) of equal length:
      - w: the center word ids, in document order;
      - positive_list: for each center word, its padded context of 2*R ids.
  """
  w = []
  positive_list = []
  window = 2 * R

  for i, center in enumerate(ids):
    # Clamp the left bound at 0: a negative slice start would wrap around to
    # the end of the list and silently drop part of the left context.
    left = list(ids[max(0, i - R):i])
    # Slicing past the end of the list is safe and simply yields fewer ids.
    right = list(ids[i + 1:i + R + 1])
    context = left + right
    context = context + [0] * (window - len(context)) # padding

    # Exactly one context per word keeps the two lists aligned, including for
    # one-token documents.
    w.append(center)
    positive_list.append(context)

  return w, positive_list



radius = 5
example = extract_words_contexts(R = radius, ids = document_train_set[0]["review_ids"])
print(f"For the id {example[0][2]}, the corresponding positive context with a radius {radius} is {example[1][2]}")


For the id 11263, the corresponding positive context with a radius 5 is [2019, 4895, 10695, 2100, 1010, 4895, 13966, 0, 0, 0]


- Write a function *flatten_dataset_to_list* that applies the function
extract_words_contexts on a whole dataset. Apply the function to your initial *document_train_set* and *document_valid_set*, and get the corresponding flattened lists. Embed these lists in two valid PyTorch Dataset, like in HW 1, call them train_set and valid_set.

We can choose different values for the hyperparameter radius, even if they are quite similar. A high value for the radius parameter is not suitable.

In [None]:
radius1 = 4
radius2 = 6

In [None]:
def flatten_dataset_to_list(data, R):
    """Apply `extract_words_contexts` to every document and concatenate the results.

    Args:
        data: iterable of documents, each exposing its token ids under "review_ids".
        R: context radius forwarded to `extract_words_contexts`.

    Returns:
        A pair (words, positive_context): the center word ids of all documents
        one after the other, and their positive contexts in the same order.
    """
    all_words, all_contexts = [], []
    for document in tqdm(data): # one progress-bar step per document
        doc_words, doc_contexts = extract_words_contexts(R = R, ids = document["review_ids"])
        all_words.extend(doc_words)
        all_contexts.extend(doc_contexts)
    return all_words, all_contexts

# In the literature it has been studied that the optimal value for the radius is between 4 and 5
doc_train = flatten_dataset_to_list(document_train_set, R = radius1) # model with radius = 4
doc_test = flatten_dataset_to_list(document_valid_set, R = radius1)

doc_train2 = flatten_dataset_to_list(document_train_set, R = radius2) # model with radius equal to 6
doc_test2 = flatten_dataset_to_list(document_valid_set, R = radius2)

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
len(doc_train[0]) # train lenght

16044

In [None]:
class IMDBDataset(Dataset):
    """Map-style dataset over two parallel sequences.

    `data[0]` holds the center word ids and `data[1]` the matching positive
    contexts; item `idx` is the pair `(data[0][idx], data[1][idx])`.
    """

    def __init__(self, data: list):
        # Keep a reference to the (words, contexts) pair; nothing is copied.
        self.dataset = data

    def __len__(self):
        # One sample per center word.
        words = self.dataset[0]
        return len(words)

    def __getitem__(self, idx: int):
        words, contexts = self.dataset[0], self.dataset[1]
        return words[idx], contexts[idx]

In [None]:
train = IMDBDataset(doc_train) # train with radius equal to 4
test = IMDBDataset(doc_test)

train2 = IMDBDataset(doc_train2) # train with radius equal to 6
test2 = IMDBDataset(doc_test2)

print(train[1]) # print the second word
print(len(train)) # number of words in the train dataset (not vocabulary)

(4895, [2019, 11263, 10695, 2100, 1010, 0, 0, 0])
16044


- Write a collate_fn function that adds the negative context to the batch. It
should be parametrized by the scaling factor K. The output of collate_fn
should be a Python dictionary, with three keys:
 - word_id,
 - positive_context_ids,
 - negative_context_ids.

Make sure that each value of the dictionary is a valid torch.Tensor.

For the negative context, simply randomly sample from the whole vocabulary set.

We can choose different values for the hyper parameter scale

In [None]:
scale1 = 2
scale2 = 4

In [None]:
def collate_fn(batch, K, R = radius, sizes = VOCSIZE):
  """Collate (word, positive context) samples and add random negative contexts.

  Args:
    batch: sequence of samples; sample[0] is a word id and sample[1] is its
      list of positive context ids.
    K: scale factor; 2*K*R negative ids are drawn for every word.
    R: radius. The default is evaluated once, when this function is defined,
      from the module-level `radius`.
    sizes: vocabulary size; negatives are drawn uniformly from [0, sizes - 1].
      The default is evaluated once from the module-level `VOCSIZE`.

  Returns:
    dict of tensors with the keys "word_id", "positive_context_ids" and
    "negative_context_ids".
  """
  word_id = []
  positive_context_ids = []
  for sample in batch:
    word_id.append(sample[0])
    positive_context_ids.append(sample[1])

  n_negatives = 2*K*R
  neg_context_ids = []
  for center, context in zip(word_id, positive_context_ids):
    sampled = []
    while len(sampled) < n_negatives:
      candidate = random.randint(0, sizes-1)
      # Rejection sampling: keep the draw only if it is neither the center
      # word nor one of the ids of its positive context (padding included).
      if candidate != center and candidate not in context:
        sampled.append(candidate)
    neg_context_ids.append(sampled)

  return {"word_id": torch.tensor(word_id),
          "positive_context_ids": torch.tensor(positive_context_ids),
          "negative_context_ids": torch.tensor(neg_context_ids)}


Now we will define 4 different partial collate functions with different values of the hyperparameters.

In [None]:
partial1 = partial(collate_fn, K = scale1, R = radius1) #scale 2, radius 4
partial2 = partial(collate_fn, K = scale1, R = radius2) #scale 2, radius 6
partial3 = partial(collate_fn, K = scale2, R = radius1) #scale 4, radius 4
partial4 = partial(collate_fn, K = scale2, R = radius2) #scale 4, radius 6

- Wraps everything in a **DataLoader**, like in HW 1.

Our idea is to train $8$ different models in order to make the final ablation study.

In [None]:
batch_size = 256 # we decide to fix this particular batch_size in order to speed up computations
#####
# scale = 2, radius = 4
train_dataloader = DataLoader(
    train, batch_size = batch_size, collate_fn = partial1
)

valid_dataloader = DataLoader(
    test, batch_size = batch_size, collate_fn = partial1
)
n_train = len(train)
n_test = len(test)

print(f"The length of the training set is {n_train}, while the length of the test set is {n_test}")

The length of the training set is 16044, while the length of the test set is 4468


In [None]:
####
# scale = 2, radius = 6
train_dataloader1 = DataLoader(
    train2, batch_size = batch_size, collate_fn = partial2
)

valid_dataloader1 = DataLoader(
    test2, batch_size = batch_size, collate_fn = partial2
)


####
# scale = 4, radius = 4

train_dataloader2 = DataLoader(
    train, batch_size = batch_size, collate_fn = partial3
)

valid_dataloader2= DataLoader(
    test, batch_size = batch_size, collate_fn = partial3
)

####
# scale = 4, radius = 6
train_dataloader3 = DataLoader(
    train2, batch_size = batch_size, collate_fn = partial4
)

valid_dataloader3 = DataLoader(
    test2, batch_size = batch_size, collate_fn = partial4
)

- Make 2 or 3 iterations in the **DataLoader** and print R, K and the
shapes of all the tensors in the batches (let the output be visible).

In [None]:
batch = next(iter(train_dataloader))
print(batch["word_id"].size())
print(batch["positive_context_ids"].size()) # batchsize x 2R
print(batch["negative_context_ids"].size()) # batchsize x 2KR
batch.keys()

# 256 is the batch_size
# 8 is 2 times the radius
# 16 is 2 times the radius times scale

torch.Size([256])
torch.Size([256, 8])
torch.Size([256, 16])


dict_keys(['word_id', 'positive_context_ids', 'negative_context_ids'])

In [None]:
i = 0
for batch in train_dataloader:
  i = i + 1
  print(batch["word_id"].size())
  print(batch["positive_context_ids"].size()) # batchsize x 2R
  print(batch["negative_context_ids"].size())
  if  i == 3:
    break
print(f"The scale k is equal to {scale1} and the radius is {radius1}")

torch.Size([256])
torch.Size([256, 8])
torch.Size([256, 16])
torch.Size([256])
torch.Size([256, 8])
torch.Size([256, 16])
torch.Size([256])
torch.Size([256, 8])
torch.Size([256, 16])
The scale k is equal to 2 and the radius is 4


# MODEL

- Write a model named **Word2Vec** which is a valid torch.nn.Module (i.e.,
write a class that inherits from the torch.nn.Module), and implement the
Word2Vec model. It should be parametrized by the **vocabulary size** and
the **embeddings dimension**. Use the module torch.nn.Embedding.

In [None]:
# General idea of the model

kk = nn.Embedding(num_embeddings = VOCSIZE, embedding_dim = 4)
t1 = kk(batch["word_id"])
t2 = kk(batch["positive_context_ids"])
#kk(batch["negative_context_ids"]).size()
torch.bmm(t2, t1.unsqueeze(-1)).size()
t3 = torch.bmm(t2, t1.unsqueeze(-1)).squeeze(-1)
print(t3)
torch.sigmoid(t3)

tensor([[-0.6378,  3.4517,  1.2378,  ..., -1.4589,  2.5013, -1.4589],
        [ 2.4936,  0.5794, -0.3415,  ...,  1.7300, -1.3169,  1.0765],
        [-1.4200, -0.6472, -1.4589,  ...,  1.5271, -2.3744, -0.5957],
        ...,
        [ 0.4737, -0.1463, -0.5483,  ...,  1.7514,  0.9631,  1.5287],
        [-1.7430, -1.5042, -3.1410,  ...,  0.8789, -2.8912,  0.2947],
        [ 0.1062, -0.2680,  1.7514,  ...,  0.9977,  0.9785, -0.3561]],
       grad_fn=<SqueezeBackward1>)


tensor([[0.3457, 0.9693, 0.7752,  ..., 0.1886, 0.9242, 0.1886],
        [0.9237, 0.6409, 0.4155,  ..., 0.8494, 0.2113, 0.7458],
        [0.1947, 0.3436, 0.1886,  ..., 0.8216, 0.0851, 0.3553],
        ...,
        [0.6163, 0.4635, 0.3663,  ..., 0.8521, 0.7237, 0.8218],
        [0.1489, 0.1818, 0.0414,  ..., 0.7066, 0.0526, 0.5731],
        [0.5265, 0.4334, 0.8521,  ..., 0.7306, 0.7268, 0.4119]],
       grad_fn=<SigmoidBackward0>)

In [None]:
class Word2vec(nn.Module):
  '''
  Word2Vec with negative sampling, written from scratch with PyTorch.

  - vocab_size --> size of the vocabulary
  - embedding_dim --> dimension of the embedding space; words and contexts share
    this dimension but live in two separate embedding tables.

  The forward pass takes a batch dictionary with the keys "word_id",
  "positive_context_ids" and "negative_context_ids". It returns a tuple
  (positive scores, negative scores): the sigmoid of the dot product between
  each word and each of its context ids.

  Note: no padding_idx is configured, so id 0 (used as padding in the contexts)
  is embedded and scored like any other id.
  '''
  def __init__(self, vocab_size, embedding_dim):
    super().__init__()
    self.embedding_dim = embedding_dim
    self.size = vocab_size
    # Word table first, then context table (creation order fixes the random init).
    self.embedding = nn.Embedding(self.size, self.embedding_dim)
    self.embedding_cont = nn.Embedding(self.size, self.embedding_dim)

  def _context_probabilities(self, context):
    # (batch, n_ctx, dim) x (batch, dim, 1) -> (batch, n_ctx, 1) -> (batch, n_ctx)
    scores = torch.bmm(context, self.words).squeeze(-1)
    return torch.sigmoid(scores)

  def forward(self, input):
    # The last embedded batch is kept on the module, as attributes.
    self.words = self.embedding(input["word_id"]).unsqueeze(-1) # batch_size x dim x 1
    self.positive = self.embedding_cont(input["positive_context_ids"]) # batch_size x 2*R x dim
    self.negative = self.embedding_cont(input["negative_context_ids"]) # batch_size x 2*K*R x dim

    positive_probs = self._context_probabilities(self.positive) # batch_size x 2*R
    negative_probs = self._context_probabilities(self.negative) # batch_size x 2*K*R
    return (positive_probs, negative_probs)


In [None]:
dim = [25, 100] # embeddings

We decide to test two different embedding dimensions. So, in the end, we will have $8$ different models.

In [None]:
class Personal_loss(nn.Module):
  '''
  Negative log-likelihood of the Word2vec model.

  For each observation we sum -log(p + epsilon) over the positive context and
  -log(1 - p + epsilon) over the negative context; the per-observation values
  are then summed to give one absolute loss value for the batch.
  Our goal is to minimize this negative log-likelihood.

  epsilon is added inside the logarithms to avoid log(0), which would give an
  infinite loss.

  The scores are passed to the constructor and forward() takes no argument.
  '''

  def __init__(self, positive, negative, epsilon = 0.05):
    super().__init__()
    self.pos = positive # positive-context scores
    self.neg = negative # negative-context scores
    self.epsilon = epsilon

  def forward(self):
    eps = self.epsilon
    # Log-likelihood per observation: sum over the context axis (dim 1).
    pos_log_lik = (self.pos + eps).log().sum(dim = 1)
    neg_log_lik = (1 - self.neg + eps).log().sum(dim = 1)
    per_observation = (-pos_log_lik) - neg_log_lik

    return per_observation.sum()


In [None]:
model99 = Word2vec(vocab_size =  VOCSIZE,
                 embedding_dim = 5) # we choose 5 embeddings dimension by default and the vocsize of the bert tokenizer
out = model99(batch)
print(out[1].size())

Personal_loss(out[0], out[1]).forward().item()

torch.Size([256, 16])


5344.10595703125

- Train the model. The training should be parametrized by the batch size
B, and the number of epochs E.

We choose a number of epochs equal to $3$ because, after an empirical study, the loss seems quite stationary after a few iterations. The training is indirectly parametrized by the batch size because we use batched matrix multiplications instead of for loops. The batch size is fixed to $256$.

Remark:
You will see $8$ different models with different combinations for the parameters.


We define a sort of accuracy:
- we compute the score between a word and the two contexts;
- if the score of a (word, context word) pair is greater than a certain threshold (fixed to 0.3) we assign the label $1$, otherwise $0$;
- words in the positive contexts have label $1$ while words in the negative contexts have label $0$.
This accuracy should state how much the model is able to understand that a word is similar to words in the positive context.

We had also the idea to implement cosine similarity but it was not a proper accuracy score.

In [None]:
# scale = 2, radius = 4 --> train_dataloader, valid_dataloader
# scale = 2, radius = 6 --> train_dataloader1, valid_dataloader1
# scale = 4, radius = 4 --> train_dataloader2, valid_dataloader2
# scale = 4, radius = 6 --> train_dataloader3, valid_dataloader3

n_epochs = 3 # we fix a low number of epochs
def validation(mod, valid, rad, sca, thresh):
  """Print the thresholded accuracy of `mod` over the batches of `valid`.

  A positive-context score counts as correct when it is >= thresh, a
  negative-context score when it is <= thresh.

  Args:
    mod: model called on each batch; out[0] are the positive scores and
      out[1] the negative scores.
    valid: iterable of batches (dicts with at least the key "word_id").
    rad: radius used to build the batches.
    sca: scale factor used to build the batches.
    thresh: decision threshold applied to the scores.

  The denominator is computed from `rad` and `sca`, so they must match the
  values used by the collate function of `valid`.
  """
  # Each word comes with 2*rad positive and 2*rad*sca negative scores.
  scores_per_word = 2*rad + 2*rad*sca
  number, correct = 0, 0
  mod.eval()
  with torch.no_grad():

    for batch in tqdm(valid):
      number += len(batch["word_id"]) * scores_per_word
      out = mod(batch)
      pos, neg = out[0], out[1]
      positive_hits = (pos >= thresh).sum().item()
      negative_hits = (neg <= thresh).sum().item()
      correct += positive_hits + negative_hits

    print(f"The validation accuracy is {correct/number}")




def training(mod, train, epochs, rad, sca, thresh, valid):
  """Train `mod` with Adam (lr = 0.001) and report loss/accuracy after each epoch.

  Args:
    mod: model called on each batch; out[0] are the positive scores and
      out[1] the negative scores.
    train: iterable of training batches (dicts with at least the key "word_id").
    epochs: number of passes over `train`.
    rad: radius used to build the batches.
    sca: scale factor used to build the batches.
    thresh: decision threshold used for the accuracy.
    valid: iterable of validation batches, evaluated once after the last epoch.

  The printed loss is the sum of the batch losses divided by the number of
  scored (word, context id) pairs. The accuracy uses the scores computed
  before the optimizer step of each batch.
  """
  optimizer = torch.optim.Adam(mod.parameters(), lr = 0.001)
  # Each word comes with 2*rad positive and 2*rad*sca negative scores.
  scores_per_word = 2*rad + 2*rad*sca

  for _ in range(epochs):
    mod.train()
    running_loss, number, correct = 0, 0, 0

    for batch in tqdm(train): # the batch size is whatever the loader was built with
      number += len(batch["word_id"]) * scores_per_word
      optimizer.zero_grad()
      out = mod(batch)
      pos, neg = out[0], out[1]
      loss_v = Personal_loss(pos, neg).forward()
      loss_v.backward()
      optimizer.step()
      running_loss += loss_v.item()
      positive_hits = (pos >= thresh).sum().item()
      negative_hits = (neg <= thresh).sum().item()
      correct += positive_hits + negative_hits

    print(f"Loss:{running_loss/number}")
    print(f"Accuracy:{correct/number}")

  # Validation is run a single time, once all epochs are done.
  validation(mod, valid, rad, sca, thresh)

In [None]:
model = Word2vec(vocab_size =  VOCSIZE,
                 embedding_dim = dim[0]) # we choose 25 embeddings dimension by default and the vocsize of the bert tokenizer
training1 = training(mod = model, train = train_dataloader, epochs = n_epochs, rad = radius1, sca = scale1, thresh = 0.3, valid = valid_dataloader)
# scale = 2, radius = 4, dim = 25


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.1292451794958072
Accuracy:0.47715916645890466


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.113319833825713
Accuracy:0.4829790991440206


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.099597428432082
Accuracy:0.4880199243746364


  0%|          | 0/18 [00:00<?, ?it/s]

The validation accuracy is 0.4872892420173083


In [None]:
model2 = Word2vec(vocab_size =  VOCSIZE,
                 embedding_dim = dim[1]) # we choose 100 embeddings dimension by default and the vocsize of the bert tokenizer
training2 = training(mod = model2, train = train_dataloader, epochs = n_epochs, rad = radius1, sca = scale1, thresh = 0.3, valid = valid_dataloader)
 # scale = 2, radius = 4, dim = 100

  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.2780945780349968
Accuracy:0.493829468960359


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.252189274194046
Accuracy:0.5046408834039724


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.2379448972397846
Accuracy:0.510247860051525


  0%|          | 0/18 [00:00<?, ?it/s]

The validation accuracy is 0.5065745299910475


In [None]:
model3 = Word2vec(vocab_size =  VOCSIZE,
                 embedding_dim = dim[0])
training3 = training(mod = model3, train = train_dataloader1, epochs = n_epochs, rad = radius2, sca = scale1, thresh = 0.3,  valid = valid_dataloader1)
 # scale = 2, radius = 6, dim = 25

  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.1267108991735941
Accuracy:0.47948869774786007


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.1157829829730395
Accuracy:0.48333921992298956


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.0991690203556972
Accuracy:0.4875221612787058


  0%|          | 0/18 [00:00<?, ?it/s]

The validation accuracy is 0.4857256540336218


In [None]:
model4 = Word2vec(vocab_size =  VOCSIZE,
                 embedding_dim = dim[1])
training4 = training(mod = model4, train = train_dataloader1, epochs = n_epochs, rad = radius2, sca = scale1, thresh = 0.3, valid = valid_dataloader1)
 # scale = 2, radius = 6, dim = 100

  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.2784976118598117
Accuracy:0.49455836726779134


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.254935561185689
Accuracy:0.5037431784813984


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.2377446025437426
Accuracy:0.5105110252361561


  0%|          | 0/18 [00:00<?, ?it/s]

The validation accuracy is 0.5073361185715707


In [None]:
model5 = Word2vec(vocab_size =  VOCSIZE,
                 embedding_dim = dim[0])
training5 = training(mod = model5, train = train_dataloader2, epochs = n_epochs, rad = radius1, sca = scale2, thresh = 0.3, valid = valid_dataloader2)
# scale = 4, radius = 4, dim = 25

  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.128256623508983
Accuracy:0.4611739591124408


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.1188320725792158
Accuracy:0.4620091623036649


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.1086524737026886
Accuracy:0.4638977187733732


  0%|          | 0/18 [00:00<?, ?it/s]

The validation accuracy is 0.4602338854073411


In [None]:
model6 = Word2vec(vocab_size =  VOCSIZE,
                 embedding_dim = dim[1])
# Store the result under its own name instead of overwriting `training5`
# (training() only prints its metrics; the stored value is None).
training6 = training(mod = model6, train = train_dataloader2, epochs = n_epochs, rad = radius1, sca = scale2, thresh = 0.3, valid = valid_dataloader2)
# scale = 4, radius = 4, dim = 100

  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.2812321231359853
Accuracy:0.48346110695587136


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.2654666524966498
Accuracy:0.4894571179257043


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.251496943890142
Accuracy:0.49397438294689605


  0%|          | 0/18 [00:00<?, ?it/s]

The validation accuracy is 0.49120971351835274


In [None]:
model7 = Word2vec(vocab_size =  VOCSIZE,
                 embedding_dim = dim[0])
training7 = training(mod = model7, train = train_dataloader3, epochs = n_epochs, rad = radius2, sca = scale2, thresh = 0.3, valid = valid_dataloader3)
# scale = 4, radius = 6, dim = 25

  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.1211727228880344
Accuracy:0.4564437380536857


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.1101116613151722
Accuracy:0.4594645973572675


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.0990352793246307
Accuracy:0.4609438627108784


  0%|          | 0/18 [00:00<?, ?it/s]

The validation accuracy is 0.46243285586392124


In [None]:
model8 = Word2vec(vocab_size =  VOCSIZE,
                 embedding_dim = dim[1])
training8 = training(mod = model8, train = train_dataloader3, epochs = n_epochs, rad = radius2, sca = scale2, thresh = 0.3, valid = valid_dataloader3)
# scale = 4, radius = 6, dim = 100

  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.2864984593961657
Accuracy:0.4822810188647885


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.2699149715804325
Accuracy:0.48786358347876674


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.2514649441818204
Accuracy:0.4943042051026344


  0%|          | 0/18 [00:00<?, ?it/s]

The validation accuracy is 0.491819606087735


Although all the models have a good performance in testing, we cannot say that those models are good enough because this is just a **vanilla accuracy**. This could also depend on the fact that we have few samples in the validation set.

We can extract the embeddings using this line of code.

In [None]:
embeddingss = model2.embedding # embeddings extraction
embeddingss

Embedding(30522, 100)

- Write a function save_model that saves the model’s embeddings in a file.
The file name should be formated like:

  *model_dim-<d>_radius-<R>_ratio-<K>-batch-<B>-epoch-<E>.ckpt*

In [None]:
data_dir = "/content"

In [None]:
def save_model(model, d, R, K, B, E):
  """Save `model.embedding` to a file under `data_dir`.

  The file name encodes the hyperparameters:
  model_dim-<d>_radius-<R>_ratio-<K>-batch-<B>-epoch-<E>.ckpt

  Args:
    model: model exposing its embedding table as `model.embedding`.
    d: embeddings dimension.
    R: radius.
    K: scale.
    B: batch_size.
    E: number of epochs.
  """
  name = f"model_dim-{d}_radius-{R}_ratio-{K}-batch-{B}-epoch-{E}.ckpt"
  target = os.path.join(data_dir, name)

  # The embedding object itself is handed to torch.save.
  torch.save(model.embedding, target)

In [None]:
# DIM = 25
save_model(model, d = dim[0], R = radius1, K = scale1, B = batch_size, E = n_epochs)  # scale = 2. radius = 4
save_model(model3, d = dim[0], R = radius2, K = scale1, B = batch_size, E = n_epochs)  # scale = 2, radius = 6
save_model(model5, d = dim[0], R = radius1, K = scale2, B = batch_size, E = n_epochs) # scale = 4, radius = 4
save_model(model7, d = dim[0], R = radius2, K = scale2, B = batch_size, E = n_epochs) # scale = 4, radius = 6

# DIM = 100
save_model(model2, d = dim[1], R = radius1, K = scale1, B = batch_size, E = n_epochs)  # scale = 2. radius = 4
save_model(model4, d = dim[1], R = radius2, K = scale1, B = batch_size, E = n_epochs)  # scale = 2, radius = 6
save_model(model6, d = dim[1], R = radius1, K = scale2, B = batch_size, E = n_epochs) # scale = 4, radius = 4
save_model(model8, d = dim[1], R = radius2, K = scale2, B = batch_size, E = n_epochs) # scale = 4, radius = 6

# Training a model on a larger dataset

After finding good parameters on the classification, we can train our best model on more data

In [None]:
n_samples_2 = 200  # total number of reviews kept (train + validation) for the larger run

# We first shuffle the data (fixed seed so the run is reproducible)
dataset_mod_2 = dataset.shuffle(seed = 123) # it is very important to shuffle the dataset in this type of application

# Select n_samples_2 reviews: draw n_samples_2 distinct row indices with a fixed seed
random.seed(12345)
dataset_mod_2 = dataset_mod_2.select(random.sample(range(1, dataset.num_rows), n_samples_2)) # NOTE(review): the range starts at 1, so row 0 of the shuffled dataset can never be selected

# Tokenize the dataset (adds the "review_ids" and "label" columns)
dataset_mod_2 = dataset_mod_2.map(preprocessing_fn, fn_kwargs = {"tokenizer": tokenizer})

# Remove the raw text columns, keeping only what preprocessing_fn added
dataset_mod_2 = dataset_mod_2.remove_columns(["review", "sentiment"])

# 80% / 20% train / validation split (fixed seed)
dataset_mod_2 = dataset_mod_2.train_test_split(test_size = 0.2, seed = 123)

document_train_set_2 = dataset_mod_2["train"]
document_valid_set_2 = dataset_mod_2["test"]



Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
n_train_2 = len(document_train_set_2)
n_test_2 = len(document_valid_set_2)

# In the literature it has been studied that the optimal value for the radius is between 4 and 5
doc_train_2 = flatten_dataset_to_list(document_train_set_2, R = radius1) # model with radius = 4
doc_test_2 = flatten_dataset_to_list(document_valid_set_2, R = radius1)

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

In [None]:
train_2 = IMDBDataset(doc_train_2) # train with radius equal to 4
test_2 = IMDBDataset(doc_test_2)

partial1_2 = partial(collate_fn, K = scale1, R = radius1) #scale 2, radius 4

In [None]:
# batch_size = 256
# scale = 2, radius = 4
# The loaders must wrap the larger datasets (train_2 / test_2): wrapping
# `train` / `test` would silently train the "best" model on the small subset.
train_dataloader_2 = DataLoader(
    train_2, batch_size = batch_size, collate_fn = partial1_2
)

valid_dataloader_2 = DataLoader(
    test_2, batch_size = batch_size, collate_fn = partial1_2
)
n_train_2 = len(train_2)
n_test_2 = len(test_2)

print(f"The length of the training set is {n_train_2}, while the length of the test set is {n_test_2}")

The length of the training set is 32298, while the length of the test set is 8320


In [None]:
best_model = Word2vec(vocab_size =  VOCSIZE,
                 embedding_dim = dim[1]) # we choose 100 embeddings dimension and the vocsize of the bert tokenizer
n_epochs_2 = 3
training1_2 = training(mod = best_model, train = train_dataloader_2, epochs = n_epochs_2, rad = radius1, sca = scale1, thresh = 0.3, valid = valid_dataloader_2)
# scale = 2, radius = 4, dim = 100

  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.2814173026739941
Accuracy:0.49335681043796226


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.258365287137212
Accuracy:0.5028307570846838


  0%|          | 0/63 [00:00<?, ?it/s]

Loss:1.2435426701732994
Accuracy:0.5092557965594615


  0%|          | 0/18 [00:00<?, ?it/s]

The validation accuracy is 0.5033758579528499


In [None]:
def save_best_model(model, d, R, K, B, E):
  """Save `model.embedding` to a file under `data_dir`, with a "best_" prefix.

  The file name encodes the hyperparameters:
  best_model_dim-<d>_radius-<R>_ratio-<K>-batch-<B>-epoch-<E>.ckpt

  Args:
    model: model exposing its embedding table as `model.embedding`.
    d: embeddings dimension.
    R: radius.
    K: scale.
    B: batch_size.
    E: number of epochs.
  """
  name = f"best_model_dim-{d}_radius-{R}_ratio-{K}-batch-{B}-epoch-{E}.ckpt"
  target = os.path.join(data_dir, name)

  # The embedding object itself is handed to torch.save.
  torch.save(model.embedding, target)
save_best_model(best_model, d = dim[1], R = radius1, K = scale1, B = batch_size, E = n_epochs_2)  # scale = 2. radius = 4