In [1]:
import shutil
# Delete the local Hugging Face cache directory before the dataset and tokenizer
# are fetched by the later cells. ignore_errors=True keeps this call from raising
# when the removal fails (for example when the directory does not exist).
# NOTE(review): the absolute /root path looks Colab-specific - confirm before
# running this notebook in another environment.
shutil.rmtree("/root/.cache/huggingface", ignore_errors=True)

In [2]:
%%capture

!pip install --upgrade --quiet datasets transformers fsspec

In [3]:
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
from torch.utils.data import DataLoader
from tabulate import tabulate
from datasets import load_dataset
import torch.optim as optim

from tqdm.notebook import tqdm
from transformers import BertTokenizer

In [4]:
# Fetch the "train" split of the "scikit-learn/imdb" dataset from the Hugging Face
# Hub and print its summary (feature names and number of rows).
dataset = load_dataset("scikit-learn/imdb", split="train")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

IMDB%20Dataset.csv:   0%|          | 0.00/66.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset({
    features: ['review', 'sentiment'],
    num_rows: 50000
})


# Pre-processing / Tokenization

In [5]:
# Load the tokenizer of the pretrained "bert-base-uncased" checkpoint;
# do_lower_case=True is requested explicitly.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
# Number of entries in the tokenizer vocabulary; passed as `vocab_size` to the
# classifiers built further down.
VOCSIZE = len(tokenizer.vocab)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
# NOTE(review): this repeats the VOCSIZE assignment already made in the tokenizer
# cell just above; the value is unchanged, so this cell is redundant.
VOCSIZE = len(tokenizer.vocab)

In [7]:
def preprocessing_fn(x, tokenizer, max_length=256):
    """Tokenize one review and attach its binary sentiment label.

    Args:
        x: A single example (dict-like) holding a "review" string and a
            "sentiment" string that is either "negative" or "positive".
        tokenizer: Callable invoked as ``tokenizer(text, **options)`` that
            returns a mapping with an "input_ids" entry.
        max_length: Maximum number of token ids kept per review; longer
            reviews are truncated by the tokenizer. Defaults to 256.

    Returns:
        The same object ``x``, updated in place with two extra entries:
        "review_ids" (the token ids, without special tokens and without
        padding) and "label" (0 for "negative", 1 for "positive").

    Raises:
        ValueError: If the sentiment is neither "negative" nor "positive".
            Previously any unexpected value was silently labelled positive.
    """
    # Resolve the label first so that an invalid example is rejected before
    # `x` is modified.
    sentiment = x["sentiment"]
    if sentiment == "negative":
        label = 0
    elif sentiment == "positive":
        label = 1
    else:
        raise ValueError(
            f"Unexpected sentiment {sentiment!r}; expected 'negative' or 'positive'."
        )

    encoding = tokenizer(
        x["review"],
        add_special_tokens=False,
        truncation=True,
        max_length=max_length,
        padding=False,  # padding is deferred to batching time
        return_attention_mask=False,
    )

    x["review_ids"] = encoding["input_ids"]
    x["label"] = label
    return x

In [8]:
SEED = 42  # fixed seed: the sampled subset and the train/validation split are reproducible
n_samples = 5000  # number of reviews kept in total (training + validation)

# Shuffle first so that the selected subset is a random sample of the corpus.
# The shuffled view gets its own name: `dataset` is left untouched, so re-running
# this cell always produces the same subset.
shuffled_dataset = dataset.shuffle(seed=SEED)

# Keep the first n_samples shuffled reviews (never more than the dataset holds)
split_dataset = shuffled_dataset.select(range(min(n_samples, len(shuffled_dataset))))

# Tokenize the reviews and attach the integer labels
tok_dataset = split_dataset.map(preprocessing_fn, fn_kwargs={"tokenizer": tokenizer})

# Keep only the columns the model consumes
tok_dataset = tok_dataset.select_columns(["review_ids", "label"])

# Hold out 20% of the selected reviews for validation
tok_dataset = tok_dataset.train_test_split(test_size=0.2, seed=SEED)

train_set = tok_dataset["train"]
valid_set = tok_dataset["test"]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [9]:
class DataCollator:
    """Collate tokenized reviews into fixed-length padded tensors.

    Args:
        tokenizer: Object exposing
            ``pad(features, padding=..., max_length=..., return_tensors=...)``
            that returns a mapping with an "input_ids" entry.
        max_length: Length every sequence is padded to. Defaults to 256.
    """

    def __init__(self, tokenizer, max_length=256):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch):
        """Build one mini-batch.

        Args:
            batch: Sequence of examples, each providing "review_ids" (list of
                token ids) and "label" (integer class).

        Returns:
            dict with "review_ids" (the padded "input_ids" produced by
            ``tokenizer.pad``) and "label" (tensor of shape (len(batch), 1)).
        """
        features = [{"input_ids": example["review_ids"]} for example in batch]
        padded = self.tokenizer.pad(
            features,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Labels are returned as a column vector, one row per example.
        labels = torch.tensor([example["label"] for example in batch]).unsqueeze(1)
        return {"review_ids": padded["input_ids"], "label": labels}

In [10]:
data_collator = DataCollator(tokenizer)
batch_size = 64

# Training batches are reshuffled at every epoch so the model does not see the
# exact same batches in the exact same order each time. The generator is seeded
# so that the shuffling order is reproducible when this cell is re-run.
train_dataloader = DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
    collate_fn=data_collator,
)
# The validation set is only evaluated, so its order is left as is.
valid_dataloader = DataLoader(
    valid_set, batch_size=batch_size, collate_fn=data_collator
)

n_valid = len(valid_set)
n_train = len(train_set)

# Convolutional model with one layer

In [11]:
class Conv1dClassifier(nn.Module):
    """One-layer 1D-convolutional text classifier.

    Pipeline: token ids -> embeddings -> Conv1d along the token axis ->
    max over all positions -> dropout -> linear layer.

    The forward pass returns raw, unnormalized class scores (logits), not
    probabilities. This is the input ``nn.CrossEntropyLoss`` expects; apply
    ``softmax(dim=1)`` to the output when probabilities are needed.

    Args:
        vocab_size: Number of tokens in the vocabulary to embed.
        embedding_dim: Size of each token vector.
        feature_size: Number of convolution filters (output channels).
        kernel_size: Width of the convolution window, in tokens.
        dropout: Dropout probability applied to the pooled features.
            Defaults to 0.5.
        num_classes: Number of output classes. Defaults to 2.
    """

    def __init__(
        self,
        vocab_size,
        embedding_dim,
        feature_size=100,
        kernel_size=3,
        dropout=0.5,
        num_classes=2,
    ):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.feature_size = feature_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # With this padding an odd kernel_size keeps the sequence length
        # unchanged; an even kernel_size shortens it by one position.
        self.conv = nn.Conv1d(
            in_channels=embedding_dim,
            out_channels=feature_size,
            kernel_size=kernel_size,
            padding=(kernel_size - 1) // 2,
            stride=1,
        )

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(feature_size, num_classes)

    def forward(self, input_ids):
        """Compute class logits for a batch of token-id sequences.

        Args:
            input_ids: Integer tensor of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalized logits.
        """
        # (batch, seq_len) -> (batch, seq_len, embedding_dim)
        embedded = self.embedding(input_ids)

        # Conv1d expects channels first: (batch, embedding_dim, seq_len)
        embedded = embedded.permute(0, 2, 1)

        # (batch, feature_size, conv_len)
        conv_output = self.conv(embedded)

        # Max over every position: one value per filter -> (batch, feature_size)
        pooled = nn.functional.max_pool1d(conv_output, conv_output.size(2))
        pooled = pooled.squeeze(2)

        pooled = self.dropout(pooled)

        return self.fc(pooled)

In [12]:
# Baseline classifier: an embedding table with VOCSIZE rows of 50-dimensional
# vectors; no pre-trained weights are loaded into it here.
model = Conv1dClassifier(vocab_size=VOCSIZE, embedding_dim=50)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [13]:
def training(model, E, train_loader=None, valid_loader=None, lr=0.001, weight_decay=1e-5):
    """Train `model` for `E` epochs, then evaluate it once on validation data.

    The average training loss is printed after every epoch; the validation
    accuracy is printed once, after the last epoch.

    Args:
        model: Module mapping a batch of token ids to class logits.
        E: Number of training epochs.
        train_loader: Iterable of training batches, each a dict with
            "review_ids" and "label". Defaults to the notebook-level
            `train_dataloader`.
        valid_loader: Iterable of validation batches in the same format.
            Defaults to the notebook-level `valid_dataloader`.
        lr: Adam learning rate. Defaults to 0.001.
        weight_decay: Adam weight decay. Defaults to 1e-5.

    Returns:
        float: Validation accuracy in [0, 1] (0.0 when the validation data is
        empty).
    """
    # Fall back to the notebook globals only when no loader is passed in.
    if train_loader is None:
        train_loader = train_dataloader
    if valid_loader is None:
        valid_loader = valid_dataloader

    # Batches are moved to wherever the model's parameters live, so the
    # function does not depend on a global `device` variable.
    run_device = next(model.parameters()).device

    def unpack(batch):
        """Move one batch to the model's device and flatten its labels."""
        inputs = batch["review_ids"].to(run_device)
        labels = batch["label"].to(run_device).view(-1)
        return inputs, labels

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    for e in range(E):
        model.train()

        total_loss = 0.0
        n_batches = 0

        for batch in train_loader:
            inputs, labels = unpack(batch)

            optimizer.zero_grad()
            loss = loss_function(model(inputs), labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            n_batches += 1

        # max(..., 1) avoids a ZeroDivisionError on an empty training loader
        avg_loss = total_loss / max(n_batches, 1)
        print(f"Epoch {e + 1}/{E}:")
        print(f"  Training Loss: {avg_loss:.4f}")

    # Validation pass: runs once, after the final epoch
    model.eval()
    valid_correct_predictions = 0
    valid_total_samples = 0

    with torch.no_grad():
        for batch in valid_loader:
            inputs, labels = unpack(batch)

            # The predicted class is the index of the largest logit
            predictions = model(inputs).argmax(dim=1)

            valid_total_samples += labels.size(0)
            valid_correct_predictions += (predictions == labels).sum().item()

    if valid_total_samples:
        valid_accuracy = valid_correct_predictions / valid_total_samples
    else:
        valid_accuracy = 0.0

    print(f"Validation Accuracy: {valid_accuracy * 100:.2f}%")
    return valid_accuracy

In [14]:
E = 10  # number of training epochs; reused by the word2vec-initialised run further down
# Train the baseline model (embeddings not loaded from a checkpoint)
training(model, E)

Epoch 1/10:
  Training Loss: 0.8195
Epoch 2/10:
  Training Loss: 0.7138
Epoch 3/10:
  Training Loss: 0.6728
Epoch 4/10:
  Training Loss: 0.6441
Epoch 5/10:
  Training Loss: 0.6227
Epoch 6/10:
  Training Loss: 0.6178
Epoch 7/10:
  Training Loss: 0.5933
Epoch 8/10:
  Training Loss: 0.5788
Epoch 9/10:
  Training Loss: 0.5436
Epoch 10/10:
  Training Loss: 0.5284
Validation Accuracy: 76.70%


# Augmented model with Word2Vec


In [15]:
def load_model(conv_model, filename):
    """Initialise `conv_model.embedding` from a saved word-embedding checkpoint.

    Args:
        conv_model: Model exposing an `embedding` layer whose `weight` is
            overwritten.
        filename: Path to a checkpoint readable by ``torch.load``. It must be
            a mapping whose "word_embedding" entry exposes a `.weight` tensor
            with the same shape as `conv_model.embedding.weight`.

    Returns:
        `conv_model` itself, with its embedding weights replaced in place.

    Raises:
        ValueError: If the checkpoint's embedding matrix and the model's
            embedding matrix have different shapes.
    """
    # SECURITY: weights_only=False unpickles arbitrary Python objects and can
    # execute code embedded in the file - only load checkpoints you trust.
    # NOTE(review): it appears to be needed because the checkpoint stores an
    # object with a `.weight` attribute rather than plain tensors - confirm.
    # map_location="cpu" lets a checkpoint saved on a GPU load on a CPU-only
    # runtime.
    checkpoint = torch.load(filename, map_location="cpu", weights_only=False)
    pretrained = checkpoint["word_embedding"].weight.detach()

    target = conv_model.embedding.weight
    if pretrained.shape != target.shape:
        raise ValueError(
            f"Checkpoint embedding shape {tuple(pretrained.shape)} does not match "
            f"model embedding shape {tuple(target.shape)}."
        )

    # Copy in place: the parameter keeps its device and dtype, and no storage
    # is shared with the loaded checkpoint.
    with torch.no_grad():
        target.copy_(pretrained)
    return conv_model


In [16]:
# Same architecture as before, but the embedding layer is overwritten with the
# "word_embedding" weights stored in the checkpoint before training starts.
# Note that `model` is rebound here, so the previously trained model is no
# longer reachable under that name.
model = Conv1dClassifier(vocab_size=VOCSIZE, embedding_dim=50)
# NOTE(review): the checkpoint is looked up relative to the working directory and
# is presumably produced by a separate word2vec training run - confirm it is
# available before running this cell.
filename= "model_dim-50_radius-3_ratio-3-batch-64-epoch-10.ckpt"
model = load_model(model, filename)
# Move the model to the GPU when available (the embedding was just replaced)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Train for the same number of epochs E as the earlier run
training(model, E)

Epoch 1/10:
  Training Loss: 0.6966
Epoch 2/10:
  Training Loss: 0.6626
Epoch 3/10:
  Training Loss: 0.6237
Epoch 4/10:
  Training Loss: 0.5476
Epoch 5/10:
  Training Loss: 0.4734
Epoch 6/10:
  Training Loss: 0.3988
Epoch 7/10:
  Training Loss: 0.3381
Epoch 8/10:
  Training Loss: 0.2763
Epoch 9/10:
  Training Loss: 0.2251
Epoch 10/10:
  Training Loss: 0.1726
Validation Accuracy: 82.10%



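The validation accuracy increased from 76.70% to 82.10% when the embedding layer was initialized with the pre-trained word2vec embeddings instead of random weights, which is a clear improvement. This gain was expected: Yoon Kim's 2014 paper on CNNs for sentence classification ("Convolutional Neural Networks for Sentence Classification") reports that initializing with pre-trained word vectors outperforms random initialization.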