# BERT with TF-IDF features

## Installation

In [None]:
# !pip install --upgrade pip # upgrade pip

In [None]:
# !pip install -U huggingface-hub
# !pip install -U scikit-learn
# !pip install -U transformers
# !pip install -U datasets
# !pip install datasets==1.18.1
# !pip install ipywidgets

For tqdm progress bars (on a terminal):
1. `conda install -c conda-forge nodejs`
2. `jupyter labextension install @jupyter-widgets/jupyterlab-manager`
3. `jupyter nbextension enable --py widgetsnbextension`
4. `jupyter lab clean`
5. Refresh web page...

In [3]:
import sklearn
import transformers
import datasets
# import huggingface_hub

In [4]:
print("Current versions:")
print(sklearn.__version__)
print(datasets.__version__)
print(transformers.__version__)
# print(huggingface_hub.__version__)

Current versions:
1.0.2
2.0.0
4.18.0


## Libraries

In [5]:
import os
import sys
# sys.path.insert(0, os.path.abspath(".."))

from tqdm.autonotebook import tqdm

import time

import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

import torch
import torch.nn as nn

from datasets import load_dataset, Dataset, concatenate_datasets

from transformers import AutoTokenizer

from transformers import BertModel, DistilBertModel
from transformers.data.data_collator import DataCollatorWithPadding

In [6]:
%load_ext autoreload
%autoreload 2

## Device and Seed

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [8]:
model_name = 'bert-base-uncased'
cache_dir = 'cache_dir/'

batch_size = 128

tfidf_dim = 3000 # 4000

alpha = 10
learning_algo = RidgeClassifier(alpha=alpha)

## Load and Tokenize Data

In [9]:
# Custom functions for loading and preparing data

def tokenize(sample, tokenizer):
    """Tokenize sample"""
    
    sample = tokenizer(sample['text'], truncation=True, padding=False, return_length=True)
    
    return sample

def load_and_tokenize_dataset(dataset_name, model_name='bert-base-uncased', cache_dir='cache_dir/'):
    """
    Load dataset from the datasets library of HuggingFace.
    Tokenize and sort data by length.
    """
    
    # tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Load dataset
    dataset = load_dataset(dataset_name, cache_dir=cache_dir)
    
    # Rename label column for tokenization purposes
    dataset = dataset.rename_column('label', 'labels')
    
    # Tokenize data
    dataset = dataset.map(lambda x: tokenize(x, tokenizer), batched=True)
    
    # sorting dataset
    for split in dataset.keys():
        dataset[split] = dataset[split].flatten_indices().sort("length")
    
    return dataset, tokenizer

In [10]:
dataset, tokenizer = load_and_tokenize_dataset('imdb', model_name=model_name, cache_dir=cache_dir) 

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to cache_dir/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to cache_dir/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/25 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/25 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/50 [00:00<?, ?ba/s]

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask', 'length'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask', 'length'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask', 'length'],
        num_rows: 50000
    })
})

In [12]:
dataset['train']._indices

MemoryMappedTable
indices: uint64
----
indices: [[340,14303,7796,20312,2809,17429,23421,10699,20570,16088,...,18432,12167,10979,14031,5795,1614,23205,5702,13899,20137],[3945,967,4769,10309,23471,18625,24657,10503,19083,14340,...,1762,21874,3403,2454,10963,21356,12233,2278,1644,24853],[14189,2755,22674,19072,1489,4587,3407,82,4848,18008,...,18499,9148,20490,20257,7327,18187,6109,21717,23407,21115],[4818,8905,16761,10976,11651,8144,1090,3710,13113,9613,...,13201,2202,23876,8790,22138,19887,9913,14505,5285,18855],[12854,12057,17948,11427,10499,18610,15817,13489,2385,15861,...,21291,6377,8057,18801,7733,9446,24643,12931,18094,7619],[14034,22119,20291,19070,16723,17657,3869,14703,18391,23153,...,12799,14271,23774,15793,5783,6446,14893,9100,4545,8879],[16469,804,17649,6141,15554,23508,4138,7022,23529,16649,...,8421,3903,7301,3547,7249,12658,3296,9283,6744,2110],[1892,1103,9798,24346,21949,6594,14182,18580,13214,22456,...,14037,22526,19119,12668,3026,13969,12427,24790,9730,14088],[15006,12231

## Compute TF-IDF features

In [13]:
def get_tfidf_features(dataset, dim=4000):
    """Compute tf-idf features and add it as a new field for the dataset"""

    t0 = time.time()
    
    vectorizer = TfidfVectorizer(max_features=dim)
    vectorizer.fit(dataset['train']['text'])
        
    for split in dataset.keys():
        X_tmp = vectorizer.transform(dataset[split]['text'])
        X_tmp = list(X_tmp.todense())
        X_tmp = [np.asarray(row).reshape(-1) for row in X_tmp]
        
        indices = dataset[split]._indices # ***
        dataset[split]._indices = None    # ***
        dataset[split] = dataset[split].add_column("additional_fts", X_tmp)
        dataset[split]._indices = indices # ***
        
        dataset[split].set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels', 'length', 'additional_fts'])
        dataset[split] = dataset[split].remove_columns("text")
    
    t1 = time.time()
    
    return dataset, t1-t0

In [14]:
dataset, tfidf_time = get_tfidf_features(dataset, dim=tfidf_dim)

In [15]:
dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'additional_fts'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'additional_fts'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'additional_fts'],
        num_rows: 50000
    })
})

In [16]:
tfidf_time

34.571083068847656

In [17]:
dataset['train']['length'] # not sorted!!!

tensor([197, 238, 341,  ..., 174, 129, 294])

In [18]:
dataset['train']._indices

MemoryMappedTable
indices: uint64
----
indices: [[340,14303,7796,20312,2809,17429,23421,10699,20570,16088,...,18432,12167,10979,14031,5795,1614,23205,5702,13899,20137],[3945,967,4769,10309,23471,18625,24657,10503,19083,14340,...,1762,21874,3403,2454,10963,21356,12233,2278,1644,24853],[14189,2755,22674,19072,1489,4587,3407,82,4848,18008,...,18499,9148,20490,20257,7327,18187,6109,21717,23407,21115],[4818,8905,16761,10976,11651,8144,1090,3710,13113,9613,...,13201,2202,23876,8790,22138,19887,9913,14505,5285,18855],[12854,12057,17948,11427,10499,18610,15817,13489,2385,15861,...,21291,6377,8057,18801,7733,9446,24643,12931,18094,7619],[14034,22119,20291,19070,16723,17657,3869,14703,18391,23153,...,12799,14271,23774,15793,5783,6446,14893,9100,4545,8879],[16469,804,17649,6141,15554,23508,4138,7022,23529,16649,...,8421,3903,7301,3547,7249,12658,3296,9283,6744,2110],[1892,1103,9798,24346,21949,6594,14182,18580,13214,22456,...,14037,22526,19119,12668,3026,13969,12427,24790,9730,14088],[15006,12231

## Create dataloaders

In [19]:
def create_dataloaders(dataset, tokenizer, batch_size=256):
    dataloader_d = {}

    for split in ['train', 'test']:
        dataloader_d[split] = torch.utils.data.DataLoader(dataset[split], 
                                                          batch_size=batch_size, 
                                                          collate_fn=DataCollatorWithPadding(tokenizer))
        
    return dataloader_d

In [20]:
dataloader_d = create_dataloaders(dataset, tokenizer, batch_size=batch_size)

In [21]:
dataloader_d

{'train': <torch.utils.data.dataloader.DataLoader at 0x7f74ac035700>,
 'test': <torch.utils.data.dataloader.DataLoader at 0x7f74ac035730>}

In [22]:
for b in dataloader_d['train']:
    break

## Model
- **Embedding layer**

In [23]:
class Embedding(nn.Module):
    """
    Implements an embedding layer.
    """

    def __init__(self, model_name='bert-base-uncased', pooling='mean', device=torch.device('cpu')):
        
        """
        Constructor

        Parameters
        ----------
        model_name : str
            Name of the BERT model and tokenizer.

        Attributes
        ----------
        model_name : str
            Name of the BERT model and tokenizer.
            The list of or possible models is provided here: https://huggingface.co/models
        pooling : str
            Pooling strategy to be applied, either 'mean' or 'cls'.
            For 'mean', the sentence embedding is the mean of the token embeddings.
            For 'cls', the sentence embedding is the embedding of the [CSL] token (as usual in BERT).
        device : torch.device
            GPU is available, CPU otherwise.
        """
        
        super(Embedding, self).__init__()

        self.model_name = model_name
        self.pooling = pooling
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = BertModel.from_pretrained(self.model_name, output_hidden_states=True)
        self.model.to(self.device).eval()
        print('Model downloaded:', model_name)

    def forward(self, batch):
        """
        Embeds a batch of token ids into a 3D tensor.
        If a GPU is available, the embedded batch is computed and put on the GPU.

        Parameters
        ----------
        batch: torch.Tensor
            2D tensor: batch of text to be embedded.
            Each sentence is represented as a vertical sequence of token ids.

        Returns
        -------
        batch_emb : torch.Tensor
            3D tensor (batch size x max sentence length x embedding dim)
            BERT embedding of the batch of texts.
        """
        
        with torch.no_grad():
            
            batch = batch.to(self.device)
            
            # DOES NOT IMPROVE THE RESULTS 
#             # New attention mask with last 1 element - correposnding to [SEP] token - removed.
#             # Accordingly, the mean pooling will not take the embedding of [SEP] into account.
#             last_indices = batch['length'] - 1
#             batch_size = batch['length'].shape[0]
#             indices = torch.tensor([range(batch_size), last_indices]).transpose(0,1)
#             # cf. https://discuss.pytorch.org/t/modify-array-with-list-of-indices/27739
#             batch['attention_mask'][indices[:, 0], indices[:, 1]] = 0
            
            if self.pooling == 'mean':
                
                batch_emb = self.model(batch["input_ids"], batch["attention_mask"])[0]
                # batch_emb = torch.mean(batch_emb, dim=1)
                # batch_emb = batch_emb.transpose(0, 1)
                # batch_emb = batch_emb[:, :, :] # removing CLS and/or SEP does not seem to improve
                batch_emb = torch.sum(batch_emb, dim=1).transpose(0, 1)
                batch_emb = torch.div(batch_emb, batch['length']).transpose(0, 1)
            
            elif self.pooling == 'cls':
            
                batch_emb = self.model(batch["input_ids"], batch["attention_mask"])[1]

            return batch_emb

In [24]:
#embedding = Embedding()
embedding = Embedding(pooling='mean')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


In [25]:
embedding.model_name, embedding.pooling, embedding.device

('bert-base-uncased', 'mean', device(type='cuda'))

In [26]:
outputs = embedding(b)

In [27]:
outputs.shape

torch.Size([128, 768])

- **full model: embedding + feature concaatenation**

In [28]:
class BertTFIDF(nn.Module):
    """
    Impdements BERT + TF-IDF model:
    Concatenate BERT (or similar model) sentence embedding to most relevant TF-IDF features.
    """
    
    def __init__(self, model_name='bert-base-uncased', device=torch.device('cpu')):
        
        super(BertTFIDF, self).__init__()
        
        self.model_name = model_name
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        self.embedding = Embedding(model_name=self.model_name, device=self.device)
        
    def forward(self, batch):
        
        embedded_input = self.embedding(batch)
        additional_fts = batch['additional_fts']
        
        output = torch.cat([embedded_input, additional_fts], dim=1)
        
        return output

In [29]:
b['additional_fts'].shape

torch.Size([128, 3000])

In [30]:
model = BertTFIDF(model_name=model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


In [31]:
X = model(b)

In [32]:
X.shape

torch.Size([128, 3768])

## Training

In [33]:
def process_dataset(dataset, model, batch_size=256):
    """
    Pass a dataset into a model.
    
    Parameters
    ----------
    dataset : datasets.arrow_dataset.Dataset
        Dataset to be processed
    model : __main__.BertTFIDF
        Model instance of the BertTFIDF class
    
    Returns
    -------
    outputs_t, labels_t : torch.Tensor, torch.Tensor
        Tuple of outputs and labels resulting from passing the dataset into the model.
    """
    
    dataloader = torch.utils.data.DataLoader(dataset, 
                                             batch_size=batch_size, 
                                             collate_fn=DataCollatorWithPadding(tokenizer))
    
    outputs_t = torch.Tensor().to(device)
    labels_t = torch.Tensor().to(device)

    for i, batch in tqdm(enumerate(dataloader), total=len(dataloader)):

        batch = batch.to(device)
        outputs = model(batch)
        outputs_t = torch.cat([outputs_t, outputs], dim=0)

        labels = batch['labels']
        labels_t = torch.cat([labels_t, labels], dim=0)
    
    outputs_t = outputs_t.cpu().numpy()
    labels_t = labels_t.cpu().numpy()
    
    return outputs_t, labels_t

In [34]:
# X_train, y_train = process_dataset(dataset['train'], model, batch_size=batch_size)

In [35]:
# X_train.shape

In [36]:
# y_train.shape

In [37]:
def train_learning_algo(learning_algo, dataset, model, batch_size=256):
    """
    Train the learning algorithm associated with the supervised pb (X_train, y_train).
    More specifically, after the train set is passed through the model (EMB + POOL + ADD_TF-IDF), 
    a vector of X_train of text emeddings concatenated with TF-IDF features is obtained.
    Then, the association between X_train and y_train is learned by means of a learning algorithm.
    """
    
    X_train, y_train = process_dataset(dataset['train'], model, batch_size=batch_size)
    
    # fit sklearn learning algo
    learning_algo.fit(X_train, y_train)
    
    return learning_algo

In [38]:
learning_algo = train_learning_algo(learning_algo, dataset, model, batch_size)

  0%|          | 0/196 [00:00<?, ?it/s]

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


In [39]:
def predict(learning_algo, dataset, model, batch_size=256):
    """
    Compute train and test predictions for the dataset.
    """
    
    #X_train, y_train = process_dataset(dataset['train'], model, batch_size=batch_size)
    #y_train_preds = learning_algo.predict(X_train)
    y_train, y_train_preds = None, None
    
    X_test, y_test = process_dataset(dataset['test'], model, batch_size=batch_size)
    y_test_preds = learning_algo.predict(X_test)
    
    return y_train, y_train_preds, y_test, y_test_preds

In [40]:
y_train, y_train_preds, y_test, y_test_preds = predict(learning_algo, dataset, model, batch_size)

  0%|          | 0/196 [00:00<?, ?it/s]

In [41]:
y_test_preds.shape

(25000,)

In [42]:
# Results
print(classification_report(y_test, y_test_preds, digits=4))

              precision    recall  f1-score   support

         0.0     0.8472    0.8678    0.8573     12500
         1.0     0.8645    0.8434    0.8538     12500

    accuracy                         0.8556     25000
   macro avg     0.8558    0.8556    0.8556     25000
weighted avg     0.8558    0.8556    0.8556     25000



In [43]:
# Does reproduce the results!!!

- After many experiments, the batch size seems to influence the results.