# BERT with TF-IDF features

## Installation

In [1]:
# !pip install --upgrade pip # upgrade pip

In [2]:
# !pip install -U scikit-learn
# !pip install -U transformers
# !pip install -U datasets
# !pip install ipywidgets

For tqdm progress bars (on a terminal):
1. `conda install -c conda-forge nodejs`
2. `jupyter labextension install @jupyter-widgets/jupyterlab-manager`
3. `jupyter nbextension enable --py widgetsnbextension`
4. `jupyter lab clean`
5. Refresh web page...

In [3]:
# # Check versions
# import sklearn
# import transformers
# import datasets

# print("Current versions:")
# print(sklearn.__version__)
# print(datasets.__version__)
# print(transformers.__version__)

## Libraries

In [4]:
import os
import sys
sys.path.insert(0, os.path.abspath(".."))

import time
import random
import pickle

import numpy as np
import torch

from sklearn.linear_model import RidgeClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

import src.model as mod
from src.train import *

In [5]:
%load_ext autoreload
%autoreload 2

## Device and seeds

In [6]:
# device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# seeds (torch generator seed missing?)
seed = 1979
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

## Parameters

In [7]:
# Dataset folder & device
dataset_name = 'trec' # 'trec'
sort = True           # True

# Model 
model_name = 'bert-base-uncased'
tfidf_dim = 3000
batch_size = 256
pooling = 'mean'      # 'mean', 'mean_std', cls', 'mean_cls', 'mean_std_cls'.
mode = 'default'      # 'default', 'bert_only', 'tfidf_only'

# Learning algo
alpha = 10

# results
results_folder = "/raid/home/jeremiec/Data/TextClassification"
results_file = os.path.join(results_folder, dataset_name) + '.pkl'
cache_dir = os.path.join(results_folder, 'cache_dir_' + dataset_name + '/')

if os.path.exists(cache_dir):
    os.system("rm -rf " + cache_dir)

In [8]:
# x_tmp = torch.rand(size=(100000, 73500), device=device)

## Load and tokenize data

**Overview of the datasets**
1. Sentiment analysis
    - ``IMDB``
    - ``Yelp. P``
    - ``Yelp. F``
2. Question classification
    - ``TREC``
    - ``Yahoo! Answers``
3. Topic detection
    - ``AG News``
    - ``DBPedia``

In [9]:
# # Sentiment
# # IMDB
# dataset, tokenizer, model_name = load_and_tokenize_dataset('imdb', model_name=model_name, cache_dir=cache_dir)

# # Yelp P. XXX CUDA OoM
# dataset, tokenizer, model_name = load_and_tokenize_dataset('yelp_polarity', model_name=model_name, cache_dir=cache_dir)

# # Yelp F. XXX CUDA OoM
# dataset, tokenizer, model_name = load_and_tokenize_dataset('yelp_review_full', model_name=model_name, cache_dir=cache_dir)


# # Question
# # TREC
# dataset, tokenizer, model_name = load_and_tokenize_dataset('trec', model_name=model_name, cache_dir=cache_dir)

# # Yahoo! Answers # XXX
# dataset, tokenizer, model_name = load_and_tokenize_dataset('yahoo_answers_topics', model_name=model_name, cache_dir=cache_dir)


# # Topic
# # AG NEWS
# dataset, tokenizer, model_name = load_and_tokenize_dataset('ag_news', model_name=model_name, cache_dir=cache_dir)

# # DBPedia # XXX
# dataset, tokenizer, model_name = load_and_tokenize_dataset('dbpedia_14', model_name=model_name, cache_dir=cache_dir)



dataset, tokenizer, model_name = load_and_tokenize_dataset(dataset_name, 
                                                           model_name=model_name, 
                                                           sort=sort,
                                                           cache_dir=cache_dir)

Using custom data configuration default


Downloading and preparing dataset trec/default (download: 350.79 KiB, generated: 403.39 KiB, post-processed: Unknown size, total: 754.18 KiB) to /raid/home/jeremiec/Data/TextClassification/cache_dir_trec/trec/default/1.1.0/751da1ab101b8d297a3d6e9c79ee9b0173ff94c4497b75677b59b61d5467a9b9...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/336k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.4k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/5452 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset trec downloaded and prepared to /raid/home/jeremiec/Data/TextClassification/cache_dir_trec/trec/default/1.1.0/751da1ab101b8d297a3d6e9c79ee9b0173ff94c4497b75677b59b61d5467a9b9. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'label-fine', 'text', 'input_ids', 'token_type_ids', 'attention_mask', 'length'],
        num_rows: 5452
    })
    test: Dataset({
        features: ['labels', 'label-fine', 'text', 'input_ids', 'token_type_ids', 'attention_mask', 'length'],
        num_rows: 500
    })
})

## Padding statistics

In [11]:
len(dataset['train'])

5452

In [12]:
81450 / 5452

14.939471753484959

In [13]:
dataset['train'].set_format(type='torch', columns=['input_ids', 
                                                 'attention_mask', 
                                                 'labels', 
                                                 'length'])

# dataset['train'] = dataset['train'].remove_columns(text_field)

In [14]:
dataloader = torch.utils.data.DataLoader(dataset['train'],
                                     # shuffle=False,
                                     drop_last=False,
                                     batch_size=batch_size,
                                     collate_fn=DataCollatorWithPadding(tokenizer)
                                    )

In [15]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f11c0c701c0>

In [16]:
nb_pad = 0

for b in dataloader:
    nb_pad += (b['input_ids'] == 0).sum().item()

nb_pad

3658

## Model

In [17]:
t0 = time.time()

dataset = mod.get_tfidf_features(dataset, dim=tfidf_dim)

t1 = time.time()

tfidf_time = t1 - t0
print(f"Features computed in {tfidf_time} sec.")

Features computed in 0.9909000396728516 sec.


In [18]:
model = mod.BertTFIDF(model_name=model_name, pooling=pooling, mode=mode, device=device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


## Training loop

In [19]:
# Processing data
t0 = time.time()

X_train, y_train = process_dataset(dataset['train'], model, tokenizer, device, batch_size)

t1 = time.time()

training_time = t1 - t0

  0%|          | 0/22 [00:00<?, ?it/s]

In [20]:
# Fitting the model
# Note: if different alpha are tested, insert a loop here
t0 = time.time()

learning_algo = RidgeClassifier(alpha=alpha)
learning_algo.fit(X_train, y_train)

t1 = time.time()

fitting_time = t1 - t0

In [21]:
# # OLD VERSION

# t0 = time.time()

# learning_algo = train_learning_algo(learning_algo, dataset, model, tokenizer, 
#                                     device, batch_size)

# t1 = time.time()

# training_time = t1 - t0
# print(print(f"Model trained in {training_time} sec."))

## Results

In [22]:
y_test, y_test_preds = predict(learning_algo, dataset, model, tokenizer, 
                               device, batch_size)

  0%|          | 0/2 [00:00<?, ?it/s]

In [23]:
# Results
test_results = classification_report(y_test, y_test_preds, digits=4, output_dict=True)
print(classification_report(y_test, y_test_preds, digits=4))

              precision    recall  f1-score   support

         0.0     0.9259    0.9058    0.9158       138
         1.0     0.8272    0.7128    0.7657        94
         2.0     1.0000    0.7778    0.8750         9
         3.0     0.8857    0.9538    0.9185        65
         4.0     0.8819    0.9912    0.9333       113
         5.0     0.9125    0.9012    0.9068        81

    accuracy                         0.8920       500
   macro avg     0.9055    0.8738    0.8859       500
weighted avg     0.8913    0.8920    0.8897       500



In [24]:
test_results

{'0.0': {'precision': 0.9259259259259259,
  'recall': 0.9057971014492754,
  'f1-score': 0.9157509157509157,
  'support': 138},
 '1.0': {'precision': 0.8271604938271605,
  'recall': 0.7127659574468085,
  'f1-score': 0.7657142857142857,
  'support': 94},
 '2.0': {'precision': 1.0,
  'recall': 0.7777777777777778,
  'f1-score': 0.8750000000000001,
  'support': 9},
 '3.0': {'precision': 0.8857142857142857,
  'recall': 0.9538461538461539,
  'f1-score': 0.9185185185185185,
  'support': 65},
 '4.0': {'precision': 0.8818897637795275,
  'recall': 0.9911504424778761,
  'f1-score': 0.9333333333333333,
  'support': 113},
 '5.0': {'precision': 0.9125,
  'recall': 0.9012345679012346,
  'f1-score': 0.9068322981366459,
  'support': 81},
 'accuracy': 0.892,
 'macro avg': {'precision': 0.9055317448744832,
  'recall': 0.8737620001498545,
  'f1-score': 0.8858582252422832,
  'support': 500},
 'weighted avg': {'precision': 0.8913366721520922,
  'recall': 0.892,
  'f1-score': 0.8896991115004157,
  'support': 

In [25]:
print(f"Total training time: {tfidf_time + training_time + fitting_time} sec.")

Total training time: 7.449293613433838 sec.


In [26]:
# del dataset, tokenizer, model, learning_algo
torch.cuda.empty_cache()

In [27]:
XXX stop here

SyntaxError: invalid syntax (651886343.py, line 1)

In [None]:
# save results

if os.path.exists(results_file):
    with open(results_file, 'rb') as fh:
        results_d = pickle.load(fh)
else:
    results_d = {}

    
key = (pooling, mode, tfidf_dim, alpha, batch_size)
results_d[key] = (test_results, 
                  tfidf_time + training_time + fitting_time, 
                  "pooling - mode - tfidf_dim - alpha - batch_size")

with open(results_file, 'wb') as fh:
    pickle.dump(results_d, fh)

In [None]:
results_d

**Remarks**

- We chose to implement the **standard mean:** ``torch.mean(batch, dim=1)``: <br>
    Instead of a custom mean where we sum the embedded tokens and divide by the sentence length, we implement a standard mean of the batch. After some experiments, this seems to work better.
- Similarly, we chose to implement the **standard std:** ``torch.std(batch, dim=1)``: <br>
    Instead of a custom std involving the sentence length, we implement a standard std of the batch. After some experiments, this seems to work better.
- The **sorting by length** process infuences the results: <br>
For some datasets, the results are better when the data are sorted by length, while for others, the opposite holds true. This phenomenon is probably due to the mean operation applied to the batch, which would yield very different results depending on whether the batch is sorted or not.
- The **batch size** singificantly influences the results: <br>
    Large batches include more padding than small batches. These padded tokens are actually involved in the computation of the mean, and thus influences it. For this reason, the batch size influences the results. The batch size will be a hyperparameter of our model.