# BERT with TF-IDF features

## Installation

In [None]:
# !pip install --upgrade pip # upgrade pip

In [None]:
# !pip install -U scikit-learn
# !pip install -U transformers
# !pip install -U datasets
# !pip install ipywidgets

To enable tqdm progress-bar widgets in JupyterLab, run the following in a terminal:
1. `conda install -c conda-forge nodejs`
2. `jupyter labextension install @jupyter-widgets/jupyterlab-manager`
3. `jupyter nbextension enable --py widgetsnbextension`
4. `jupyter lab clean`
5. Refresh web page...

In [None]:
# # Check versions
# import sklearn
# import transformers
# import datasets
# 
# print("Current versions:")
# print(sklearn.__version__)
# print(datasets.__version__)
# print(transformers.__version__)

## Libraries

In [4]:
import os
import sys
sys.path.insert(0, os.path.abspath(".."))

import time
import random

import numpy as np
import torch

from sklearn.linear_model import RidgeClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

import src.model as mod
from src.train import *

In [5]:
# Enable IPython's autoreload extension; mode 2 reloads modules before each
# cell runs, so edits to the local `src` package are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

## Device and seeds

In [6]:
# Reproducibility: a single seed shared by every RNG used in this notebook.
# torch.manual_seed seeds PyTorch's generators on all devices, so no separate
# CUDA seeding call is required here.
seed = 1979
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Compute device: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## Parameters

In [7]:
# Cache folder handed to the dataset loader (relative to the working directory)
cache_dir = 'cache_dir/'

# Model: pretrained checkpoint name, TF-IDF dimension (passed as `dim` to
# get_tfidf_features) and the batch size given to the train/predict helpers
model_name = 'bert-base-uncased'
tfidf_dim = 4000
batch_size = 128

# Learning algorithm: ridge classifier; `alpha` is its regularization strength
alpha = 10
learning_algo = RidgeClassifier(alpha=alpha)

## Load and tokenize data

In [8]:
# Load and tokenize the 'imdb' dataset via the project helper.
# NOTE: the helper also returns a model name, which rebinds `model_name`.
dataset, tokenizer, model_name = load_and_tokenize_dataset(
    'imdb',
    model_name=model_name,
    cache_dir=cache_dir,
)

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to cache_dir/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to cache_dir/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

## Model

In [9]:
# Compute the TF-IDF features and record how long it takes (wall-clock seconds).
# NOTE: `dataset` is rebound to the helper's return value, so re-running this
# cell feeds the already-processed dataset back in.
t0 = time.time()
dataset = mod.get_tfidf_features(dataset, dim=tfidf_dim)
t1 = time.time()
tfidf_time = t1 - t0

print(f"Features computed in {tfidf_time} sec.")

Features computed in 36.29901456832886 sec.


In [10]:
# Instantiate the project's BertTFIDF model for the chosen checkpoint and device.
model = mod.BertTFIDF(
    model_name=model_name,
    device=device,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


## Training loop

In [11]:
# Fit the learning algorithm through the project helper and time the call
# (wall-clock seconds). The helper's return value replaces `learning_algo`
# (presumably the fitted estimator — confirm in src.train).
t0 = time.time()

learning_algo = train_learning_algo(learning_algo, dataset, model, tokenizer, 
                                    device, batch_size)

t1 = time.time()

training_time = t1 - t0
# Single print: the previous nested print(print(...)) also printed the inner
# call's return value, adding a stray "None" line to the cell output.
print(f"Model trained in {training_time} sec.")

  0%|          | 0/196 [00:00<?, ?it/s]

Model trained in 130.02256894111633 sec.
None


## Results

In [12]:
# Run the trained classifier through the project's predict() helper; it returns
# the reference labels and the predictions (presumably for the test split —
# confirm in src.train).
y_test, y_test_preds = predict(
    learning_algo, dataset, model, tokenizer, device, batch_size
)

  0%|          | 0/196 [00:00<?, ?it/s]

In [13]:
# Human-readable table (4 decimal places) for the notebook output.
print(classification_report(y_test, y_test_preds, digits=4))

# Same metrics as a nested dict, kept for programmatic access.
test_results = classification_report(y_test, y_test_preds, digits=4, output_dict=True)

              precision    recall  f1-score   support

         0.0     0.9431    0.9582    0.9506     12500
         1.0     0.9575    0.9422    0.9498     12500

    accuracy                         0.9502     25000
   macro avg     0.9503    0.9502    0.9502     25000
weighted avg     0.9503    0.9502    0.9502     25000



In [14]:
# Display the metrics dict produced by classification_report(output_dict=True).
test_results

{'0.0': {'precision': 0.9431451295377589,
  'recall': 0.95816,
  'f1-score': 0.9505932775110123,
  'support': 12500},
 '1.0': {'precision': 0.9574831314527275,
  'recall': 0.94224,
  'f1-score': 0.949800411273739,
  'support': 12500},
 'accuracy': 0.9502,
 'macro avg': {'precision': 0.9503141304952432,
  'recall': 0.9501999999999999,
  'f1-score': 0.9501968443923756,
  'support': 25000},
 'weighted avg': {'precision': 0.9503141304952432,
  'recall': 0.9502,
  'f1-score': 0.9501968443923756,
  'support': 25000}}

In [15]:
# TF-IDF feature time plus classifier training time, in wall-clock seconds.
total_time = tfidf_time + training_time
print(f"Total training time: {total_time} sec.")

Total training time: 166.3215835094452 sec.
