# Iterated BERT with probs given as text

## Installation

In [18]:
# %pip install --upgrade pip  # upgrade pip in the kernel's environment

In [19]:
# %pip install -U scikit-learn
# %pip install -U transformers
# %pip install -U datasets
# %pip install ipywidgets

To get tqdm progress bars rendered as widgets in JupyterLab, run the following in a terminal:
1. `conda install -c conda-forge nodejs`
2. `jupyter labextension install @jupyter-widgets/jupyterlab-manager`
3. `jupyter nbextension enable --py widgetsnbextension`
4. `jupyter lab clean`
5. Refresh the JupyterLab page in the browser.

In [20]:
# Check installed versions
# import sklearn
# import transformers
# import datasets

# print("Current versions:")
# print(sklearn.__version__)
# print(datasets.__version__)
# print(transformers.__version__)

## Libraries

In [21]:
import os
import sys
sys.path.insert(0, os.path.abspath(".."))

import pickle
import random
import shutil
import time

import numpy as np
import torch

from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

import src.model as mod
from src.train import *

In [22]:
# Enable IPython's autoreload extension; mode 2 re-imports modified modules
# before each cell runs, so edits to the project's `src` package are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Device and seeds

In [23]:
# Compute device: use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Reproducibility: seed NumPy, PyTorch and Python's `random` with one value.
# torch.manual_seed seeds PyTorch's default generator for all devices, so no
# separate CUDA seeding call is made here.
seed = 42 # 1979
np.random.seed(seed)
torch.manual_seed(seed)
random.seed(seed)

## Parameters

In [24]:
# Dataset selection
dataset_name = 'imdb'  # 'trec'
sort = False           # True; forwarded as `sort=` to load_and_tokenize_dataset

# Model
model_name = 'bert-base-uncased'
batch_size = 256
pooling = 'mean'      # 'mean', 'mean_std', 'cls', 'mean_cls', 'mean_std_cls'.
mode = 'bert_only'    # 'default', 'bert_only', 'tfidf_only'

# Results: one pickle file and one cache directory per dataset.
# NOTE(review): results_folder is an absolute, machine-specific path; adjust it
# before running on another host.
results_folder = "/raid/home/jeremiec/Data/TextClassification"
results_file = os.path.join(results_folder, dataset_name) + '.pkl'
cache_dir = os.path.join(results_folder, 'cache_dir_' + dataset_name + '/')

# Start from an empty cache directory on every run.
# shutil.rmtree is used instead of shelling out to `rm -rf`: no shell is
# involved (so spaces or metacharacters in the path are harmless) and a failed
# deletion raises instead of being silently ignored.
if os.path.exists(cache_dir):
    shutil.rmtree(cache_dir)

## Load and tokenize data

**Overview of the datasets**
1. Sentiment analysis
    - ``IMDB``
    - ``Yelp P.``
    - ``Yelp F.``
2. Question classification
    - ``TREC``
    - ``Yahoo! Answers``
3. Topic detection
    - ``AG News``
    - ``DBPedia``

In [25]:
# Load the dataset selected in the parameters cell and tokenize it.
#
# Dataset identifiers that have been tried with this call (pass one of them as
# `dataset_name`):
#   Sentiment: 'imdb', 'yelp_polarity' (XXX CUDA OoM), 'yelp_review_full' (XXX CUDA OoM)
#   Question:  'trec', 'yahoo_answers_topics' (XXX)
#   Topic:     'ag_news', 'dbpedia_14' (XXX)
#
# The call also returns a model name, which rebinds `model_name`.
dataset, tokenizer, model_name = load_and_tokenize_dataset(
    dataset_name,
    model_name=model_name,
    sort=sort,
    cache_dir=cache_dir,
)

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /raid/home/jeremiec/Data/TextClassification/cache_dir_imdb/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /raid/home/jeremiec/Data/TextClassification/cache_dir_imdb/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [27]:
# Display the tokenized dataset: its splits, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask', 'length'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask', 'length'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask', 'length'],
        num_rows: 50000
    })
})

In [32]:
# Convert the tokenized dataset with the project's tensorize_dataset helper.
# `dataset` is rebound to the result, so re-running this cell passes the
# already-converted object back in.
# NOTE(review): confirm the helper tolerates being applied twice.
dataset = mod.tensorize_dataset(dataset)

In [33]:
# Display the dataset again to see what tensorize_dataset changed.
dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'length'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask', 'length'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask', 'length'],
        num_rows: 50000
    })
})

## Model

In [34]:
# Build the project's BertTFIDF model from the settings in the parameters cell
# (model checkpoint name, pooling strategy, mode) on the selected device.
model = mod.BertTFIDF(
    model_name=model_name,
    pooling=pooling,
    mode=mode,
    device=device,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


## Training loop 1

In [35]:
# Processing data: turn the training split into the (X_train, y_train) pair
# that is later passed to the classifier's fit().
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() can jump if the system clock is adjusted.
t0 = time.perf_counter()

X_train, y_train = process_dataset(dataset['train'], 
                                   model, 
                                   tokenizer, 
                                   device, 
                                   batch_size)

t1 = time.perf_counter()

# Elapsed seconds for the processing step above. Despite the name, this does
# not include fitting the classifier, which is timed separately.
training_time = t1 - t0

  0%|          | 0/98 [00:00<?, ?it/s]

In [36]:
# Fitting the model: train a ridge classifier on the processed training data.
# Note: if different alpha are tested, insert a loop here
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() can jump if the system clock is adjusted.
t0 = time.perf_counter()

learning_algo = RidgeClassifier(alpha=1.0)
learning_algo.fit(X_train, y_train)

t1 = time.perf_counter()

# Elapsed seconds spent in fit() only.
fitting_time = t1 - t0

## Results

In [38]:
# Get labels and predictions for the train and test splits from the fitted
# classifier (mode='train_test'). Note that `y_train` is rebound here to the
# value returned by predict().
y_train, y_train_preds, y_test, y_test_preds = predict(
    learning_algo,
    dataset,
    model,
    tokenizer,
    device,
    batch_size,
    mode='train_test',
)

  0%|          | 0/98 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

In [39]:
# Train results: scikit-learn classification report (precision, recall, F1,
# support, accuracy) on the training split, printed with 4 decimals.
print(classification_report(y_train, y_train_preds, digits=4))

              precision    recall  f1-score   support

         0.0     0.8962    0.8996    0.8979     12500
         1.0     0.8992    0.8958    0.8975     12500

    accuracy                         0.8977     25000
   macro avg     0.8977    0.8977    0.8977     25000
weighted avg     0.8977    0.8977    0.8977     25000



In [40]:
# Test results: scikit-learn classification report (precision, recall, F1,
# support, accuracy) on the test split, printed with 4 decimals.
print(classification_report(y_test, y_test_preds, digits=4))

              precision    recall  f1-score   support

         0.0     0.8813    0.8876    0.8845     12500
         1.0     0.8868    0.8805    0.8836     12500

    accuracy                         0.8840     25000
   macro avg     0.8841    0.8840    0.8840     25000
weighted avg     0.8841    0.8840    0.8840     25000



In [41]:
# Release GPU memory that PyTorch's caching allocator holds but no longer uses.
# Memory of tensors that are still referenced is not released by this call;
# uncomment the `del` below first if the objects are no longer needed.
# del dataset, tokenizer, model, learning_algo
torch.cuda.empty_cache()