# Iterated BERT with probs given as text

## Installation

In [None]:
# !pip install --upgrade pip # upgrade pip

In [None]:
# !pip install -U scikit-learn
# !pip install -U transformers
# !pip install -U datasets
# !pip install ipywidgets

For tqdm progress bars (on a terminal):
1. `conda install -c conda-forge nodejs`
2. `jupyter labextension install @jupyter-widgets/jupyterlab-manager`
3. `jupyter nbextension enable --py widgetsnbextension`
4. `jupyter lab clean`
5. Refresh web page...

In [None]:
# # # Check versions
# import sklearn
# import transformers
# import datasets

# print("Current versions:")
# print(sklearn.__version__)
# print(datasets.__version__)
# print(transformers.__version__)

## Libraries

In [4]:
import os
import sys
sys.path.insert(0, os.path.abspath(".."))

import time
import random
import pickle

import pandas as pd

import numpy as np
import torch

import datasets

from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report

import src.model as mod
from src.train import *

In [5]:
%load_ext autoreload
%autoreload 2

## Device and seeds

In [6]:
# device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# seeds (torch generator seed missing?)
seed = 42 # 1979
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

## Parameters

In [7]:
# Dataset folder & device
dataset_name = 'imdb'  # 'trec'
sort = False           # True

# Model 
model_name = 'bert-base-uncased'
batch_size = 256
pooling = 'mean'      # 'mean', 'mean_std', cls', 'mean_cls', 'mean_std_cls'.
mode = 'bert_only'    # 'default', 'bert_only', 'tfidf_only'

# results
results_folder = "/raid/home/jeremiec/Data/TextClassification"
results_file = os.path.join(results_folder, dataset_name) + '.pkl'
cache_dir = os.path.join(results_folder, 'cache_dir_' + dataset_name + '/')

if os.path.exists(cache_dir):
    os.system("rm -rf " + cache_dir)

## Load and tokenize data

**Overview of the datasets**
1. Sentiment analysis
    - ``IMDB``
    - ``Yelp. P``
    - ``Yelp. F``
2. Question classification
    - ``TREC``
    - ``Yahoo! Answers``
3. Topic detection
    - ``AG News``
    - ``DBPedia``

In [8]:
# # Sentiment
# # IMDB
# dataset, tokenizer, model_name = load_and_tokenize_dataset('imdb', model_name=model_name, cache_dir=cache_dir)

# # Yelp P. XXX CUDA OoM
# dataset, tokenizer, model_name = load_and_tokenize_dataset('yelp_polarity', model_name=model_name, cache_dir=cache_dir)

# # Yelp F. XXX CUDA OoM
# dataset, tokenizer, model_name = load_and_tokenize_dataset('yelp_review_full', model_name=model_name, cache_dir=cache_dir)


# # Question
# # TREC
# dataset, tokenizer, model_name = load_and_tokenize_dataset('trec', model_name=model_name, cache_dir=cache_dir)

# # Yahoo! Answers # XXX
# dataset, tokenizer, model_name = load_and_tokenize_dataset('yahoo_answers_topics', model_name=model_name, cache_dir=cache_dir)


# # Topic
# # AG NEWS
# dataset, tokenizer, model_name = load_and_tokenize_dataset('ag_news', model_name=model_name, cache_dir=cache_dir)

# # DBPedia # XXX
# dataset, tokenizer, model_name = load_and_tokenize_dataset('dbpedia_14', model_name=model_name, cache_dir=cache_dir)


dataset, tokenizer, model_name = load_and_tokenize_dataset(dataset_name, 
                                                           model_name=model_name, 
                                                           sort=sort,
                                                           cache_dir=cache_dir)

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /raid/home/jeremiec/Data/TextClassification/cache_dir_imdb/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /raid/home/jeremiec/Data/TextClassification/cache_dir_imdb/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [9]:
# subseting dataset
dataset['train'] = dataset['train'].shuffle(seed=42).flatten_indices()#.select(range(1000))
dataset['test'] = dataset['test'].shuffle(seed=42).flatten_indices()#.select(range(1000))

Flattening the indices:   0%|          | 0/25 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/25 [00:00<?, ?ba/s]

In [10]:
train_df = pd.DataFrame(dataset['train'])
test_df = pd.DataFrame(dataset['test'])

In [11]:
dataset_train = datasets.Dataset.from_pandas(train_df, split='train')
dataset_test = datasets.Dataset.from_pandas(test_df, split='test')

In [12]:
dataset = datasets.DatasetDict({'train': dataset_train, 'test': dataset_test})

In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask', 'length'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask', 'length'],
        num_rows: 25000
    })
})

## Run BERT + LR (phase 1)

In [14]:
dataset = mod.tensorize_dataset(dataset)

In [15]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask', 'length'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask', 'length'],
        num_rows: 25000
    })
})

In [16]:
model = mod.BertTFIDF(model_name=model_name, 
                      pooling=pooling, 
                      mode=mode, 
                      device=device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


- Training

In [17]:
# Processing data
t0 = time.time()

X_train, y_train = process_dataset(dataset['train'], 
                                   model, 
                                   tokenizer, 
                                   device, 
                                   batch_size)

t1 = time.time()

training_time = t1 - t0

  0%|          | 0/98 [00:00<?, ?it/s]

In [18]:
# Fitting the model
# Note: if different alpha are tested, insert a loop here
t0 = time.time()

learning_algo = LogisticRegression()
learning_algo.fit(X_train, y_train)

t1 = time.time()

fitting_time = t1 - t0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


- Results

In [19]:
y_train, y_train_preds, y_test, y_test_preds = predict(learning_algo, 
                                                       dataset, 
                                                       model, 
                                                       tokenizer, 
                                                       device, 
                                                       batch_size,
                                                       predict_proba=True,
                                                       mode='train_test')

  0%|          | 0/98 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

In [20]:
y_train_preds_final = np.argmax(y_train_preds, axis=1)
y_test_preds_final = np.argmax(y_test_preds, axis=1)

In [21]:
# Train results
print(classification_report(y_train, y_train_preds_final, digits=4))

              precision    recall  f1-score   support

         0.0     0.9011    0.9049    0.9030     12500
         1.0     0.9045    0.9006    0.9026     12500

    accuracy                         0.9028     25000
   macro avg     0.9028    0.9028    0.9028     25000
weighted avg     0.9028    0.9028    0.9028     25000



In [22]:
# Test results
print(classification_report(y_test, y_test_preds_final, digits=4))

              precision    recall  f1-score   support

         0.0     0.8881    0.8925    0.8903     12500
         1.0     0.8920    0.8876    0.8898     12500

    accuracy                         0.8900     25000
   macro avg     0.8900    0.8900    0.8900     25000
weighted avg     0.8900    0.8900    0.8900     25000



In [23]:
# del dataset, tokenizer, model, learning_algo
torch.cuda.empty_cache()

## Update dataset with predictions

In [24]:
list(y_train) == list(dataset['train']['labels'])

True

In [25]:
list(y_test) == list(dataset['test']['labels'])

True

In [26]:
# dataset['train'] = dataset['train'].remove_columns(['prob_0', 'prob_1'])
# dataset['test'] = dataset['test'].remove_columns(['prob_0', 'prob_1'])

for i in range(y_train_preds.shape[1]):
    dataset['train'] = dataset['train'].add_column('prob_'+str(i), y_train_preds[:, i])
    dataset['test'] = dataset['test'].add_column('prob_'+str(i), y_test_preds[:, i])

In [27]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'prob_0', 'prob_1'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'prob_0', 'prob_1'],
        num_rows: 25000
    })
})

In [28]:
train_df = pd.DataFrame(dataset['train'])
train_txt = pd.DataFrame(dataset['train']['text'], columns=['text_old'])
train_df = train_df.drop(['input_ids', 'attention_mask', 'length'], axis=1)

test_df = pd.DataFrame(dataset['test'])
test_txt = pd.DataFrame(dataset['test']['text'], columns=['text_old'])
test_df = test_df.drop(['input_ids', 'attention_mask', 'length'], axis=1)

train_df = pd.concat([train_df, train_txt], axis=1)
test_df = pd.concat([test_df, test_txt], axis=1)

In [29]:
for i in range(y_train_preds.shape[1]):
    
    label_1 = 'prob_'+str(i)
    label_2 = 'prob_'+str(i)+'_str'
    
    train_df = train_df.astype({label_1: 'float32'})
    train_df[label_2] = (train_df[label_1].round(2)*100).astype(int).astype(str)
    
    test_df = test_df.astype({label_1: 'float32'})
    test_df[label_2] = (test_df[label_1].round(2)*100).astype(int).astype(str)
    
train_df = train_df.astype({'labels': 'int32'})
test_df = test_df.astype({'labels': 'int32'})

In [30]:
def create_updated_text_column(row):
    
    txt = "Probabilities: "
    
    for i in range(y_train_preds.shape[1]):
    
        label = 'prob_'+str(i)+'_str'
        txt = txt + row[label] + " " 
    
    txt = txt + ". Text: " + row['text_old']
    
    return txt

train_df['text'] = train_df.apply(lambda row: create_updated_text_column(row), axis=1)
test_df['text'] = test_df.apply(lambda row: create_updated_text_column(row), axis=1)

In [31]:
# test_df

In [32]:
dataset_train = datasets.Dataset.from_pandas(train_df, split='train')
dataset_test = datasets.Dataset.from_pandas(test_df, split='test')

In [33]:
dataset = datasets.DatasetDict({'train': dataset_train, 'test': dataset_test})

In [34]:
dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'prob_0', 'prob_1', 'text_old', 'prob_0_str', 'prob_1_str', 'text'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['labels', 'prob_0', 'prob_1', 'text_old', 'prob_0_str', 'prob_1_str', 'text'],
        num_rows: 25000
    })
})

## Run BERT + LR (phase 2)

In [35]:
def tokenize(sample, tokenizer):
    """Tokenize sample"""
    
    # get field to 
    for field in ['text', 'content', 'question_title']:
        
        if field in sample.keys():
    
            sample = tokenizer(sample[field], truncation=True, padding=False, return_length=True)
    
    return sample


dataset = dataset.map(lambda x: tokenize(x, tokenizer), batched=True)

dataset = mod.tensorize_dataset(dataset)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

In [36]:
# tokenizer.decode(dataset['train']['input_ids'][0])

In [37]:
model = mod.BertTFIDF(model_name=model_name, 
                      pooling=pooling, 
                      mode=mode, 
                      device=device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


- Training

In [38]:
# Processing data
t0 = time.time()

X_train, y_train = process_dataset(dataset['train'], 
                                   model, 
                                   tokenizer, 
                                   device, 
                                   batch_size)

t1 = time.time()

training_time = t1 - t0

  0%|          | 0/98 [00:00<?, ?it/s]

In [39]:
# Fitting the model
# Note: if different alpha are tested, insert a loop here
t0 = time.time()

learning_algo = LogisticRegression()
learning_algo.fit(X_train, y_train)

t1 = time.time()

fitting_time = t1 - t0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


- Results

In [40]:
y_train, y_train_preds, y_test, y_test_preds = predict(learning_algo, 
                                                       dataset, 
                                                       model, 
                                                       tokenizer, 
                                                       device, 
                                                       batch_size,
                                                       predict_proba=True,
                                                       mode='train_test')

  0%|          | 0/98 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

In [41]:
y_train_preds_final = np.argmax(y_train_preds, axis=1)
y_test_preds_final = np.argmax(y_test_preds, axis=1)

In [42]:
# Train results
print(classification_report(y_train, y_train_preds_final, digits=4))

              precision    recall  f1-score   support

         0.0     0.8983    0.9048    0.9016     12500
         1.0     0.9041    0.8976    0.9008     12500

    accuracy                         0.9012     25000
   macro avg     0.9012    0.9012    0.9012     25000
weighted avg     0.9012    0.9012    0.9012     25000



In [43]:
# Test results
print(classification_report(y_test, y_test_preds_final, digits=4))

              precision    recall  f1-score   support

         0.0     0.8888    0.8924    0.8906     12500
         1.0     0.8920    0.8884    0.8902     12500

    accuracy                         0.8904     25000
   macro avg     0.8904    0.8904    0.8904     25000
weighted avg     0.8904    0.8904    0.8904     25000



In [44]:
# del dataset, tokenizer, model, learning_algo
torch.cuda.empty_cache()