# BERT Finetuning
## 7 Text Classification Datasets

## Libraries

In [1]:
import os
import sys

# Put the parent directory first on the import path; presumably this is where
# the `src` package imported later in the notebook lives.
sys.path.insert(0, os.path.abspath(os.pardir))

In [2]:
# # Check versions
# import sklearn
# import transformers
# import datasets

# print("Current versions:")
# print(sklearn.__version__)
# print(datasets.__version__)
# print(transformers.__version__)

In [3]:
import time
import pickle

import numpy as np

import torch
import torch.nn as nn
from torch.autograd import Variable

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score

from transformers import BertTokenizer, BertTokenizerFast
from transformers import BertForSequenceClassification, AdamW
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
from transformers.data.data_collator import DataCollatorWithPadding

# import src.utils.matrix as mat

# from datasets import load_dataset, Dataset, concatenate_datasets

from src.train import *

## Global variables

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
# Dataset to fine-tune on. Alternatives listed by the author:
#   'imdb', 'yelp_polarity', 'yelp_review_full',
#   'trec', 'yahoo_answers_topics',
#   'ag_news', 'dbpedia_14'
DATASET_NAME = "dbpedia_14"

In [6]:
# Root folder for every artefact produced by this notebook. The original
# location is kept as the default, so behaviour is unchanged unless the
# BERT_FIT_RESULTS_ROOT environment variable is set; this lets the notebook
# run on another machine without editing a user-specific absolute path.
RESULTS_ROOT = os.environ.get(
    "BERT_FIT_RESULTS_ROOT",
    "/raid/home/jeremiec/Data/TextClassification/BERT_FiT/",
)

# Per-dataset output directory (also used as the Trainer output_dir below).
RESULTS_PATH = os.path.join(RESULTS_ROOT, DATASET_NAME)
# Pickle file that receives the final metrics dictionary.
RESULTS_FILE = os.path.join(RESULTS_PATH, DATASET_NAME) + '.pkl'
# Cache directory handed to the dataset loading helper.
CACHE_DIR = os.path.join(RESULTS_PATH, 'cache_dir_' + DATASET_NAME + '/')

In [7]:
# Pretrained checkpoint to fine-tune.
MODEL_NAME = "bert-base-uncased"
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 24
# Number of training epochs.
NB_EPOCHS = 4

## Dataset

In [8]:
# Load and tokenize the benchmark selected by DATASET_NAME.
# `load_and_tokenize_dataset` is presumably provided by `from src.train import *`.
dataset, tokenizer, model_name = load_and_tokenize_dataset(
    dataset_name=DATASET_NAME,
    model_name=MODEL_NAME,
    sort=False,
    cache_dir=CACHE_DIR,
)

# Expose these three columns as torch tensors when rows are accessed.
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Shuffle the training data, then hold out 20% of it for validation.
# The split is seeded as well: without a seed, `train_test_split` draws a
# different train/validation partition on every run, which makes validation
# numbers impossible to reproduce after a kernel restart.
shuffled_train = dataset["train"].shuffle(seed=42)
train_val_datasets = shuffled_train.train_test_split(train_size=0.8, seed=42)

train_dataset = train_val_datasets['train']
val_dataset = train_val_datasets['test']
test_dataset = dataset["test"].shuffle(seed=42)

Downloading and preparing dataset d_bpedia14/dbpedia_14 (download: 65.18 MiB, generated: 191.44 MiB, post-processed: Unknown size, total: 256.62 MiB) to /raid/home/jeremiec/Data/TextClassification/BERT_FiT/dbpedia_14/cache_dir_dbpedia_14/d_bpedia14/dbpedia_14/2.0.0/7f0577ea0f4397b6b89bfe5c5f2c6b1b420990a1fc5e8538c7ab4ec40e46fa3e...


Downloading data:   0%|          | 0.00/68.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/560000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/70000 [00:00<?, ? examples/s]

Dataset d_bpedia14 downloaded and prepared to /raid/home/jeremiec/Data/TextClassification/BERT_FiT/dbpedia_14/cache_dir_dbpedia_14/d_bpedia14/dbpedia_14/2.0.0/7f0577ea0f4397b6b89bfe5c5f2c6b1b420990a1fc5e8538c7ab4ec40e46fa3e. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/560 [00:00<?, ?ba/s]

  0%|          | 0/70 [00:00<?, ?ba/s]

In [9]:
# Sanity check: display the three splits (column names and row counts).
train_dataset, val_dataset, test_dataset

(Dataset({
     features: ['labels', 'title', 'content', 'input_ids', 'token_type_ids', 'attention_mask', 'length'],
     num_rows: 448000
 }),
 Dataset({
     features: ['labels', 'title', 'content', 'input_ids', 'token_type_ids', 'attention_mask', 'length'],
     num_rows: 112000
 }),
 Dataset({
     features: ['labels', 'title', 'content', 'input_ids', 'token_type_ids', 'attention_mask', 'length'],
     num_rows: 70000
 }))

In [10]:
# Count the distinct label values that occur in the training split; this
# count sizes the classification head of the model created below.
label_values = train_dataset['labels'].tolist()
num_labels = len(set(label_values))
num_labels

14

## BERT Finetune

In [11]:
# Display the checkpoint name that the next cell loads.
MODEL_NAME

'bert-base-uncased'

In [12]:
# Build a BERT sequence classifier with one output per label.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)
# Move the weights to the selected device; the returned module is displayed.
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Training

In [13]:
# Hyper-parameters for the Hugging Face Trainer. Entries tagged "Sun et al."
# are the ones the author marked as taken from that paper.
training_args = TrainingArguments(
    # --- output ---
    output_dir=RESULTS_PATH,

    # --- optimisation ---
    num_train_epochs=NB_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,    # Sun et al.
    learning_rate=2e-5,                       # Sun et al.
    warmup_ratio=0.1,                         # Sun et al. (alternative tried: warmup_steps=500)
    weight_decay=0.01,

    # --- evaluation ---
    # NOTE(review): this evaluates on the whole validation split every 50
    # optimisation steps; the recorded run was interrupted during the first
    # evaluation pass. The author's alternative was evaluation_strategy='no'
    # ("no more evaluation, takes time").
    evaluation_strategy="steps",              # Sun et al.
    eval_steps=50,                            # Sun et al.

    # --- logging ---
    logging_dir=f"{RESULTS_PATH}/logs",
    logging_strategy='steps',
    logging_steps=50,                         # same cadence as eval_steps (10 was also considered)

    # --- checkpoints ---
    # save_strategy is left at its default ('epoch' / 'steps' were considered).
    load_best_model_at_end=True,              # Sun et al. (False was the alternative)
)

In [14]:
def compute_metrics(p):
    """Compute validation accuracy from an evaluation result.

    Args:
        p: A two-item iterable ``(predictions, labels)``. ``predictions`` is
            reduced with ``argmax`` over axis 1, so it is presumably an array
            of per-class scores, one row per example.

    Returns:
        dict: ``{"val_accuracy": <accuracy>}``, where the accuracy is computed
        by ``accuracy_score`` on the arg-max predictions against ``labels``.
    """
    scores, gold = p
    # Index of the highest score in each row is the predicted class.
    predicted = np.argmax(scores, axis=1)
    return {"val_accuracy": accuracy_score(y_true=gold, y_pred=predicted)}

In [15]:
# Wire the model, the hyper-parameters and the train/validation splits together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    # Left disabled by the author:
    # compute_metrics=compute_metrics,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

In [16]:
# Run fine-tuning. The returned object exposes a `metrics` mapping, from
# which the training runtime is read in the next cell.
results = trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: title, length, content. If title, length, content are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 448000
  Num Epochs = 4
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 74668


Step,Training Loss,Validation Loss


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: title, length, content. If title, length, content are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 112000
  Batch size = 24


KeyboardInterrupt: 

In [None]:
# Training duration as reported by the Trainer ("train_runtime" metric);
# stored in the results dictionary further down.
training_time = results.metrics["train_runtime"]
training_time

In [None]:
# Save the trainer's current model under <RESULTS_PATH>/best_model.
# NOTE(review): presumably the best checkpoint, since load_best_model_at_end=True — confirm.
trainer.save_model(os.path.join(RESULTS_PATH, 'best_model'))

In [None]:
# get model
# trained_model = trainer.model

## Results

In [None]:
# Container for everything that gets pickled at the end of the notebook.
results_d = {}

# Alternative left disabled by the author: rebuild the model and reload
# fine-tuned weights from disk instead of reusing the in-memory `model`.
# model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
# model.load_state_dict(torch.load(RESULTS_PATH + 'best_model-0/pytorch_model.bin'))
# model.to(device)
model.eval()

# Run inference on the test split with a fresh Trainer that pads each batch.
test_trainer = Trainer(model, data_collator=DataCollatorWithPadding(tokenizer))
prediction_output = test_trainer.predict(test_dataset)
raw_preds = prediction_output.predictions
labels = prediction_output.label_ids

# Highest-scoring class per example, then accuracy and the per-class report.
preds = np.argmax(raw_preds, axis=1)
test_acc = accuracy_score(y_true=labels, y_pred=preds)
test_results = classification_report(labels, preds, digits=4, output_dict=True)
print(test_results)

# Record test accuracy (best model evaluation only), the report and the training time.
results_d.update({
    'test_accuracy': test_acc,
    'test_classification-report': test_results,
    'training_time': training_time,
})

In [None]:
# Display the collected metrics before they are written to disk.
results_d

In [None]:
# Serialise the metrics dictionary to RESULTS_FILE as a binary pickle.
with open(RESULTS_FILE, mode='wb') as results_file:
    pickle.dump(results_d, results_file)

In [None]:
# # load results
# with open(RESULTS_FILE, 'rb') as fh:
#     results_d = pickle.load(fh)
#     
# results_d