![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# BERT FOR TOKEN CLASSIFICATION - Training/Test Split and Evaluation
Using Hugging Face and importing it into Finance NLP for scalability.

This is a transformer-based approach, which usually returns much bigger models (10x) compared to NerModel, but it can improve the performance over NerModel.

In this notebook we don't save the model; we just train it and get metrics on the test set. Please see the next notebook to check how we finally train with all the data and save the model in Spark NLP format.

# Installation

In [None]:
! pip -q install seqeval

In [None]:
! pip install transformers==4.8.1
! pip install pyspark==3.1.2
! pip install spark-nlp
! pip install spark-nlp-display

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Setting name of the project

In [None]:
PROJECT_NAME = 'financial_operations'

# Imports

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import sparknlp
from pyspark.sql import functions as F

from sparknlp.training import CoNLL
from google.colab import files

import pandas as pd
import numpy as np
from tqdm import tqdm, trange

import transformers
from transformers import BertForTokenClassification, TFBertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup

from sklearn.metrics import classification_report

## Setting up Torch

In [None]:
torch.__version__

'1.12.1+cu113'

In [None]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

# Querying a CUDA device name raises on CPU-only machines, so only ask when
# CUDA is actually usable; otherwise just report "cpu".
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

'Tesla T4'

# Check that files are available

In [None]:
!head -n 20 conll_noO.conll

# Creating folders for logs and checkpoints

In [None]:
!mkdir {PROJECT_NAME}

In [None]:
!mkdir {PROJECT_NAME}/logs

# Starting a Spark Session for SparkNLP

In [None]:
spark = sparknlp.start()

In [None]:
spark

# Convert JSL CoNLL files to DataFrame format

In [None]:
def get_conll_df(pth):
  """Read a CoNLL file with Spark NLP and flatten it to one pandas row per token.

  Args:
    pth: path of the CoNLL file, passed to `CoNLL().readDataset`.

  Returns:
    A pandas DataFrame with the columns `sentence_idx`, `word`, `tag` and `pos`.
  """
  # Tag every row returned by readDataset with an id so that its tokens can be
  # regrouped later (the ids are unique but not necessarily consecutive).
  sentences = CoNLL().readDataset(spark, pth).withColumn(
      "sentence_idx", F.monotonically_increasing_id())

  # Zip the parallel token / label / pos arrays and emit one row per element.
  zipped = F.arrays_zip('token.result','label.result','pos.result')
  exploded = sentences.select('sentence_idx', F.explode(zipped).alias("cols"))

  flat = exploded.select(
      'sentence_idx',
      F.expr("cols['0']").alias("word"),
      F.expr("cols['1']").alias("tag"),
      F.expr("cols['2']").alias("pos"))
  return flat.toPandas()

data_df = get_conll_df('./conll_noO.conll')

In [None]:
train_idx, test_idx = train_test_split(data_df['sentence_idx'].unique(), shuffle=True, random_state=42, train_size=0.85, test_size=0.15)

In [None]:
len(train_idx)

493

In [None]:
len(test_idx)  # number of sentences held out for testing (the train size is shown by the previous cell)

493

In [None]:
train_data_df = data_df[data_df['sentence_idx'].isin(train_idx)]
test_data_df = data_df[data_df['sentence_idx'].isin(test_idx)]

In [None]:
train_data_df

Unnamed: 0,sentence_idx,word,tag,pos
0,0,From,O,NN
1,0,and,O,NN
2,0,after,O,NN
3,0,the,O,NN
4,0,Closing,O,NN
...,...,...,...,...
25386,8589934882,be,I-OBLIGATION,NNP
25387,8589934882,signed,I-OBLIGATION,NNP
25388,8589934882,in,I-OBLIGATION,NNP
25389,8589934882,counterparts,I-OBLIGATION,NNP


In [None]:
test_data_df

Unnamed: 0,sentence_idx,word,tag,pos
100,2,not,B-OBLIGATION,NNP
101,2,less,I-OBLIGATION,NNP
102,2,than,I-OBLIGATION,NNP
103,2,five,I-OBLIGATION,NNP
104,2,million,I-OBLIGATION,NNP
...,...,...,...,...
25204,8589934879,Recipient,O,NN
25205,8589934879,for,O,NN
25206,8589934879,such,O,NN
25207,8589934879,Service,O,NN


## Checking the DF looks good

In [None]:
train_data_df.head(25)

Unnamed: 0,sentence_idx,word,tag,pos
0,0,From,O,NN
1,0,and,O,NN
2,0,after,O,NN
3,0,the,O,NN
4,0,Closing,O,NN
5,0,Date,O,NN
6,0,and,O,NN
7,0,until,O,NN
8,0,5:00,O,NN
9,0,pm,O,NN


In [None]:
test_data_df.head(25)

Unnamed: 0,sentence_idx,word,tag,pos
100,2,not,B-OBLIGATION,NNP
101,2,less,I-OBLIGATION,NNP
102,2,than,I-OBLIGATION,NNP
103,2,five,I-OBLIGATION,NNP
104,2,million,I-OBLIGATION,NNP
105,2,U.S,I-OBLIGATION,NNP
106,2,.,I-OBLIGATION,NNP
107,2,Dollars,I-OBLIGATION,NNP
108,2,(,I-OBLIGATION,NNP
109,2,"US$5,000,000",I-OBLIGATION,NNP


In [None]:
print (train_data_df.shape)

(21512, 4)


In [None]:
print (test_data_df.shape)

(3879, 4)


In [None]:
train_data_df['tag'].value_counts()

In [None]:
test_data_df['tag'].value_counts()

# First, train / fine-tune a model on the dataset

## Iterating function to feed the model with sentences
Converting conll sentence annotations to tuples (word, pos, tag)

In [None]:
## convert conll file to sentences

class SentenceGetter(object):
    """Group a token-level DataFrame into sentences.

    The input frame must provide the columns `sentence_idx`, `word`, `pos`
    and `tag`. Every sentence becomes a list of `(word, pos, tag)` tuples.
    All sentences are available at once in `self.sentences` (ordered by
    `sentence_idx`), or one at a time through `get_next()`.
    """

    def __init__(self, dataset):
        # 1-based position of the sentence that `get_next` will return next.
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["word"].values.tolist(),
                                                           s["pos"].values.tolist(),
                                                           s["tag"].values.tolist())]
        # Series indexed by sentence_idx whose values are lists of tuples.
        self.grouped = self.dataset.groupby("sentence_idx").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence, or None once every sentence was returned.

        Sentences are served by position: `sentence_idx` values are not
        consecutive, so they cannot be used as a running counter.
        """
        try:
            s = self.sentences[self.n_sent - 1]
        except IndexError:
            return None
        self.n_sent += 1
        return s

train_getter = SentenceGetter(train_data_df)
test_getter = SentenceGetter(test_data_df)

## Getting sentences and labels
- Sentences: concatenation of the first element of each tuple (word)
- Labels: concatenation of the third element of each tuple (tag)

In [None]:
# Sentences: the word is the first element of each (word, pos, tag) tuple
train_sentences = [[word[0] for word in sentence] for sentence in train_getter.sentences]
print("Example of train sentence:")
print (train_sentences[5])

test_sentences = [[word[0] for word in sentence] for sentence in test_getter.sentences]
print("Example of test sentence:")
print (test_sentences[5])

# Labels: the tag is the third element of each (word, pos, tag) tuple
train_labels = [[s[2] for s in sentence] for sentence in train_getter.sentences]
print("Example of train labels:")
print(train_labels[5])

test_labels = [[s[2] for s in sentence] for sentence in test_getter.sentences]
print("Example of test labels:")
print(test_labels[5])

## Converting tags to numeric values with a dict

In [None]:
# Sort the unique tags so the tag -> id mapping is identical on every run:
# the iteration order of a set of strings can differ between sessions, which
# would silently change what each label id means.
tag_values = sorted(set(train_data_df["tag"].values))
# "PAD" is appended last, so its id always equals the number of real tags.
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [None]:
print(tag_values[:10])
print(tag2idx)

## Model metadata

### Building on top of FinBERT

In [None]:
MODEL_TO_TRAIN = 'yiyanghkust/finbert-pretrain'

### Hyperparam settings

In [None]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 15
LEARNING_RATE = 2e-05

## Instantiating the proper tokenizer

In [None]:
tokenizer = BertTokenizer.from_pretrained(MODEL_TO_TRAIN, do_lower_case=False)

### Tokenize and extend the labels in case a word is split

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels):
    """Split every word into word pieces and repeat its label for each piece.

    Args:
        sentence: iterable of words.
        text_labels: iterable of labels, paired with `sentence` by position.

    Returns:
        A `(tokens, labels)` tuple of two lists of equal length: the word
        pieces of the whole sentence and the label of the word each piece
        came from.
    """
    pieces_out, labels_out = [], []

    for token, tag in zip(sentence, text_labels):
        # The module-level tokenizer may break one word into several pieces.
        pieces = tokenizer.tokenize(token)
        pieces_out += pieces
        # Every piece inherits the label of the word it belongs to.
        labels_out += [tag] * len(pieces)

    return pieces_out, labels_out

## Tokenize and get tokens and labels

In [None]:
# Tokenize every sentence into word pieces; each entry is a (tokens, labels) pair.
train_tokenized_texts_and_labels = list(
    map(tokenize_and_preserve_labels, train_sentences, train_labels)
)
test_tokenized_texts_and_labels = list(
    map(tokenize_and_preserve_labels, test_sentences, test_labels)
)

# Split the pairs into parallel lists: first the word pieces ...
train_tokenized_texts_tokens = [tokens for tokens, _ in train_tokenized_texts_and_labels]
test_tokenized_texts_tokens = [tokens for tokens, _ in test_tokenized_texts_and_labels]

# ... then the labels aligned with those word pieces.
train_tokenized_texts_labels = [labels for _, labels in train_tokenized_texts_and_labels]
test_tokenized_texts_labels = [labels for _, labels in test_tokenized_texts_and_labels]

In [None]:
print(train_tokenized_texts_tokens[5])
print(train_tokenized_texts_labels[5])

In [None]:
print(test_tokenized_texts_tokens[5])
print(test_tokenized_texts_labels[5])

## Converting tokens to id && padding sentences to have fixed length

In [None]:
# Map word pieces to vocabulary ids, one list of ids per sentence.
train_token_ids = [tokenizer.convert_tokens_to_ids(pieces) for pieces in train_tokenized_texts_tokens]
test_token_ids = [tokenizer.convert_tokens_to_ids(pieces) for pieces in test_tokenized_texts_tokens]

# Pad (with id 0) or truncate every sentence at the end so all have MAX_LEN ids.
train_input_ids = pad_sequences(train_token_ids, maxlen=MAX_LEN, dtype="long",
                                value=0.0, truncating="post", padding="post")
test_input_ids = pad_sequences(test_token_ids, maxlen=MAX_LEN, dtype="long",
                               value=0.0, truncating="post", padding="post")

# Map labels to ids, one list of ids per sentence.
train_tag_ids = [[tag2idx.get(tag) for tag in tags] for tags in train_tokenized_texts_labels]
test_tag_ids = [[tag2idx.get(tag) for tag in tags] for tags in test_tokenized_texts_labels]

# Same padding/truncation for the labels, using the dedicated "PAD" label id.
train_tags = pad_sequences(train_tag_ids, maxlen=MAX_LEN, value=tag2idx["PAD"],
                           padding="post", dtype="long", truncating="post")
test_tags = pad_sequences(test_tag_ids, maxlen=MAX_LEN, value=tag2idx["PAD"],
                          padding="post", dtype="long", truncating="post")

In [None]:
print(train_input_ids[5])
print(test_input_ids[5])
print(train_tags[5])
print(test_tags[5])

[   1 1325   57  205    1    5    1  171  435  129  545    5  889    7
   23  866   29    5 1049    1    5 4983    7    5  434    6    5 1049
    8    5   69 3761   11   32    5    1    1   15 2295    1    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

## Now that sentences are padded, I need to prevent attention from seeing pads (id=0)

In [None]:
train_attention_masks = [[float(i != 0.0) for i in ii] for ii in train_input_ids]
test_attention_masks = [[float(i != 0.0) for i in ii] for ii in test_input_ids]

In [None]:
print(train_attention_masks[5])
print(test_attention_masks[5])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

### Double checking that pairing input-mask is in place

In [None]:
# Print every token id next to its attention-mask value, separated by a tab.
for i,m in zip(train_input_ids[5], train_attention_masks[5]):
  print(f"Token id: {i}\tToken mask: {m}")

Token id: 1\Token mask: 1.0
Token id: 1325\Token mask: 1.0
Token id: 57\Token mask: 1.0
Token id: 205\Token mask: 1.0
Token id: 1\Token mask: 1.0
Token id: 5\Token mask: 1.0
Token id: 1\Token mask: 1.0
Token id: 171\Token mask: 1.0
Token id: 435\Token mask: 1.0
Token id: 129\Token mask: 1.0
Token id: 545\Token mask: 1.0
Token id: 5\Token mask: 1.0
Token id: 889\Token mask: 1.0
Token id: 7\Token mask: 1.0
Token id: 23\Token mask: 1.0
Token id: 866\Token mask: 1.0
Token id: 29\Token mask: 1.0
Token id: 5\Token mask: 1.0
Token id: 1049\Token mask: 1.0
Token id: 1\Token mask: 1.0
Token id: 5\Token mask: 1.0
Token id: 4983\Token mask: 1.0
Token id: 7\Token mask: 1.0
Token id: 5\Token mask: 1.0
Token id: 434\Token mask: 1.0
Token id: 6\Token mask: 1.0
Token id: 5\Token mask: 1.0
Token id: 1049\Token mask: 1.0
Token id: 8\Token mask: 1.0
Token id: 5\Token mask: 1.0
Token id: 69\Token mask: 1.0
Token id: 3761\Token mask: 1.0
Token id: 11\Token mask: 1.0
Token id: 32\Token mask: 1.0
Token id: 5

In [None]:
# Print every token id next to its attention-mask value, separated by a tab.
for i,m in zip(test_input_ids[5], test_attention_masks[5]):
  print(f"Token id: {i}\tToken mask: {m}")

Token id: 1\Token mask: 1.0
Token id: 261\Token mask: 1.0
Token id: 94\Token mask: 1.0
Token id: 32\Token mask: 1.0
Token id: 5\Token mask: 1.0
Token id: 1\Token mask: 1.0
Token id: 171\Token mask: 1.0
Token id: 827\Token mask: 1.0
Token id: 21\Token mask: 1.0
Token id: 3160\Token mask: 1.0
Token id: 5\Token mask: 1.0
Token id: 1\Token mask: 1.0
Token id: 1014\Token mask: 1.0
Token id: 11\Token mask: 1.0
Token id: 5\Token mask: 1.0
Token id: 1\Token mask: 1.0
Token id: 1\Token mask: 1.0
Token id: 11\Token mask: 1.0
Token id: 1147\Token mask: 1.0
Token id: 26\Token mask: 1.0
Token id: 5\Token mask: 1.0
Token id: 349\Token mask: 1.0
Token id: 1165\Token mask: 1.0
Token id: 1\Token mask: 1.0
Token id: 15\Token mask: 1.0
Token id: 631\Token mask: 1.0
Token id: 5\Token mask: 1.0
Token id: 1\Token mask: 1.0
Token id: 1\Token mask: 1.0
Token id: 1\Token mask: 1.0
Token id: 1\Token mask: 1.0
Token id: 1\Token mask: 1.0
Token id: 1\Token mask: 1.0
Token id: 1\Token mask: 1.0
Token id: 171\Token

## Arrays to tensors transformation

In [None]:
tr_inputs = torch.tensor(train_input_ids)
val_inputs = torch.tensor(test_input_ids)
tr_tags = torch.tensor(train_tags)
val_tags = torch.tensor(test_tags)
tr_masks = torch.tensor(train_attention_masks)
val_masks = torch.tensor(test_attention_masks)

In [None]:
print(tr_inputs[5])
print(tr_tags[5])
print(tr_masks[5])

tensor([   1, 1325,   57,  205,    1,    5,    1,  171,  435,  129,  545,    5,
         889,    7,   23,  866,   29,    5, 1049,    1,    5, 4983,    7,    5,
         434,    6,    5, 1049,    8,    5,   69, 3761,   11,   32,    5,    1,
           1,   15, 2295,    1,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,   

In [None]:
print(val_inputs[5])
print(val_tags[5])
print(val_masks[5])

tensor([   1,  261,   94,   32,    5,    1,  171,  827,   21, 3160,    5,    1,
        1014,   11,    5,    1,    1,   11, 1147,   26,    5,  349, 1165,    1,
          15,  631,    5,    1,    1,    1,    1,    1,    1,    1,  171,  817,
           5,  226,  447,   14,   43,  367, 8391,    1,  149,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,   

### Checking sizes match

#### Training

In [None]:
len([x for x in tr_inputs[5] if x != 0]) # How many NO_PADs we have?

40

In [None]:
len([x for x in tr_tags[5] if x != tag2idx["PAD"]]) # How many non-PAD labels? Look the PAD id up instead of hard-coding it

249

In [None]:
len([x for x in tr_masks[5] if x != 0])

40

#### Test

In [None]:
len([x for x in val_inputs[5] if x != 0]) # How many NO_PADs we have?

45

In [None]:
len([x for x in val_tags[5] if x != tag2idx["PAD"]]) # How many non-PAD labels? Look the PAD id up instead of hard-coding it

251

In [None]:
len([x for x in val_masks[5] if x != 0])

45

## Creating the DataLoaders to feed the batches during training

In [None]:
# Training batches are drawn in random order.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)

# Validation batches keep the dataset order and use their own batch-size constant.
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=VALID_BATCH_SIZE)

# Loading the transformer model

In [None]:
transformers.__version__

'4.8.1'

In [None]:
model = BertForTokenClassification.from_pretrained(
    MODEL_TO_TRAIN,
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)
model.to(device)

Some weights of the model checkpoint at zlucia/custom-legalbert were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized fr

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

## Setting up the optimizer.
We want to optimize the weight values, so we add weight decay.
We can get all the parameters from `model.named_parameters()`.
However, biases and Layer Normalization parameters should be excluded from weight decay.

Set `FULL_FINETUNING` to `True` to update the weights in all the layers; otherwise only the classifier head is trained.

In [None]:
# True: fine-tune every layer. False: only train the classification head.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Biases and LayerNorm weights are excluded from weight decay. Hugging Face
    # BERT names them `...bias` and `...LayerNorm.weight` (not `gamma`/`beta`).
    no_decay = ['bias', 'LayerNorm.weight']
    # The per-group key must be `weight_decay`: that is the option AdamW reads.
    # Any other key is ignored and the group falls back to the default decay.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

# NOTE(review): the hyperparameter cell declares LEARNING_RATE = 2e-05, but the
# rate below is hard-coded to 3e-5 - confirm which value is intended.
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)


## Setting up the scheduler
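It adjusts the learning rate at every training step, decaying it linearly over the course of training. `num_warmup_steps` is set to 0, so no warmup phase is applied.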

In [None]:
# Reuse the epoch count declared with the other hyperparameters.
epochs = EPOCHS
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler (linear decay, no warmup steps).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)


Now, let's train

In [None]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

for EPOCH in trange(epochs, desc="Epoch"):
    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # Because `labels` are provided, the first output is the loss.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    tr_loss = f"Average train loss: {str(avg_train_loss)}\n"

    # Saving partial models (this creates the folder too).
    # The gate must look at the epoch counter: `step` is only the index of the
    # last batch and would make this condition independent of the epoch.
    if EPOCH > epochs - 5:
        tokenizer.save_pretrained(f'{PROJECT_NAME}/{str(EPOCH)}/tokenizer/')
        # `state_dict` must be the dictionary itself, so the method is called.
        model.save_pretrained(save_directory=f'{PROJECT_NAME}/{str(EPOCH)}/',
                              save_config=True, state_dict=model.state_dict())
        # Saving checkpoint in case it crashes, to restore work
        torch.save({
            'epoch': EPOCH,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_train_loss,
            }, f'{PROJECT_NAME}/{str(EPOCH)}/checkpoint.pth')
    else:
        print("Skipping saving the model. Too early")

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass. Labels are provided, so the outputs hold the
            # validation loss first and the logits second.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accumulate the validation loss and collect predictions / gold labels.
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)

    val_loss = f"Validation loss: {str(eval_loss)}\n"

    # Saving losses log
    with open(f'{PROJECT_NAME}/logs/epoch_' + str(EPOCH) + '_loss.log', 'a') as f:
        f.write(tr_loss)
        f.write(val_loss)

    # Calculating metrics: positions whose gold label is "PAD" are dropped from
    # both lists, so predictions and gold tags stay aligned token by token.
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]

    report = classification_report(valid_tags, pred_tags)

    # Saving metrics
    with open(f'{PROJECT_NAME}/logs/epoch_' + str(EPOCH) + '_metrics.log', 'a') as f:
        f.write(report)

    # Printing also to stdout
    print(tr_loss)
    print(val_loss)
    print(report)
