![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/finance-nlp/05.5.BertForTokenClassification_TrainAndSave.ipynb)

# Finance BertForTokenClassification
Using Hugging Face and importing it to Finance NLP for scalability.

This is a transformer-based approach, which usually produces much bigger models (about 10x) than NerModel but can improve on its performance. We don't carry out evaluation in this notebook; we only train on the full data and export the model into Spark NLP. For evaluation, please see the previous notebook.

# Installation

In [None]:
# NOTE(review): seqeval is not imported in any visible cell of this notebook;
# presumably kept for parity with the evaluation notebook - confirm it is needed.
! pip -q install seqeval

In [None]:
# Install pinned versions of transformers and pyspark.
! pip install transformers==4.8.1
! pip install pyspark==3.1.2

# Setting name of the project

In [None]:
import os

# Keep the project name both as a Python variable and as an environment
# variable, so that shell cells can expand $PROJECT_NAME.
PROJECT_NAME = 'financial_operations'
os.environ['PROJECT_NAME'] = PROJECT_NAME

In [None]:
# Display the project name to confirm it was set.
PROJECT_NAME

'financial_operations'

# Imports

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from google.colab import files

import pandas as pd
import numpy as np
from tqdm import tqdm, trange

import transformers
from transformers import BertForTokenClassification, TFBertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup

from sklearn.metrics import classification_report

## Setting up Torch

In [None]:
# Display the installed PyTorch version.
torch.__version__

'1.13.0+cu116'

In [None]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

# Show which accelerator is in use. get_device_name(0) raises when no CUDA
# device is present, so only query it if at least one GPU was detected.
torch.cuda.get_device_name(0) if n_gpu > 0 else "cpu"

'Tesla T4'

# Check that files are available

In [None]:
# Download the CoNLL training file into the working directory.
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/finance-nlp/data/conll_noO.conll

In [None]:
# Preview the first 20 lines of the downloaded file.
!head -n 20 conll_noO.conll

( NN NN O
d NN NN O
) NN NN O
OF NN NN O
THE NN NN O
SECURITIES NN NN O
EXCHANGE NN NN O
ACT NN NN O
OF NN NN O
1934 NN NN O
For NN NN O
the NN NN O
annual NN NN O
period NN NN O
ended NN NN O
March NNP NNP B-FISCAL_YEAR
31 NNP NNP I-FISCAL_YEAR
, NNP NNP I-FISCAL_YEAR
2021 NNP NNP I-FISCAL_YEAR
March NNP NNP B-FISCAL_YEAR


# Creating folders for logs and checkpoints

In [None]:
! mkdir {PROJECT_NAME}

mkdir: cannot create directory ‘financial_operations’: File exists


In [None]:
! mkdir {PROJECT_NAME}/logs

mkdir: cannot create directory ‘financial_operations/logs’: File exists


# Starting a Spark Session for SparkNLP

In [None]:
import json
import os

from google.colab import files

# Ask the user to upload the license JSON through the Colab file picker.
uploaded_files = files.upload()

# Parse the first uploaded file; its key/value pairs are the license settings.
license_path = next(iter(uploaded_files))
with open(license_path) as f:
    license_keys = json.load(f)

# Expose every license key as a notebook-level variable ...
locals().update(license_keys)

# ... and as an environment variable, so that shell cells can expand them.
os.environ.update(license_keys)

Saving 4.2.3.json to 4.2.3 (2).json


In [None]:
# Installing pyspark and spark-nlp
# ($PUBLIC_VERSION, $JSL_VERSION and $SECRET are expanded by the shell;
#  presumably they come from the uploaded license JSON - confirm its keys.)
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing the licensed spark-nlp-jsl package from the John Snow Labs index
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spark-nlp-jsl 4.2.3 requires spark-nlp==4.2.4, but you have spark-nlp 4.2.2 which is incompatible.[0m[31m
[0m

In [None]:
import json
import os

import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline,PipelineModel

import warnings
warnings.filterwarnings('ignore')

# Spark settings handed to the session start call below.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# Report the versions of both libraries before starting the session.
for label, library in (("Spark NLP Version :", sparknlp),
                       ("Spark NLP_JSL Version :", sparknlp_jsl)):
    print(label, library.version())

# Start a licensed session using the SECRET from the uploaded license file.
spark = sparknlp_jsl.start(license_keys['SECRET'], params=params)

spark

Spark NLP Version : 4.2.4
Spark NLP_JSL Version : 4.2.3


# Convert JSL CoNLL files to DataFrame format

In [None]:
from sparknlp.training import CoNLL

def get_conll_df(pth):
  """Read a CoNLL file with Spark NLP and flatten it to one row per token.

  Args:
    pth: path of the CoNLL file to read.

  Returns:
    A pandas DataFrame with the columns ``sentence_idx``, ``word``, ``tag``
    and ``pos``. ``sentence_idx`` is the id generated for the row the token
    came from, so tokens of the same sentence share it.
  """
  conll_rows = CoNLL().readDataset(spark, pth).withColumn(
      "sentence_idx", F.monotonically_increasing_id())

  # Zip the three parallel result arrays, then emit one row per zipped entry.
  zipped = F.arrays_zip('token.result', 'label.result', 'pos.result')
  exploded = conll_rows.select('sentence_idx', F.explode(zipped).alias("cols"))

  flat = exploded.select(
      'sentence_idx',
      F.expr("cols['0']").alias("word"),
      F.expr("cols['1']").alias("tag"),
      F.expr("cols['2']").alias("pos"))
  return flat.toPandas()

train_data_df = get_conll_df('./conll_noO.conll')

In [None]:
# Number of tokens per tag in the training data.
train_data_df['tag'].value_counts()

O                     51912
I-DATE                 1932
I-FISCAL_YEAR          1812
B-DATE                 1797
B-AMOUNT               1466
B-CURRENCY             1461
I-AMOUNT               1134
B-FISCAL_YEAR           605
I-EXPENSE_INCREASE      546
I-EXPENSE_DECREASE      390
B-PERCENTAGE            350
I-PROFIT_INCREASE       288
I-EXPENSE               280
B-EXPENSE_INCREASE      274
I-PROFIT                228
B-EXPENSE_DECREASE      191
B-PROFIT_INCREASE       164
B-EXPENSE               150
B-PROFIT                122
I-PROFIT_DECLINE         93
B-PROFIT_DECLINE         58
I-PERCENTAGE             12
Name: tag, dtype: int64

# First, train / fine-tune a model on the dataset

## Iterating function to feed the model with sentences
Converting conll sentence annotations to tuples (word, pos, tag)

In [None]:
## convert conll file to sentences

class SentenceGetter(object):
    """Group a token-level DataFrame into sentences of (word, pos, tag) tuples.

    ``dataset`` must have the columns ``sentence_idx``, ``word``, ``pos`` and
    ``tag``; rows that share a ``sentence_idx`` form one sentence.

    Attributes:
        n_sent: 1-based position of the sentence the next ``get_next()`` returns.
        dataset: the DataFrame passed to the constructor.
        empty: always ``False``; kept so existing callers keep working.
        grouped: result of grouping ``dataset`` by ``sentence_idx``; each value
            is the list of (word, pos, tag) tuples of one sentence.
        sentences: the values of ``grouped`` as a plain list.
    """

    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False

        def to_tuples(group):
            # One (word, pos, tag) tuple per token of the group.
            return list(zip(group["word"].values.tolist(),
                            group["pos"].values.tolist(),
                            group["tag"].values.tolist()))

        self.grouped = self.dataset.groupby("sentence_idx").apply(to_tuples)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence, or ``None`` once all have been returned.

        Sentences are served by position. The previous implementation looked
        up "Sentence: N" keys, which never exist in an index made of
        ``sentence_idx`` values, and a bare ``except`` hid that failure, so it
        always returned ``None``.
        """
        if self.n_sent > len(self.sentences):
            return None
        s = self.sentences[self.n_sent - 1]
        self.n_sent += 1
        return s

# Group the token-level training DataFrame into sentences.
train_getter = SentenceGetter(train_data_df)

## Getting sentences and labels
- Sentences: the first element of each `(word, pos, tag)` tuple (word)
- Labels: the third element of each `(word, pos, tag)` tuple (tag)

In [None]:
# Sentences: the word is the first element of each (word, pos, tag) tuple
train_sentences = [[word[0] for word in sentence] for sentence in train_getter.sentences]
print("Example of train sentence:")
print(train_sentences[5])

# Labels: the tag is the third element of each (word, pos, tag) tuple
train_labels = [[s[2] for s in sentence] for sentence in train_getter.sentences]
# This prints the tag sequence, so it is captioned as labels (was "sentence").
print("Example of train labels:")
print(train_labels[5])

Example of train sentence:
['\ufeff', 'In', '2019', ',', 'we', 'released', 'Evolution', ',', 'the', 'new', 'platform', 'that', 'supersedes', 'and', 'provides', 'an', 'upgrade', 'path', 'to', 'the', 'former', 'loyalty', 'and', 'CVM', 'platforms', 'from', 'both', 'Evolving', 'and', 'its', 'acquired', 'companies', '—', 'BLS', ',', 'Lumata', 'and', 'SSM', '.']
Example of train sentence:
['O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


## Converting tags to numeric values with a dict

In [None]:
# Sort the distinct tags so the tag -> index mapping is the same on every run;
# iterating a set of strings has no stable order across Python processes, which
# would make saved checkpoints and the exported label file disagree.
tag_values = sorted(set(train_data_df["tag"].values))
# "PAD" takes the last index; it is the label given to padded positions.
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [None]:
# Show the first ten tags and the full tag -> index mapping.
print(tag_values[:10])
print(tag2idx)

['B-EXPENSE_DECREASE', 'I-EXPENSE_DECREASE', 'B-EXPENSE', 'I-PERCENTAGE', 'I-PROFIT_INCREASE', 'B-PROFIT', 'O', 'I-AMOUNT', 'I-EXPENSE_INCREASE', 'B-DATE']
{'B-EXPENSE_DECREASE': 0, 'I-EXPENSE_DECREASE': 1, 'B-EXPENSE': 2, 'I-PERCENTAGE': 3, 'I-PROFIT_INCREASE': 4, 'B-PROFIT': 5, 'O': 6, 'I-AMOUNT': 7, 'I-EXPENSE_INCREASE': 8, 'B-DATE': 9, 'I-FISCAL_YEAR': 10, 'B-PROFIT_DECLINE': 11, 'B-PERCENTAGE': 12, 'B-EXPENSE_INCREASE': 13, 'B-FISCAL_YEAR': 14, 'B-CURRENCY': 15, 'I-PROFIT': 16, 'I-EXPENSE': 17, 'B-PROFIT_INCREASE': 18, 'I-DATE': 19, 'I-PROFIT_DECLINE': 20, 'B-AMOUNT': 21, 'PAD': 22}


## Model metadata

### Building on top of FinBERT

In [None]:
# Name of the pretrained checkpoint that the tokenizer and the model are loaded from.
MODEL_TO_TRAIN = 'yiyanghkust/finbert-pretrain'

### Hyperparam settings

In [None]:
# Defining some key variables that will be used later on in the training
# NOTE(review): VALID_BATCH_SIZE and LEARNING_RATE are not referenced by any
# visible cell; the optimizer cell passes a literal lr=3e-5 - confirm which
# learning rate is intended.
MAX_LEN = 256  # sequences are padded / truncated to this many word pieces
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 2e-05

## Instantiating the proper tokenizer

IMPORTANT! Pay attention to the `do_lower_case` param, and set it to True if you have a lowercased language model. That means you will always need to do `lower()` on your inference texts!

If the language model is not lowercase only, then leave it to False.

In [None]:
# Load the tokenizer of the chosen checkpoint; lowercasing is disabled (see the note above).
tokenizer = BertTokenizer.from_pretrained(MODEL_TO_TRAIN, do_lower_case=False)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

### Tokenize and extend the labels in case a word is split

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels):
    """Split every word into word pieces and repeat its label for each piece.

    Uses the notebook-level ``tokenizer``.

    Args:
        sentence: sequence of words.
        text_labels: sequence of labels, aligned with ``sentence``.

    Returns:
        A tuple ``(tokenized_sentence, labels)`` of two equally long lists:
        the word pieces of all words, and the label of the word each piece
        came from.
    """
    tokenized_sentence, labels = [], []

    for word, label in zip(sentence, text_labels):
        # A word may yield several pieces; each one inherits the word's label.
        pieces = tokenizer.tokenize(word)
        tokenized_sentence += pieces
        labels += [label] * len(pieces)

    return tokenized_sentence, labels

## Tokenize and get tokens and labels

In [None]:
# One (word pieces, labels) pair per training sentence.
train_tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(train_sentences, train_labels)
]

# Split the pairs into two parallel lists.
train_tokenized_texts_tokens = [pieces for pieces, _ in train_tokenized_texts_and_labels]

train_tokenized_texts_labels = [piece_labels for _, piece_labels in train_tokenized_texts_and_labels]

In [None]:
# Word pieces and aligned labels of the example sentence.
print(train_tokenized_texts_tokens[5])
print(train_tokenized_texts_labels[5])

['[UNK]', '2019', ',', 'we', 'released', 'E', '##vol', '##ution', ',', 'the', 'new', 'platform', 'that', 'supersedes', 'and', 'provides', 'an', 'upgrade', 'path', 'to', 'the', 'former', 'loyalty', 'and', '[UNK]', 'platforms', 'from', 'both', 'E', '##vol', '##ving', 'and', 'its', 'acquired', 'companies', '—', '[UNK]', ',', 'L', '##umat', '##a', 'and', 'S', '##S', '##M', '.']
['O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


## Converting tokens to ids and padding sentences to a fixed length

In [None]:
# Word pieces -> vocabulary ids, then pad / truncate at the end to MAX_LEN.
# Padded positions receive id 0.
train_token_id_seqs = [tokenizer.convert_tokens_to_ids(txt) for txt in train_tokenized_texts_tokens]
train_input_ids = pad_sequences(train_token_id_seqs,
                                maxlen=MAX_LEN, dtype="long", value=0.0,
                                truncating="post", padding="post")

# Labels -> tag ids, padded the same way with the id of the "PAD" tag.
train_tag_id_seqs = [[tag2idx.get(l) for l in lab] for lab in train_tokenized_texts_labels]
train_tags = pad_sequences(train_tag_id_seqs,
                           maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                           dtype="long", truncating="post")

In [None]:
# Padded ids and padded tag ids of the example sentence.
print(train_input_ids[5])
print(train_tags[5])

[    2  2463   585    13  2688 30856 11634 16597   585     6    56  1241
    15 23819     8   511    33  2661  4205     9     6  1971  5811     8
     2  2937    23   209 30856 11634  6754     8    38   417   193  6318
     2   585 30846 30157   363     8 30802 30690 30694    48     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

## Now that sentences are padded, I need to prevent attention from seeing pads (id=0)

In [None]:
# 1.0 where the token id is non-zero, 0.0 where it is 0 (the padding value).
train_attention_masks = [[float(token_id != 0.0) for token_id in id_seq] for id_seq in train_input_ids]

In [None]:
# Attention mask of the example sentence.
print(train_attention_masks[5])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

### Double checking that pairing input-mask is in place

In [None]:
# Print every token id next to its mask value, separated by a tab.
# ("\T" is not a valid escape sequence; "\t" was intended.)
for i, m in zip(train_input_ids[5], train_attention_masks[5]):
  print(f"Token id: {i}\tToken mask: {m}")

Token id: 2\Token mask: 1.0
Token id: 2463\Token mask: 1.0
Token id: 585\Token mask: 1.0
Token id: 13\Token mask: 1.0
Token id: 2688\Token mask: 1.0
Token id: 30856\Token mask: 1.0
Token id: 11634\Token mask: 1.0
Token id: 16597\Token mask: 1.0
Token id: 585\Token mask: 1.0
Token id: 6\Token mask: 1.0
Token id: 56\Token mask: 1.0
Token id: 1241\Token mask: 1.0
Token id: 15\Token mask: 1.0
Token id: 23819\Token mask: 1.0
Token id: 8\Token mask: 1.0
Token id: 511\Token mask: 1.0
Token id: 33\Token mask: 1.0
Token id: 2661\Token mask: 1.0
Token id: 4205\Token mask: 1.0
Token id: 9\Token mask: 1.0
Token id: 6\Token mask: 1.0
Token id: 1971\Token mask: 1.0
Token id: 5811\Token mask: 1.0
Token id: 8\Token mask: 1.0
Token id: 2\Token mask: 1.0
Token id: 2937\Token mask: 1.0
Token id: 23\Token mask: 1.0
Token id: 209\Token mask: 1.0
Token id: 30856\Token mask: 1.0
Token id: 11634\Token mask: 1.0
Token id: 6754\Token mask: 1.0
Token id: 8\Token mask: 1.0
Token id: 38\Token mask: 1.0
Token id: 4

## Arrays to tensors transformation

In [None]:
# Convert the padded ids, tag ids and attention masks to tensors.
tr_inputs = torch.tensor(train_input_ids)
tr_tags = torch.tensor(train_tags)
tr_masks = torch.tensor(train_attention_masks)

In [None]:
# The three tensors of the example sentence.
print(tr_inputs[5])
print(tr_tags[5])
print(tr_masks[5])

tensor([    2,  2463,   585,    13,  2688, 30856, 11634, 16597,   585,     6,
           56,  1241,    15, 23819,     8,   511,    33,  2661,  4205,     9,
            6,  1971,  5811,     8,     2,  2937,    23,   209, 30856, 11634,
         6754,     8,    38,   417,   193,  6318,     2,   585, 30846, 30157,
          363,     8, 30802, 30690, 30694,    48,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

### Checking sizes match

#### Training

In [None]:
# Count the input positions whose id is not 0 (0 is the padding id).
len([x for x in tr_inputs[5] if x != 0]) # How many NO_PADs we have?

46

In [None]:
# Count the tag positions that are not padding. The PAD id is read from
# tag2idx; a hard-coded id compares against whichever tag happens to own that
# index, so the count would not match the input and mask counts.
len([x for x in tr_tags[5] if x != tag2idx["PAD"]])

256

In [None]:
# Count the attended (non-zero mask) positions; should equal the non-pad input count.
len([x for x in tr_masks[5] if x != 0])

46

## Creating the DataLoaders to feed the batches during training

In [None]:
# Each dataset item is (input ids, attention mask, tag ids); the training loop
# unpacks batches in this order.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
# Draw the training examples in random order.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)

# Loading the transformer model

In [None]:
# Display the installed transformers version.
transformers.__version__

'4.8.1'

In [None]:
# Load the pretrained checkpoint with a token-classification head that has one
# output per entry of tag2idx (this includes the "PAD" tag).
model = transformers.BertForTokenClassification.from_pretrained(
    MODEL_TO_TRAIN,
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)
# Move the model to the selected device (also displays the model as the cell output).
model.to(device)

Downloading:   0%|          | 0.00/359 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of the model checkpoint at yiyanghkust/finbert-pretrain were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

## Setting up the optimizer.
We want to optimize weight values, so we add a decay.
We can get all the weights from `model.named_parameters()`.
But we need to exclude `bias`, `gamma` and `beta`, which are Layer Normalization parameters we don't want to touch.

Activate `FULL_FINETUNING` to modify weights in all the layers.

In [None]:
# True: fine-tune every layer. False: train only the classification head.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Parameters whose name contains one of these fragments get no weight decay.
    # 'LayerNorm.weight' is listed because that is how this checkpoint names its
    # layer-norm scale parameters; 'gamma' / 'beta' alone would not match them.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    # The per-group key must be 'weight_decay': AdamW does not read
    # 'weight_decay_rate', so with that key no decay was applied at all.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

# NOTE(review): the learning rate is a literal here, while the hyperparameter
# cell defines LEARNING_RATE = 2e-05 - confirm which value is intended.
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)


## Setting up the scheduler
It adjusts the learning rate during training. We use a linear schedule with warmup (here with 0 warmup steps).

In [None]:
epochs = EPOCHS
# Maximum gradient norm; used for gradient clipping in the training loop.
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# No warmup steps are configured (num_warmup_steps=0).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)


Now, let's train

In [None]:

## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

for EPOCH in trange(epochs, desc="Epoch"):
    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    tr_loss = f"Average train loss: {str(avg_train_loss)}\n"

    # Partial models and loss logs are written every 5th epoch (0, 5, ...).
    is_checkpoint_epoch = EPOCH % 5 == 0

    if is_checkpoint_epoch:
      # Saving partial models (this creates the folder too)
      tokenizer.save_pretrained(f'{PROJECT_NAME}/{str(EPOCH)}/tokenizer/')
      # Let save_pretrained take the weights from the model itself. Passing
      # `state_dict=model.state_dict` handed over the bound method instead of a
      # state dict, so the saved weights file did not contain a usable state dict.
      model.save_pretrained(save_directory=f'{PROJECT_NAME}/{str(EPOCH)}/',
                            save_config=True)
      # Saving checkpoint in case it crashes, to restore work
      torch.save({
          'epoch': EPOCH,
          'model_state_dict': model.state_dict(),
          'optimizer_state_dict': optimizer.state_dict(),
          'loss': avg_train_loss,
          }, f'{PROJECT_NAME}/{str(EPOCH)}/checkpoint.pth')
    else:
      print("Skipping saving...")

    # Store the loss value for plotting the learning curve.
    # (Appended only after saving succeeded, so len(loss_values) counts the
    # epochs that completed.)
    loss_values.append(avg_train_loss)

    if is_checkpoint_epoch:
      # Saving losses log
      with open(f'{PROJECT_NAME}/logs/epoch_' + str(EPOCH) + '_loss.log', 'a') as f:
        f.write(tr_loss)

    # Printing also to stdout
    print(tr_loss)


Epoch:  10%|█         | 1/10 [01:11<10:44, 71.60s/it]

Average train loss: 0.6457865564869001



Epoch:  20%|██        | 2/10 [02:19<09:15, 69.44s/it]

Skipping saving...
Average train loss: 0.1852664268360688



Epoch:  30%|███       | 3/10 [03:26<07:59, 68.51s/it]

Skipping saving...
Average train loss: 0.12597153292825589



Epoch:  40%|████      | 4/10 [04:34<06:48, 68.03s/it]

Skipping saving...
Average train loss: 0.08897881331638648



Epoch:  50%|█████     | 5/10 [05:41<05:39, 67.84s/it]

Skipping saving...
Average train loss: 0.06342067612478366



Epoch:  60%|██████    | 6/10 [06:55<04:39, 69.85s/it]

Average train loss: 0.050033402450096146



Epoch:  70%|███████   | 7/10 [08:03<03:27, 69.13s/it]

Skipping saving...
Average train loss: 0.03976145678629669



Epoch:  80%|████████  | 8/10 [09:10<02:17, 68.57s/it]

Skipping saving...
Average train loss: 0.0328724947758019



Epoch:  90%|█████████ | 9/10 [10:17<01:08, 68.14s/it]

Skipping saving...
Average train loss: 0.028146676432627898



Epoch: 100%|██████████| 10/10 [11:25<00:00, 68.51s/it]

Skipping saving...
Average train loss: 0.026173199580695767






## Now load the model as TF and save properly

In [None]:
# Index of the last epoch that recorded a loss, or None when no epoch finished.
last_successfull_epoch = len(loss_values) - 1 if loss_values else None

In [None]:
# Report which epoch the exported model corresponds to.
if last_successfull_epoch is not None:
  print(f"Last successfull epoch: {str(last_successfull_epoch)}")
else:
  print("No epochs finished successfully.")

Last successfull epoch: 9


In [None]:
# Folder names for the PyTorch export and for the TensorFlow export made from it.
MODEL_NAME_PYTORCH = f'model_epoch_{last_successfull_epoch}_pytorch'
MODEL_NAME_TF = f'model_epoch_{last_successfull_epoch}_tf'

In [None]:
# Show the two export folder names.
print(MODEL_NAME_PYTORCH)
print(MODEL_NAME_TF)

model_epoch_9_pytorch
model_epoch_9_tf


In [None]:
# Save the fine-tuned PyTorch model and its tokenizer. The conversion to
# TensorFlow happens in a later cell, which reloads this folder with from_pt=True.
from transformers import TFBertForTokenClassification

tokenizer.save_pretrained(f'./{PROJECT_NAME}/{MODEL_NAME_PYTORCH}_tokenizer/')
# `saved_model` / `save_format` are TensorFlow export options; `model` is a
# PyTorch model here, so they are not passed.
model.save_pretrained(f'./{PROJECT_NAME}/{MODEL_NAME_PYTORCH}')

**IMPORTANT** If it's a domain-specific model, we need to load and save it through a wrapper class that changes the `input_signature`, so that it can only be loaded with `sparknlp_jsl.xx.XXBertForTokenClassification`.

In [None]:
from transformers import TFBertForTokenClassification
import tensorflow as tf

# Creation of a subclass in order to define a new serving signature
class DomainSpecificModel(TFBertForTokenClassification):
    """TFBertForTokenClassification with a custom serving signature.

    The signature declares three int32 inputs of shape (None, None); the
    token-id input is named ``medical_input_ids``.
    """
    # NOTE(review): the input name says "medical" although this notebook exports
    # a finance model - presumably the licensed loader expects exactly this
    # name; confirm before changing it.
    # Decorate the serving method with the new input_signature
    # an input_signature represents the name, the data type and the shape of an expected input
    @tf.function(input_signature=[{
        "input_ids": tf.TensorSpec((None, None), tf.int32, name="medical_input_ids"),
        "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
        "token_type_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"),

    }])
    def serving(self, inputs):
        """Run the model on ``inputs`` and return the formatted serving output."""
        # call the model to process the inputs
        output = self.call(inputs)

        # return the formated output
        return self.serving_output(output)

In [None]:
# Load the saved PyTorch weights into the TensorFlow subclass (from_pt=True) ...
loaded_model = DomainSpecificModel.from_pretrained(f'./{PROJECT_NAME}/{MODEL_NAME_PYTORCH}', from_pt=True)
# ... and export it with saved_model=True; later cells read the export from
# the saved_model/1 subfolder of this directory.
loaded_model.save_pretrained(f'./{PROJECT_NAME}/{MODEL_NAME_TF}', saved_model=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model DomainSpecificModel: ['bert.embeddings.position_ids']
- This IS expected if you are initializing DomainSpecificModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DomainSpecificModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of DomainSpecificModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DomainSpecificModel for predictions without further training.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method




### Save label mapping

In [None]:
# Order the tag names by their index, so that line i of labels.txt holds the
# tag with index i.
labels = [tag for tag, _ in sorted(tag2idx.items(), key=lambda item: item[1])]

print (labels)

# Write one label per line into the assets folder of the TensorFlow export.
labels_path = f'./{PROJECT_NAME}/{MODEL_NAME_TF}/saved_model/1/assets/labels.txt'
with open(labels_path, 'w') as f:
    f.write('\n'.join(labels))

['B-EXPENSE_DECREASE', 'I-EXPENSE_DECREASE', 'B-EXPENSE', 'I-PERCENTAGE', 'I-PROFIT_INCREASE', 'B-PROFIT', 'O', 'I-AMOUNT', 'I-EXPENSE_INCREASE', 'B-DATE', 'I-FISCAL_YEAR', 'B-PROFIT_DECLINE', 'B-PERCENTAGE', 'B-EXPENSE_INCREASE', 'B-FISCAL_YEAR', 'B-CURRENCY', 'I-PROFIT', 'I-EXPENSE', 'B-PROFIT_INCREASE', 'I-DATE', 'I-PROFIT_DECLINE', 'B-AMOUNT', 'PAD']


### Copy files in tf model's assets

In [None]:
# vocab.txt as written by tokenizer.save_pretrained for the PyTorch export
vocab_pth = f"./{PROJECT_NAME}/{MODEL_NAME_PYTORCH}_tokenizer/vocab.txt"
# assets folder of the TensorFlow export (labels.txt is written there too)
saved_model_pth = f'./{PROJECT_NAME}/{MODEL_NAME_TF}/saved_model/1/assets/'

# Copy the vocabulary next to the exported model.
! cp $vocab_pth $saved_model_pth

# Now load the saved model in Spark NLP and save it properly

In [None]:
domain = 'FINANCE' # or 'LEGAL' or 'OPENSOURCE'

# Pick the Spark NLP annotator class used to import the model, together with
# the JVM class name written into the saved model's metadata by a later cell.
# Every branch sets both names so that the metadata cell works for any domain.
if domain == 'OPENSOURCE':
  classifier_class = BertForTokenClassification
  classifier_classpath = "com.johnsnowlabs.nlp.annotators.classifier.dl.BertForTokenClassification"
elif domain == 'LEGAL':
  classifier_class = sparknlp_jsl.legal.LegalBertForTokenClassification
  classifier_classpath = "com.johnsnowlabs.legal.token_classification.ner.LegalBertForTokenClassification"
elif domain == 'FINANCE':
  classifier_class = sparknlp_jsl.finance.FinanceBertForTokenClassification
  classifier_classpath = "com.johnsnowlabs.finance.token_classification.ner.FinanceBertForTokenClassification"
else:
  # Fail here rather than with a NameError in a later cell.
  raise ValueError(f"Unknown domain {domain!r}; expected 'FINANCE', 'LEGAL' or 'OPENSOURCE'.")

In [None]:
# Display the selected annotator class.
classifier_class

sparknlp_jsl.finance.token_classification.ner.finance_bert_for_token_classifier.FinanceBertForTokenClassification

In [None]:
# Display the JVM class name that will be written into the model metadata.
classifier_classpath

'com.johnsnowlabs.finance.token_classification.ner.FinanceBertForTokenClassification'

In [None]:
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

# Import the TensorFlow export with the annotator class selected above.
tokenClassifier = classifier_class.loadSavedModel(
     f'./{PROJECT_NAME}/{MODEL_NAME_TF}/saved_model/1',
     spark
 )\
 .setInputCols(["sentence",'token'])\
  .setOutputCol("ner")\
  .setCaseSensitive(True)\
  .setMaxSentenceLength(MAX_LEN)  # same length the training data was padded to

In [None]:
# Save the imported annotator in Spark NLP format, replacing any previous save.
tokenClassifier.write().overwrite().save(f"./{PROJECT_NAME}/{MODEL_NAME_TF}_spark_nlp")

In [None]:
import json

# Rewrite the 'class' entry of the saved model's metadata so it names the
# selected domain-specific annotator class.
metadata_path = f"./{PROJECT_NAME}/{MODEL_NAME_TF}_spark_nlp/metadata/part-00000"
with open(metadata_path, 'r') as fr:
  metadata = json.load(fr)
metadata['class'] = classifier_classpath
with open(metadata_path, 'w') as fw:
  # json.dump returns None, so its result is not assigned back to `metadata`.
  json.dump(metadata, fw)

In [None]:
!rm ./{PROJECT_NAME}/{MODEL_NAME_TF}_spark_nlp/metadata/.*.crc

In [None]:
# List the metadata folder (including hidden files) to check the result.
!ls -lah ./{PROJECT_NAME}/{MODEL_NAME_TF}_spark_nlp/metadata/

total 12K
drwxr-xr-x 2 root root 4.0K Jan 12 18:08 .
drwxr-xr-x 4 root root 4.0K Jan 12 18:08 ..
-rw-r--r-- 1 root root  479 Jan 12 18:08 part-00000
-rw-r--r-- 1 root root    0 Jan 12 18:07 _SUCCESS


# Test the imported model in Spark NLP

In [None]:
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Stage 1: wrap the raw "text" column as a document annotation.
documentAssembler = DocumentAssembler()\
  .setInputCol("text")\
  .setOutputCol("document")

# Stage 2: split each document into tokens.
sparktokenizer = Tokenizer()\
  .setInputCols("document")\
  .setOutputCol("token")

from sparknlp_jsl.annotator import *

# Stage 3: the model saved above, reloaded from disk. Input and output columns
# are set again here to match this pipeline's column names.
tokenClassifier = classifier_class.load(f"./{PROJECT_NAME}/{MODEL_NAME_TF}_spark_nlp")\
  .setInputCols("token", "document")\
  .setOutputCol("label")\
  .setCaseSensitive(True)


In [None]:
# Chain the three stages: document -> tokens -> token labels.
pipeline =  Pipeline(stages=[
  documentAssembler,
  sparktokenizer,
  tokenClassifier
    ]
)

In [None]:
# Fit on a one-row DataFrame holding an empty text to obtain a PipelineModel.
p_model = pipeline.fit(spark.createDataFrame(pd.DataFrame({'text': ['']})))

In [None]:
# Run the pipeline on one example sentence.
text = """In 2019 we released Evolution, the new platform that supersedes..."""
res = p_model.transform(spark.createDataFrame([[text]]).toDF("text"))

res.show()

+--------------------+--------------------+--------------------+--------------------+
|                text|            document|               token|               label|
+--------------------+--------------------+--------------------+--------------------+
|In 2019 we releas...|[{document, 0, 65...|[{token, 0, 1, In...|[{named_entity, 0...|
+--------------------+--------------------+--------------------+--------------------+



In [None]:
from pyspark.sql import functions as F

# Pair every token with its predicted label and show one pair per row.
res.select(F.explode(F.arrays_zip('token.result', 'label.result')).alias("cols")) \
               .select(F.expr("cols['0']").alias("token"),
                       F.expr("cols['1']").alias("ner_label"))\
               .show(20, truncate=100)

+----------+---------+
|     token|ner_label|
+----------+---------+
|        In|   B-DATE|
|      2019|   B-DATE|
|        we|        O|
|  released|        O|
| Evolution|        O|
|         ,|        O|
|       the|        O|
|       new|        O|
|  platform|        O|
|      that|        O|
|supersedes|        O|
|       ...|        O|
+----------+---------+



In [None]:
# Export the saved model's folder name so the shell cell below can expand it.
os.environ['SPARKNLP_TF_MODEL'] = MODEL_NAME_TF + "_spark_nlp"

In [None]:
# Zip the Spark NLP model folder; the archive is created inside the project folder.
!cd $PROJECT_NAME && zip -r $PROJECT_NAME.zip $SPARKNLP_TF_MODEL

  adding: model_epoch_9_tf_spark_nlp/ (stored 0%)
  adding: model_epoch_9_tf_spark_nlp/bert_classification_tensorflow (deflated 8%)
  adding: model_epoch_9_tf_spark_nlp/metadata/ (stored 0%)
  adding: model_epoch_9_tf_spark_nlp/metadata/_SUCCESS (stored 0%)
  adding: model_epoch_9_tf_spark_nlp/metadata/part-00000 (deflated 39%)
  adding: model_epoch_9_tf_spark_nlp/fields/ (stored 0%)
  adding: model_epoch_9_tf_spark_nlp/fields/labels/ (stored 0%)
  adding: model_epoch_9_tf_spark_nlp/fields/labels/part-00001 (deflated 53%)
  adding: model_epoch_9_tf_spark_nlp/fields/labels/._SUCCESS.crc (stored 0%)
  adding: model_epoch_9_tf_spark_nlp/fields/labels/_SUCCESS (stored 0%)
  adding: model_epoch_9_tf_spark_nlp/fields/labels/part-00000 (deflated 51%)
  adding: model_epoch_9_tf_spark_nlp/fields/labels/.part-00000.crc (stored 0%)
  adding: model_epoch_9_tf_spark_nlp/fields/labels/.part-00001.crc (stored 0%)
  adding: model_epoch_9_tf_spark_nlp/fields/signatures/ (stored 0%)
  adding: model_epoc

# MOUNT DRIVE AND SAVE YOUR MODEL TO IT

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!cp financial_operations/financial_operations.zip /content/gdrive/MyDrive/