![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# Finance BertForTokenClassification
Using Hugging Face and importing it to Finance NLP for scalability.

This is a transformer-based approach, which usually returns much bigger models (10x) compared to NerModel, but it can improve the performance over NerModel. We don't carry out evaluation in this notebook, only training with full data and export into Spark NLP. To check evaluation, please check previous notebook.

# Installation

In [1]:
!pip -q install seqeval

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 KB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [2]:
! pip install transformers==4.8.1
! pip install pyspark==3.1.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.8.1
  Downloading transformers-4.8.1-py3-none-any.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 KB[0m [31m54.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m79.5 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sacremo

# Setting name of the project

In [3]:
import os
os.environ['PROJECT_NAME'] = 'financial_operations'
PROJECT_NAME = os.getenv('PROJECT_NAME')

In [4]:
PROJECT_NAME

'financial_operations'

# Imports

In [5]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from google.colab import files

import pandas as pd
import numpy as np
from tqdm import tqdm, trange

import transformers
from transformers import BertForTokenClassification, TFBertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup

from sklearn.metrics import classification_report

## Setting up Torch

In [6]:
torch.__version__

'1.13.0+cu116'

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

torch.cuda.get_device_name(0)

'Tesla T4'

# Check that files are available

In [14]:
! wget -q https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings_JSL/Legal/data/conll_noO.conll

In [8]:
!head -n 20 conll_noO.conll

head: cannot open 'conll_noO.conll' for reading: No such file or directory


# Creating folders for logs and checkpoints

In [9]:
!mkdir {PROJECT_NAME}

In [10]:
!mkdir {PROJECT_NAME}/logs

# Starting a Spark Session for SparkNLP

In [11]:
import json
import os

from google.colab import files

license_keys = files.upload()

with open(list(license_keys.keys())[0]) as f:
    license_keys = json.load(f)

# Defining license key-value pairs as local variables
locals().update(license_keys)

# Adding license key-value pairs to environment variables
os.environ.update(license_keys)

Saving spark_nlp_for_healthcare_spark_ocr_7187.json to spark_nlp_for_healthcare_spark_ocr_7187.json


In [12]:
# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m448.4/448.4 KB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m358.9/358.9 KB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.6/95.6 KB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.9/66.9 KB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [13]:
import json
import os

import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline,PipelineModel

import warnings
warnings.filterwarnings('ignore')

params = {"spark.driver.memory":"16G", 
          "spark.kryoserializer.buffer.max":"2000M", 
          "spark.driver.maxResultSize":"2000M"} 

print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

spark = sparknlp_jsl.start(license_keys['SECRET'],params=params)

spark

Spark NLP Version : 4.2.4
Spark NLP_JSL Version : 4.2.4


# Convert JSL conlls in dataframe format

In [15]:
from sparknlp.training import CoNLL

def get_conll_df(pth):
  data = CoNLL().readDataset(spark, pth)
  data = data.withColumn("sentence_idx", F.monotonically_increasing_id())

  df = data.select('sentence_idx', F.explode(F.arrays_zip('token.result','label.result','pos.result')).alias("cols")) \
  .select('sentence_idx',
          F.expr("cols['0']").alias("word"),
          F.expr("cols['1']").alias("tag"),
          F.expr("cols['2']").alias("pos")).toPandas()
  return df

train_data_df = get_conll_df('./conll_noO.conll')

In [16]:
train_data_df['tag'].value_counts()

type="application/javascript"                                     39
viewBox="0                                                        39
d-block                                                           26
                                                                  22
flex-shrink-0                                                     13
                                                                  ..
rel="author"                                                       1
color-fg-muted">/</span>                                           1
flex-self-stretch">                                                1
href="/JohnSnowLabs/spark-nlp-workshop">spark-nlp-workshop</a>     1
aria-live="polite"                                                 1
Name: tag, Length: 246, dtype: int64

# First, train / fine-tune a model on the dataset

## Iterating function to feed the model with sentences
Converting conll sentence annotations to tuples (word, pos, tag)

In [17]:
## convert conll file to sentences

class SentenceGetter(object):
    
    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda s: [(w,p, t) for w,p, t in zip(s["word"].values.tolist(),
                                                       s['pos'].values.tolist(),
                                                        s["tag"].values.tolist())]
        self.grouped = self.dataset.groupby("sentence_idx").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

train_getter = SentenceGetter(train_data_df)

## Getting sentences and labels
- Sentences: concatenation of first element of tuple (word)
- Labels: concatenation of second element of tuple (label)

In [18]:
# Sentences 
train_sentences = [[word[0] for word in sentence] for sentence in train_getter.sentences]
print("Example of train sentence:")
print (train_sentences[5])

# Labels
train_labels = [[s[2] for s in sentence] for sentence in train_getter.sentences]
print("Example of train sentence:")
print(train_labels[5])

Example of train sentence:
['<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script', '<script']
Example of train sentence:
['type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"'

## Converting tags to numeric values with a dict

In [19]:
tag_values = list(set(train_data_df["tag"].values))
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [20]:
print(tag_values[:10])
print(tag2idx)

['', '3.073L9.573.677A.25.25', 'data-turbo-transient="true"', 'title="GitHub">', 'btn-octicon', 'crossOrigin="use-credentials">', 'class="">', 'data-view-component="true"', 'vulnerabilities', 'position-relative']
{'': 0, '3.073L9.573.677A.25.25': 1, 'data-turbo-transient="true"': 2, 'title="GitHub">': 3, 'btn-octicon': 4, 'crossOrigin="use-credentials">': 5, 'class="">': 6, 'data-view-component="true"': 7, 'vulnerabilities': 8, 'position-relative': 9, 'dark"': 10, 'data-filter-placeholder="Filter': 11, 'env-production': 12, 'tooltipped': 13, 'rounded-2': 14, 'sign': 15, 'mb-2': 16, '4.22a.75.75': 17, 'flex-justify-end': 18, 'does': 19, 'data-light-theme="light"': 20, 'flex-auto': 21, 'overflow-hidden': 22, 'd-flex">': 23, 'role="img"': 24, 'my-1">By': 25, 'd-flex': 26, '2.5a.75.75': 27, 'data-hotkey="t"': 28, 'data-hpc>': 29, 'href="https://github.com/pricing"': 30, 'tooltipped-nw': 31, 'data-turbo-track="reload">': 32, 'px-lg-0': 33, '1A1.75': 34, 'all': 35, 'border-0': 36, 'container

## Model metadata

### Bulding on top of biobert

In [21]:
MODEL_TO_TRAIN = 'yiyanghkust/finbert-pretrain'

### Hyperparam settings

In [22]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 2e-05

## Instantiating the proper tokenizer

IMPORTANT! Pay attention to the `do_lower_case` param, and set it to True if you have a lowercased language model. That means you will always need to do `lower()` on your inference texts!

If the language model is not lowercase only, then leave it to False.

In [23]:
tokenizer = BertTokenizer.from_pretrained(MODEL_TO_TRAIN, do_lower_case=False)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

### Tokenize and extend the labels in case a word is split

In [24]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

## Tokenize and get tokens and labels

In [25]:
train_tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(train_sentences, train_labels)
]

train_tokenized_texts_tokens = [token_label_pair[0] for token_label_pair in train_tokenized_texts_and_labels]

train_tokenized_texts_labels = [token_label_pair[1] for token_label_pair in train_tokenized_texts_and_labels]

In [26]:
print(train_tokenized_texts_tokens[5])
print(train_tokenized_texts_labels[5])

['<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script', '<', 'script']
['type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"', 'type="application/javascript"',

## Converting tokens to id && padding sentences to have fixed length

In [27]:
train_input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in train_tokenized_texts_tokens],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

train_tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in train_tokenized_texts_labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [28]:
print(train_input_ids[5])
print(train_tags[5])

[30838 11356 30838 11356 30838 11356 30838 11356 30838 11356 30838 11356
 30838 11356 30838 11356 30838 11356 30838 11356 30838 11356 30838 11356
 30838 11356 30838 11356 30838 11356 30838 11356 30838 11356 30838 11356
 30838 11356 30838 11356 30838 11356 30838 11356 30838 11356 30838 11356
 30838 11356 30838 11356 30838 11356 30838 11356 30838 11356 30838 11356
 30838 11356 30838 11356 30838 11356 30838 11356 30838 11356 30838 11356
 30838 11356 30838 11356     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

## Now that sentences are padded, I need to prevent attention from seeing pads (id=0)

In [29]:
train_attention_masks = [[float(i != 0.0) for i in ii] for ii in train_input_ids]

In [30]:
print(train_attention_masks[5])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

### Double checking that pairing input-mask is in place

In [31]:
for i,m in zip(train_input_ids[5], train_attention_masks[5]):
  print(f"Token id: {i}\Token mask: {m}")

Token id: 30838\Token mask: 1.0
Token id: 11356\Token mask: 1.0
Token id: 30838\Token mask: 1.0
Token id: 11356\Token mask: 1.0
Token id: 30838\Token mask: 1.0
Token id: 11356\Token mask: 1.0
Token id: 30838\Token mask: 1.0
Token id: 11356\Token mask: 1.0
Token id: 30838\Token mask: 1.0
Token id: 11356\Token mask: 1.0
Token id: 30838\Token mask: 1.0
Token id: 11356\Token mask: 1.0
Token id: 30838\Token mask: 1.0
Token id: 11356\Token mask: 1.0
Token id: 30838\Token mask: 1.0
Token id: 11356\Token mask: 1.0
Token id: 30838\Token mask: 1.0
Token id: 11356\Token mask: 1.0
Token id: 30838\Token mask: 1.0
Token id: 11356\Token mask: 1.0
Token id: 30838\Token mask: 1.0
Token id: 11356\Token mask: 1.0
Token id: 30838\Token mask: 1.0
Token id: 11356\Token mask: 1.0
Token id: 30838\Token mask: 1.0
Token id: 11356\Token mask: 1.0
Token id: 30838\Token mask: 1.0
Token id: 11356\Token mask: 1.0
Token id: 30838\Token mask: 1.0
Token id: 11356\Token mask: 1.0
Token id: 30838\Token mask: 1.0
Token id

## Arrays to tensors transformation

In [32]:
tr_inputs = torch.tensor(train_input_ids)
tr_tags = torch.tensor(train_tags)
tr_masks = torch.tensor(train_attention_masks)

In [33]:
print(tr_inputs[5])
print(tr_tags[5])
print(tr_masks[5])

tensor([30838, 11356, 30838, 11356, 30838, 11356, 30838, 11356, 30838, 11356,
        30838, 11356, 30838, 11356, 30838, 11356, 30838, 11356, 30838, 11356,
        30838, 11356, 30838, 11356, 30838, 11356, 30838, 11356, 30838, 11356,
        30838, 11356, 30838, 11356, 30838, 11356, 30838, 11356, 30838, 11356,
        30838, 11356, 30838, 11356, 30838, 11356, 30838, 11356, 30838, 11356,
        30838, 11356, 30838, 11356, 30838, 11356, 30838, 11356, 30838, 11356,
        30838, 11356, 30838, 11356, 30838, 11356, 30838, 11356, 30838, 11356,
        30838, 11356, 30838, 11356, 30838, 11356,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

### Checking sizes match

#### Training

In [34]:
len([x for x in tr_inputs[5] if x != 0]) # How many NO_PADs we have?

76

In [35]:
len([x for x in tr_tags[5] if x != 7])

256

In [36]:
len([x for x in tr_masks[5] if x != 0])

76

## Creating the DataLoaders to feed the batches during training

In [37]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)

# Loading the transformer model

In [38]:
transformers.__version__

'4.8.1'

In [39]:
model = transformers.BertForTokenClassification.from_pretrained(
    MODEL_TO_TRAIN,
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)
model.to(device)

Downloading:   0%|          | 0.00/359 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of the model checkpoint at yiyanghkust/finbert-pretrain were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

## Setting up the optimizer.
We want to optimize weight values, so we add a decay.
We can get all the weights from `model_named_parameters()`
But we need to remove `bias`, `gamma` and `beta` which are Layer Normalization parameters we don't want to touch.

Activate `FULL_TINETUNING` to modify weights in all the layers.

In [40]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)


## Setting up the scheduler
It will manage Optimizer and Learning Rate changes. We use warmup

In [41]:
epochs = EPOCHS
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)


Now, let's train

In [42]:

## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

for EPOCH in trange(epochs, desc="Epoch"):
    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    tr_loss = f"Average train loss: {str(avg_train_loss)}\n"

    if EPOCH % 5 == 0:
      # Saving partial models (this creates the folder too)    
      tokenizer.save_pretrained(f'{PROJECT_NAME}/{str(EPOCH)}/tokenizer/')
      model.save_pretrained(save_directory=f'{PROJECT_NAME}/{str(EPOCH)}/',
                            save_config=True, state_dict=model.state_dict)
      # Saving checkpoint in case it crashes, to restore work
      torch.save({
          'epoch': EPOCH,
          'model_state_dict': model.state_dict(),
          'optimizer_state_dict': optimizer.state_dict(),
          'loss': avg_train_loss,
          }, f'{PROJECT_NAME}/{str(EPOCH)}/checkpoint.pth')
    else:
      print("Skipping saving...")

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    
    if EPOCH % 5 == 0:
      # Saving losses log
      with open(f'{PROJECT_NAME}/logs/epoch_' + str(EPOCH) + '_loss.log', 'a') as f:
        f.write(tr_loss)

    # Printing also to stdout
    print(tr_loss)


Epoch:  10%|█         | 1/10 [00:18<02:49, 18.86s/it]

Average train loss: 5.278835713863373



Epoch:  20%|██        | 2/10 [00:28<01:47, 13.42s/it]

Skipping saving...
Average train loss: 4.50340610742569



Epoch:  30%|███       | 3/10 [00:38<01:22, 11.72s/it]

Skipping saving...
Average train loss: 4.00476148724556



Epoch:  40%|████      | 4/10 [00:48<01:06, 11.02s/it]

Skipping saving...
Average train loss: 3.6690904796123505



Epoch:  50%|█████     | 5/10 [00:58<00:53, 10.67s/it]

Skipping saving...
Average train loss: 3.520973891019821



Epoch:  60%|██████    | 6/10 [01:14<00:50, 12.73s/it]

Average train loss: 3.310168117284775



Epoch:  70%|███████   | 7/10 [01:25<00:36, 12.01s/it]

Skipping saving...
Average train loss: 3.160410374403



Epoch:  80%|████████  | 8/10 [01:36<00:23, 11.62s/it]

Skipping saving...
Average train loss: 3.076319247484207



Epoch:  90%|█████████ | 9/10 [01:47<00:11, 11.40s/it]

Skipping saving...
Average train loss: 2.984026178717613



Epoch: 100%|██████████| 10/10 [01:57<00:00, 11.78s/it]

Skipping saving...
Average train loss: 2.8937235176563263






## Now load the model as TF and save properly

In [43]:
last_successfull_epoch = len(loss_values) - 1
if last_successfull_epoch < 0:
  last_successfull_epoch = None 

In [44]:
if last_successfull_epoch is None:
  print("No epochs finished successfully.")
else:
  print(f"Last successfull epoch: {str(last_successfull_epoch)}")

Last successfull epoch: 9


In [45]:
# first save the model as pytorch model (we'll cast later)
MODEL_NAME_PYTORCH = 'model_epoch_'+str(last_successfull_epoch)+'_pytorch'
MODEL_NAME_TF = 'model_epoch_'+str(last_successfull_epoch)+'_tf'

In [46]:
print(MODEL_NAME_PYTORCH)
print(MODEL_NAME_TF)

model_epoch_9_pytorch
model_epoch_9_tf


In [47]:
# now load the model as TF and save properly
from transformers import TFBertForTokenClassification

tokenizer.save_pretrained(f'./{PROJECT_NAME}/{MODEL_NAME_PYTORCH}_tokenizer/')
model.save_pretrained(f'./{PROJECT_NAME}/{MODEL_NAME_PYTORCH}', saved_model=True, save_format='tf')

**IMPORTANT** If it's a domain-specific model, we need to use an interface to load and save it, that will change the input_signature so that it can only be loaded with sparknlp_jsl.xx.XXBertForTokenClassification

In [48]:
from transformers import TFBertForTokenClassification
import tensorflow as tf

# Creation of a subclass in order to define a new serving signature
class DomainSpecificModel(TFBertForTokenClassification):
    # Decorate the serving method with the new input_signature
    # an input_signature represents the name, the data type and the shape of an expected input
    @tf.function(input_signature=[{
        "input_ids": tf.TensorSpec((None, None), tf.int32, name="medical_input_ids"),
        "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
        "token_type_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"),

    }])
    def serving(self, inputs):
        # call the model to process the inputs
        output = self.call(inputs)

        # return the formated output
        return self.serving_output(output)

In [49]:
loaded_model = DomainSpecificModel.from_pretrained(f'./{PROJECT_NAME}/{MODEL_NAME_PYTORCH}', from_pt=True)
loaded_model.save_pretrained(f'./{PROJECT_NAME}/{MODEL_NAME_TF}', saved_model=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model DomainSpecificModel: ['bert.embeddings.position_ids']
- This IS expected if you are initializing DomainSpecificModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DomainSpecificModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of DomainSpecificModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DomainSpecificModel for predictions without further training.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method




### Save label mapping

In [50]:
labels = sorted(tag2idx, key=tag2idx.get)

print (labels)

with open(f'./{PROJECT_NAME}/{MODEL_NAME_TF}/saved_model/1/assets/labels.txt', 'w') as f:
    f.write('\n'.join(labels))

['', '3.073L9.573.677A.25.25', 'data-turbo-transient="true"', 'title="GitHub">', 'btn-octicon', 'crossOrigin="use-credentials">', 'class="">', 'data-view-component="true"', 'vulnerabilities', 'position-relative', 'dark"', 'data-filter-placeholder="Filter', 'env-production', 'tooltipped', 'rounded-2', 'sign', 'mb-2', '4.22a.75.75', 'flex-justify-end', 'does', 'data-light-theme="light"', 'flex-auto', 'overflow-hidden', 'd-flex">', 'role="img"', 'my-1">By', 'd-flex', '2.5a.75.75', 'data-hotkey="t"', 'data-hpc>', 'href="https://github.com/pricing"', 'tooltipped-nw', 'data-turbo-track="reload">', 'px-lg-0', '1A1.75', 'all', 'border-0', 'container-xl', 'data-tab-item="i1issues-tab"', '1.75', 'my-1"></div></span>', 'developers', 'href="/JohnSnowLabs/spark-nlp-workshop">spark-nlp-workshop</a>', 'py-4', 'hide-xl', 'flash-warn', 'data-tab-item="i6security-tab"', 'data-hydro-click-hmac="8b04d14cb260ea481882d62122584d47371ff9520c5b5507f3f8c984908640b3"', '"', 'href="/JohnSnowLabs/spark-nlp-worksho

### Copy files in tf model's assets

In [51]:
vocab_pth = f"./{PROJECT_NAME}/{MODEL_NAME_PYTORCH}_tokenizer/vocab.txt"
saved_model_pth = f'./{PROJECT_NAME}/{MODEL_NAME_TF}/saved_model/1/assets/'

! cp $vocab_pth $saved_model_pth

# Now load the saved model in Spark NLP and save it properly

In [52]:
domain = 'FINANCE' # or 'FINANCE' or 'OPENSOURCE'

if domain == 'OPENSOURCE':
  classifier_class = BertForTokenClassification
elif domain == 'LEGAL':
  classifier_class = sparknlp_jsl.legal.LegalBertForTokenClassification
  classifier_classpath = "com.johnsnowlabs.legal.token_classification.ner.LegalBertForTokenClassification"
elif domain == 'FINANCE':
  classifier_class = sparknlp_jsl.finance.FinanceBertForTokenClassification
  classifier_classpath = "com.johnsnowlabs.finance.token_classification.ner.FinanceBertForTokenClassification"

In [53]:
classifier_class

sparknlp_jsl.finance.token_classification.ner.finance_bert_for_token_classifier.FinanceBertForTokenClassification

In [54]:
classifier_classpath

'com.johnsnowlabs.finance.token_classification.ner.FinanceBertForTokenClassification'

In [55]:
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

# For Finance
tokenClassifier = classifier_class.loadSavedModel(
     f'./{PROJECT_NAME}/{MODEL_NAME_TF}/saved_model/1',
     spark
 )\
 .setInputCols(["sentence",'token'])\
  .setOutputCol("ner")\
  .setCaseSensitive(True)\
  .setMaxSentenceLength(256)

In [56]:
tokenClassifier.write().overwrite().save(f"./{PROJECT_NAME}/{MODEL_NAME_TF}_spark_nlp")

In [57]:
import json
with open(f"./{PROJECT_NAME}/{MODEL_NAME_TF}_spark_nlp/metadata/part-00000", 'r') as fr:
  metadata = json.load(fr)
metadata['class'] = classifier_classpath
with open(f"./{PROJECT_NAME}/{MODEL_NAME_TF}_spark_nlp/metadata/part-00000", 'w') as fw:
  metadata = json.dump(metadata, fw)

In [58]:
!rm ./{PROJECT_NAME}/{MODEL_NAME_TF}_spark_nlp/metadata/.*.crc

In [59]:
!ls -lah ./{PROJECT_NAME}/{MODEL_NAME_TF}_spark_nlp/metadata/

total 12K
drwxr-xr-x 2 root root 4.0K Jan 11 16:19 .
drwxr-xr-x 4 root root 4.0K Jan 11 16:19 ..
-rw-r--r-- 1 root root  479 Jan 11 16:19 part-00000
-rw-r--r-- 1 root root    0 Jan 11 16:18 _SUCCESS


# Test the imported model in Spark NLP

In [60]:
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

documentAssembler = DocumentAssembler()\
  .setInputCol("text")\
  .setOutputCol("document")

sparktokenizer = Tokenizer()\
  .setInputCols("document")\
  .setOutputCol("token")

from sparknlp_jsl.annotator import *

tokenClassifier = classifier_class.load(f"./{PROJECT_NAME}/{MODEL_NAME_TF}_spark_nlp")\
  .setInputCols("token", "document")\
  .setOutputCol("label")\
  .setCaseSensitive(True)


In [61]:
pipeline =  Pipeline(stages=[
  documentAssembler,
  sparktokenizer,
  tokenClassifier
    ]
)

In [62]:
p_model = pipeline.fit(spark.createDataFrame(pd.DataFrame({'text': ['']})))

In [63]:
text = """Fox grants to Licensee a limited, exclusive (except as otherwise may be provided in this Agreement), 
non-transferable (except as permitted in Paragraph 17(d)) right and license""".lower()
res = p_model.transform(spark.createDataFrame([[text]]).toDF("text"))

res.show()

+--------------------+--------------------+--------------------+--------------------+
|                text|            document|               token|               label|
+--------------------+--------------------+--------------------+--------------------+
|fox grants to lic...|[{document, 0, 17...|[{token, 0, 2, fo...|[{named_entity, 0...|
+--------------------+--------------------+--------------------+--------------------+



In [64]:
from pyspark.sql import functions as F

res.select(F.explode(F.arrays_zip('token.result', 'label.result')).alias("cols")) \
               .select(F.expr("cols['0']").alias("token"),
                       F.expr("cols['1']").alias("ner_label"))\
               .show(20, truncate=100)

+----------------+----------------------------------------------------------------------------------------------------+
|           token|                                                                                           ner_label|
+----------------+----------------------------------------------------------------------------------------------------+
|             fox|                                                                                             d-block|
|          grants|                                                                                             d-block|
|              to|                                                                                          flash-warn|
|        licensee|                                                                                          col-lg-8">|
|               a|                                                                                             d-block|
|         limited|                      

In [65]:
os.environ['SPARKNLP_TF_MODEL'] = MODEL_NAME_TF + "_spark_nlp"

In [66]:
!cd $PROJECT_NAME && zip -r $PROJECT_NAME.zip $SPARKNLP_TF_MODEL

  adding: model_epoch_9_tf_spark_nlp/ (stored 0%)
  adding: model_epoch_9_tf_spark_nlp/bert_classification_tensorflow (deflated 8%)
  adding: model_epoch_9_tf_spark_nlp/metadata/ (stored 0%)
  adding: model_epoch_9_tf_spark_nlp/metadata/_SUCCESS (stored 0%)
  adding: model_epoch_9_tf_spark_nlp/metadata/part-00000 (deflated 39%)
  adding: model_epoch_9_tf_spark_nlp/fields/ (stored 0%)
  adding: model_epoch_9_tf_spark_nlp/fields/vocabulary/ (stored 0%)
  adding: model_epoch_9_tf_spark_nlp/fields/vocabulary/.part-00000.crc (stored 0%)
  adding: model_epoch_9_tf_spark_nlp/fields/vocabulary/.part-00001.crc (stored 0%)
  adding: model_epoch_9_tf_spark_nlp/fields/vocabulary/_SUCCESS (stored 0%)
  adding: model_epoch_9_tf_spark_nlp/fields/vocabulary/part-00000 (deflated 78%)
  adding: model_epoch_9_tf_spark_nlp/fields/vocabulary/._SUCCESS.crc (stored 0%)
  adding: model_epoch_9_tf_spark_nlp/fields/vocabulary/part-00001 (deflated 78%)
  adding: model_epoch_9_tf_spark_nlp/fields/labels/ (stored 

# MOUNT DRIVE AND SAVE YOUR MODEL TO IT

In [67]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [68]:
!cp financial_operations/financial_operations.zip /content/gdrive/MyDrive/