In [None]:
#requirements
!pip install sentencepiece
!pip install tokenizers
!pip install transformers
!pip install nltk

import json
import os
import re #regular expression
import string
import urllib
import urllib.request
from pathlib import Path

import torch.nn as nn
import torch
import pandas as pd
import numpy as np
import transformers
from transformers import BertTokenizer, BertForPreTraining, BertConfig, AdamW
from transformers import AutoTokenizer, BertForMaskedLM
from transformers import TFBertForTokenClassification, TFTrainer, TFTrainingArguments
from tokenizers import BertWordPieceTokenizer
from nltk.tokenize import WhitespaceTokenizer
import tensorflow as tf
from torch.utils.data import Dataset , DataLoader
# /https://huggingface.co/docs/transformers/model_doc/bert#transformers.TFBertForTokenClassification.**kwargs

In [17]:
#PARAMETERS
# Tokens with a reserved meaning for the tokenizer vocabulary.
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"]
# special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])

# Vocabulary size; 30,522 is BERT's default and may be tuned.
vocab_size = 30_522

# Upper bound on the sequence length. A smaller value trains faster and
# leaves room for a larger batch size.
max_length = 512

# Whether samples longer than max_length get truncated.
truncate_longer_samples = True

# Training hyper-parameters.
EPOCHS = 25
ACCUM_STEPS = 5
NUM_EPOCHS = 100
BATCH_SIZE = 4
SEED = 42
LEARNING_RATE = 5e-5

# Default for the `return_tensors` argument of the preprocessing helpers.
RETURN_TENSORS = False

In [18]:
# Define tokenizer
# Shared tokenizer instance; convertdata_pd and constructor2 both use it as the
# default value of their `tokenizer` argument.
# NOTE(review): this loads the *cased* checkpoint while clean() lower-cases the
# text before it is tokenized — confirm that this combination is intended.
TOKENIZER = BertTokenizer.from_pretrained('bert-base-cased')
# TOKENIZER = WhitespaceTokenizer()

In [19]:
# Download data
def _fetch_json(url):
    """Download `url` and return the parsed JSON document.

    The HTTP response is used as a context manager so the connection is
    released even when reading or parsing the body fails.
    """
    with urllib.request.urlopen(url) as response:
        return json.load(response)

##fn has unique paragraphs per row 409
data_fn = _fetch_json('https://raw.githubusercontent.com/MikeDoes/ETH_NLP_Project/main/fin_num_merged.json')

##fn3 has duplicated paragraphs per number found 1100 original
data_fn3 = _fetch_json('https://raw.githubusercontent.com/MikeDoes/ETH_NLP_Project/main/FinNum-3_ConCall_dev.json')

In [20]:
#PRE PROCESS Part I
def convertdata_pd(data, tokenize_labels=False, tokenizer=TOKENIZER,return_tensors=RETURN_TENSORS):
    """
    Convert a FinNum JSON dump (list of dicts) into pandas objects.

    The record layout is detected from the first record rather than by
    comparing `data` with the module-level `data_fn` / `data_fn3` lists, so
    slices, copies and other files with the same schema are accepted too:

    * records with an "entities" list: one row per paragraph with the columns
      "paragraph" and "labels", where "labels" is a list of
      {"number", "label", "pos"} dicts (one per entity);
    * records with a top-level "target_num": one row per target number with
      the columns "paragraph", "target", "category", "offset_start",
      "offset_end" plus two empty model-prediction columns.

    Args:
        data: non-empty list of dicts in one of the two layouts above.
        tokenize_labels: when False the category strings are used as labels.
            Otherwise each category is split on spaces, passed through
            `tokenizer`, and `input_ids[0][1]` of the result becomes the
            label (so a category can only be one word).
        tokenizer: callable returning a mapping with an "input_ids" entry.
        return_tensors: only consulted when labels are tokenized; False asks
            for plain ids, any other value requests 'tf' tensors.

    Returns:
        (frame, x, y, labels_uniq): the full DataFrame, its first column, its
        second column, and the label vocabulary. With tokenize_labels=False
        the vocabulary is an array starting with the NULL label 0 followed by
        the categories in order of first appearance; otherwise it is an
        empty list.

    Raises:
        ValueError: if `data` is empty or its records match neither layout.
    """
    if not data:
        raise ValueError("convertdata_pd needs at least one record")

    first_record = data[0]
    data_rows = []
    # NULL label first: constructor2 encodes a label as its position in the
    # vocabulary and fills every other token with 0, so position 0 must not
    # belong to a real category.
    # todo: check before 0 is a token for padding, so probably we can't add as 0
    labels = [0]

    if "entities" in first_record:
        for record in data:
            entity_rows = []
            for entity in record["entities"]:
                category = entity["category"]
                if tokenize_labels == False:
                    label = category
                else:
                    # Warning: category can only be one word
                    words = category.split(" ")
                    if return_tensors == False:
                        category_token = tokenizer(words)
                    else:
                        category_token = tokenizer(words, return_tensors='tf')
                    label = category_token["input_ids"][0][1]
                entity_rows.append({
                    "number": entity["target_num"],
                    "label": label,
                    "pos": entity["offset_start"]})
                labels.append(label)
            data_rows.append({
                "paragraph": record["paragraph"],
                "labels": entity_rows
            })
    elif "target_num" in first_record:
        for record in data:
            data_rows.append({
                "paragraph": record["paragraph"],
                "target": record["target_num"],
                "category": record["category"],
                "offset_start": record["offset_start"],
                "offset_end": record["offset_end"],
                "model_prediction_category": '',
                "model_prediction_entity": ''
            })
    else:
        raise ValueError(
            "unrecognised record layout: expected an 'entities' or a 'target_num' key")

    # Build the frame once and slice it, instead of three separate constructions.
    frame = pd.DataFrame(data_rows)
    x = frame.iloc[:, 0]
    y = frame.iloc[:, 1]

    labels_uniq = []
    if tokenize_labels == False:
        labels_uniq = pd.Series(labels).unique()

    return frame, x, y, labels_uniq

In [21]:
##Example
# Build the frames from the merged dump, keeping the category strings as labels.
data_rows, x, y, cat = convertdata_pd(
    data_fn,
    tokenize_labels=False,
    return_tensors='tf',
)
# data_rows, x ,y,cat =convertdata_pd(data_fn,tokenize_labels=True,return_tensors='tf')

In [22]:
# PREPROCESS Part II Prep for token
def clean(par):
    """
    Lower-case a paragraph and strip every ASCII punctuation character.

    Args:
        par: paragraph text (str), e.g. data_rows.iloc[0,0]

    Returns:
        The lower-cased text with all characters of string.punctuation
        removed. Digits, whitespace and non-ASCII characters are kept.
    """
    # A former `par.replace("[^a-zA-Z]", " ")` step was dropped: str.replace
    # matches its first argument literally (it is not a regex), and that
    # literal contains upper-case letters, so it could never occur in text
    # that has already been lower-cased.
    # Punctuation is removed because it would otherwise become tokens too;
    # one translate() pass replaces one str.replace per punctuation character.
    punctuation_table = str.maketrans("", "", string.punctuation)
    return par.lower().translate(punctuation_table)
            
#todo implement for non english https://pypi.org/project/langdetect/
#todo keep % as it is important for identification

In [None]:
##Example
# Clean the paragraph text stored in the first row / first column.
parpar = clean(data_rows.iloc[0, 0])

In [79]:
# PREPROCESS Part III 
## Construct target vector
def constructor2(par,target,categories,tokenizer=TOKENIZER, return_tensors=RETURN_TENSORS):
    """
    Build the per-token target vectors for one paragraph.

    Args:
      par: original (uncleaned) paragraph; the "pos" offsets in `target` index into it
      target: format [{'number': '80', 'label': 'percentage', 'pos': 373}]
      categories: array with the full list of categories; a label is encoded
        as its position in this array
      tokenizer: callable returning a mapping with an "input_ids" entry
      return_tensors: 'tf' to get TensorFlow outputs, anything else for
        plain Python / NumPy outputs

    Returns:
      (par_token, y_vector, y_vector_numb)
        par_token: the "input_ids" of the cleaned paragraph, or the complete
          tokenizer output requested with return_tensors='tf' when
          return_tensors == 'tf'
        y_vector: vector with one entry per paragraph token, 0 everywhere
          except the category position at each target location (converted
          to a float32 tf tensor when return_tensors == 'tf')
        y_vector_numb: NumPy vector of the same length holding the target
          number at each target location
    """
    par_clean = clean(par)
    par_token = tokenizer(par_clean)["input_ids"]

    # One slot per token of the cleaned, tokenized paragraph.
    n_tokens = len(par_token)
    y_vector = np.zeros(n_tokens)
    y_vector_numb = np.zeros(n_tokens)

    for entity in target:
        # Clean and tokenize the text in front of the number: the length of
        # that prefix tells where the number lands in the full token list.
        prefix_clean = clean(par[0:entity["pos"]])
        # Only the length is needed, so plain ids are enough whatever
        # `return_tensors` says (a 'tf' encoding has the same token count).
        # -1 to remove token=102 which is the end of the vector
        size = len(tokenizer(prefix_clean)["input_ids"]) - 1

        # Category label = position of the category in the list.
        y_vector[size] = np.where(categories == entity["label"])[0][0]
        y_vector_numb[size] = entity["number"]

    if return_tensors == 'tf':
        # todo: check if we need to add max_length=512, truncation=True,
        # padding='max_length' here and how to handle it
        par_token = tokenizer(par_clean, return_tensors='tf')
        y_vector = tf.convert_to_tensor(y_vector, np.float32)

    return par_token, y_vector, y_vector_numb #todo

In [80]:
#Example
# Run the target-vector construction on the sample with index label 3.
x_3 = x[3]
y_3 = y[3]
par, y_v, y_token = constructor2(x_3, y_3, cat, return_tensors='tf')

In [None]:
# MODEL

#Parameters
optimizer = tf.keras.optimizers.Adam()
# The token-classification head returns raw logits, so the loss applies the
# softmax itself (from_logits=True).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
bert_model = "bert-base-cased"
# NOTE(review): must cover every index produced for the label vectors — confirm
# it matches the size of the category list returned by convertdata_pd.
num_labels = 10

##Model
# num_labels has to be passed at load time: the classification head is created
# from the config inside from_pretrained, so changing model.config.num_labels
# afterwards does not resize it. This replaces the earlier workaround that
# attached a Dense(10, softmax) layer as the activation of the already-built
# head, which routed every prediction through the head's original output width.
model = TFBertForTokenClassification.from_pretrained(bert_model, from_pt=True, num_labels=num_labels) #https://huggingface.co/docs/transformers/model_doc/bert#transformers.TFBertForTokenClassification.**kwargs

# Freeze the first layer of the model (presumably the BERT encoder — confirm
# with model.summary()) so that only the layers after it are trained.
model.layers[0].trainable = False

model.compile(optimizer=optimizer, loss = loss, metrics = [metrics])

In [None]:
# TRAIN MODEL: Approach 1 basic

# Rows 0..403 are used for fitting, the remaining rows are held out.
data_train = data_rows.iloc[0:404]
data_eval = data_rows.iloc[404:]
EPOCHS=1

# One fit() call per paragraph, repeated EPOCHS times.
train_paragraphs = data_train["paragraph"]
train_targets = data_train["labels"]
for j in range(EPOCHS):
  for i in range(len(data_train)):
    par, y_v, y_token = constructor2(train_paragraphs[i], train_targets[i], cat, return_tensors='tf')
    oneinput = par
    # Labels get a leading axis of size 1 here ...
    oneinput["labels"] = tf.reshape(y_v, (-1, tf.size(y_v)))
    # ... and the [0] below removes it again before fitting.
    # NOTE(review): confirm that fit() is meant to receive the tensors without
    # that leading axis.
    model.fit(oneinput["input_ids"][0], oneinput["labels"][0])

# the output of the model are probabilities of categories check here https://www.tensorflow.org/tutorials/keras/classification

In [None]:
# EVALUATE THE MODEL
# `oneinput` is not built in this cell: it is whatever an earlier cell assigned
# last (the training loop rebinds it on every iteration), so this call scores
# that single sample only.
# NOTE(review): `data_eval` is not used here — confirm whether the held-out rows
# were meant to be evaluated instead.
test_loss, test_acc = model.evaluate(oneinput["input_ids"][0], oneinput["labels"][0])

In [None]:
# OVERVIEW OF ONE DATA POINT
# Everything printed below refers to the sample with index label 3 built in this
# cell (previously the TOKENIZED / VECTOR CATEGORIES lines showed `par` and `y_v`
# left over from whichever cell ran before).
paragraph,y_vector,y_token=constructor2(data_rows["paragraph"][3],data_rows["labels"][3],cat, return_tensors='tf')
inn = paragraph
# Add a leading axis of size 1 to the label vector.
inn["labels"] = tf.reshape(y_vector, (-1, tf.size(y_vector)))
print("PARAGRAPH",len(data_rows["paragraph"][3]), data_rows["paragraph"][3])
print("TOKENIZED", len(paragraph),paragraph)
print("VECTOR CATEGORIES",len(y_vector),y_vector)
print("OUTPUT",model.predict(inn["input_ids"]))

**THIS IS SUPPORT INFORMATION**

In [None]:
# SUPPORT: other things tried out in the model

# model.summary()
# configg = BertConfig.from_pretrained(bert_model,num_labels =10)
# model.hidden_size = 1

# model.layers[-1].activation = tf.keras.activations.softmax ##ERROR
# model.layers[-1].activation = tf.keras.layers.Dense(10, activation=tf.keras.activations.softmax, trainable=True) #"relu" tf.keras.activations.softmax ### RETURNS 1 85 10  BEST

# model.layers[-1].activation = tf.keras.layers.Dense(32, activation=tf.keras.activations.softmax, trainable=True) #"relu" tf.keras.activations.softmax ### RETURNS 1 85 32
# model.layers[-1].activation = tf.keras.layers.Dense(32, activation='relu', trainable=True) #"relu" tf.keras.activations.softmax ### RETURNS 1 85 32
# a = tf.keras.layers.Dense(10, activation='silu', trainable=True) #"relu" tf.keras.activations.softmax ### RETURNS 1 85 32


# model.layers[-1].activation = tf.keras.layers.Maximum() #"relu" tf.keras.activations.softmax ## ERROR
# inputt = tf.keras.layers.Input(shape=(85,10))
# x2 = tf.keras.layers.Dense(8, activation='relu')(inputt)
# x1 = tf.keras.layers.Dense(8, activation='relu')(inputt)
# model.layers[-1] = tf.keras.layers.Add()(x2)
# model.layers[-1].activation = tf.keras.layers.Maximum()(x1)

# model.layers[-1].activation = nn.Linear(85, 10) ##error
# model.classifier = tf.keras.layers.Linear(85, trainable=True)

# model.layers[-1].trainable = True 
# model.layers[0].trainable = False 

# model.compile(optimizer=optimizer, loss = loss, metrics = [metrics])
# model.summary()
# model.config
# model.save('bert-base-cased')


### options on how to train the model
# 1 https://stackoverflow.com/questions/62797376/tensorflow-bert-for-token-classification-exclude-pad-tokens-from-accuracy-whil
# model.fit(inputs2)
# 2 https://huggingface.co/docs/transformers/master/en/main_classes/model#transformers.TFPreTrainedModel.train_step
# model.train_step(inputs2)
# 3
# outputs = model(inputs2,output_hidden_states=True, return_dict =True, training = True)

#https://towardsdatascience.com/how-to-use-bert-from-the-hugging-face-transformer-library-d373a22b0209

# # Clemente https://huggingface.co/docs/transformers/model_doc/bert
# model = TFBertForTokenClassification.from_pretrained('bert-base-cased',config= configg)
# model.compile(optimizer=optimizer, loss = loss, metrics = [metrics])
# # model = TFBertForTokenClassification.from_pretrained('bert-base-cased')
# outs = model(inputs2)
# outs.loss
# a = np.argmax(cc)

In [67]:
# Prepare data for model
# Collect the token ids and label vectors of the first two samples.
inputs_t = []
labels_t = []
for i in range(2):
  sample_text = data_rows["paragraph"][i]
  sample_targets = data_rows["labels"][i]
  par, y_v, y_token = constructor2(sample_text, sample_targets, cat, return_tensors='tf')
  oneinput = par
  # Give the labels a leading axis of size 1 before storing them with the inputs.
  oneinput["labels"] = tf.reshape(y_v, (-1, tf.size(y_v)))
  # Store the first entry along the leading axis of each.
  inputs_t.append(oneinput["input_ids"][0])
  labels_t.append(oneinput["labels"][0])

In [None]:
# model.fit(list, epochs=5)
# Fit on the sample left in `par` / `y_v` by the preceding cell, then predict on it.
inputs2 = par
#inputs2["labels"] = tf.reshape(tf.constant([1] * tf.size(y_v).numpy()), (-1, tf.size(y_v)))
inputs2["labels"] = tf.reshape(y_v, (-1, tf.size(y_v)))
model.fit(inputs2["input_ids"][0], inputs2["labels"][0], epochs=3)
outputs=model.predict(inputs2["input_ids"])
# Show the "logits" entry for the first sequence; the cell used to read an
# undefined name `out` here, which raised NameError.
outputs['logits'][0]

In [None]:
# https://github.com/huggingface/transformers/blob/master/examples/pytorch/token-classification/run_ner.py
def compute_metrics(pred):
  """
  Compute accuracy, F1, precision and recall for a prediction object.

  Args:
    pred: object exposing `label_ids` (reference labels) and `predictions`
      (scores; the arg-max over the last axis is taken as the predicted class).

  Returns:
    dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.

  NOTE(review): `precision_recall_fscore_support` and `accuracy_score` are not
  imported anywhere in the visible notebook — presumably they come from
  sklearn.metrics; confirm and add the import to the imports cell.
  """
  y_true = pred.label_ids
  y_pred = pred.predictions.argmax(-1)
  precision, recall, f1, _support = precision_recall_fscore_support(y_true, y_pred, average='binary')
  accuracy = accuracy_score(y_true, y_pred)
  return {
      'accuracy': accuracy,
      'f1': f1,
      'precision': precision,
      'recall': recall,
  }

In [None]:
# https://github.com/huggingface/transformers/issues/8292
## this seems now to be outdated
training_args = TFTrainingArguments(
    output_dir='./bert_test',          # where outputs are written
    num_train_epochs=5,                # number of passes over the training data
    per_device_train_batch_size=32,    # training batch size on each device
    per_device_eval_batch_size=32,     # evaluation batch size on each device
    warmup_steps=500,                  # warm-up steps of the learning-rate scheduler
    weight_decay=0.01,                 # weight-decay strength
    logging_dir='./logs',              # where logs are stored
    learning_rate=3e-5,
)


In [None]:
# tips for model
# use AdamW optimizer
# you can get the loss via
loss = outputs.loss
logits = outputs.logits

# Sketch of a manual training loop.
# NOTE(review): `tqdm`, `loader`, `optim` and `inputs` are not defined or imported
# anywhere in the visible notebook, and `batch` is never passed to the model —
# presumably `inputs` should be derived from `batch`; confirm before running.
# NOTE(review): zero_grad() / backward() / step() follow the PyTorch API, while
# `model` is created as a TensorFlow model earlier in the notebook — confirm
# which framework this loop is meant for.
for epoch in range(2):
    loop = tqdm(loader, leave = True)
    for batch in loop:
        optim.zero_grad()
        outputs = model(inputs)
        # Read the loss from the forward pass just computed (the loop used to
        # reference an undefined name `output`).
        loss = outputs.loss
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())