#### Set-up for Colab
Install dependencies, mount drive, etc.

In [1]:
pip install -q -U tensorflow-text

In [2]:
pip install -q tf-models-official==2.4.0

In [None]:
pip install transformers

In [None]:
pip install bert-for-tf2

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
path = 'drive/MyDrive/MIDS/chemical_patent_cer_ee'

#### Import Libraries

In [2]:
import os
import io
import re
import sys
import sys
sys.path.append('/content/drive/MyDrive/MIDS/chemical_patent_cer_ee/notebooks')

import numpy as np
import pandas as pd
import argparse
from time import time
import matplotlib.pyplot as plt

import pickle
from csv import reader

import tensorflow as tf
from transformers import BertTokenizer
import bert

from sre_inputs import *
from train_test import *
from sre_models import *

#### BERT Model
- Load BERT model and tokenizer
- Set max length for inputs

In [3]:
# path for bert model
bert_model_dir = f'{path}/bert/scibert_scivocab_cased'

# set tokenizer
vocab_file = os.path.join(bert_model_dir, "vocab.txt")
tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=False)

# set BERT model
bert_params = bert.params_from_pretrained_ckpt(bert_model_dir)
bert = bert.BertModelLayer.from_params(bert_params, name="bert")

# set max length for inputs
max_length = 500

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


#### Data
- Upload preprocessed chemical patent file(s)
- Use `sre_inputs` module to generate inputs for model
- Sample only: split into train/test using `train_test` module
- Need to one hot encode labels before using in model

*NB: Make sure that preprocessed data being uploaded and parameters chosen for generating inputs **both** match the type of model it will be used for!*

In [4]:
#### TRAIN/DEV DATASET PROCESSING ####

# paths for preprocessed data
train_path = f'{path}/data/sre_em/sre_em_train.csv'
dev_path = f'{path}/data/sre_em/sre_em_dev.csv'

# indicate parameters for generating inputs
marker_type = 'em'
head_type = 'start'

# generate inputs for model
train_lists = generate_entity_inputs(train_path, tokenizer, marker_type, head_type, max_length)
dev_lists = generate_entity_inputs(dev_path, tokenizer, marker_type, head_type, max_length)

# generate inputs and labels
# one hot encode labels
model_inputs_train = [x for x in train_lists[0][:5]]
train_labels = train_lists[1]
model_labels_train = tf.one_hot(train_labels, depth=2)

model_inputs_dev = [x for x in dev_lists[0][:5]]
dev_labels = dev_lists[1]
model_labels_dev = tf.one_hot(dev_labels, depth=2)

In [None]:
#### TEST DATASET PROCESSING ####

# path for preprocessed data
test_path = f'{path}/data/sre_em/sre_em_test.csv'

# indicate parameters for generating inputs
marker_type = 'em'
head_type = 'start'

# generate inputs for model
test_lists = generate_entity_inputs(test_path, tokenizer, marker_type, head_type, max_length)

# generate inputs and labels
# one hot encode labels
model_inputs_test = [x for x in test_lists[0][:5]]
test_labels = test_lists[1]
model_labels_test = tf.one_hot(test_labels, depth=2)

In [7]:
#### SAMPLE PROCESSING ####

# path for preprocessed data
full_path = f'{path}/data/sre_em/sre_em_sample.csv'

# indicate which model the data will be used for
marker_type = 'em'
head_type = 'start'

# generate inputs for model
all_lists = generate_entity_inputs(full_path, tokenizer, marker_type, head_type, max_length=500)

# SAMPLE ONLY: split into train/test
train_all, test_all = train_test_split(all_lists)

# generate inputs and labels
# one hot encode labels
model_inputs_train = [x for x in train_all[0][:5]]
model_labels_train = train_all[1]
model_labels_train = tf.one_hot(model_labels_train, depth=2)

model_inputs_test = [x for x in test_all[0][:5]]
model_labels_test = test_all[1]
model_labels_test = tf.one_hot(model_labels_test, depth=2)

#### Run Model(s)

In [15]:
#### SAMPLE RUN ####

tf.keras.backend.clear_session()
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

model = sre_start_model(bert_model, max_length)

model.fit(
    model_inputs_train, 
    {"sre": model_labels_train},
    validation_data=(model_inputs_test, {"sre": model_labels_test}),
    epochs=5,
    batch_size=16
)


=== SRE Start Entity Model ===
BERT layer output: KerasTensor(type_spec=TensorSpec(shape=(None, 500, 768), dtype=tf.float32, name=None), name='tf_bert_model/bert/encoder/layer_._11/output/LayerNorm/batchnorm/add_1:0', description="created by layer 'tf_bert_model'")
Prediction: KerasTensor(type_spec=TensorSpec(shape=(None, 2), dtype=tf.float32, name=None), name='sre/Softmax:0', description="created by layer 'sre'")

Model: "sre_pool"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 500)]        0           []                               
                                                                                                  
 input_masks (InputLayer)       [(None, 500)]        0           []                               
                                                                   

<keras.callbacks.History at 0x7f62fbd7c350>

In [None]:
#### TRAIN/DEV RUN ####

tf.keras.backend.clear_session()
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

model = sre_start_model(bert_model, max_length)

model.fit(
    model_inputs_train, 
    {"sre": model_labels_train},
    validation_data=(model_inputs_dev, {"sre": model_labels_dev}),
    epochs=5,
    batch_size=16
)

In [1]:
# OPTIONAL: visualize model
#tf.keras.utils.plot_model(model, show_shapes=True, dpi=48)

In [2]:
# save model
model_name = 'sample'
model.save(f'{path}/models/{model_name}')