#### Set-up for Colab
Install dependencies, mount drive, etc.

In [None]:
#pip install -q -U tensorflow-text

In [None]:
#pip install -q tf-models-official==2.4.0

In [None]:
pip uninstall tensorflow

In [None]:
pip install tensorflow==2.5.0

In [None]:
pip install transformers

In [None]:
pip install bert-for-tf2

In [None]:
# Mount Google Drive inside the Colab VM so the project files can be read.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Project root. Later cells build the BERT model directory, the data file paths
# and the results file path from it.
# This is a relative path, so it is resolved against the kernel's current
# working directory; with Drive mounted at /content/drive it only resolves
# when the working directory is /content.
path = 'drive/MyDrive/MIDS/chemical_patent_cer_ee'
# Alternative project root, kept for switching by hand:
# path = '..'

#### Import Libraries

In [None]:
import os
import io
import re
import sys

# Add the project's notebooks folder to the import path -- presumably where the
# sre_inputs, train_test and sre_models modules imported below live (confirm).
# The membership check keeps a re-run of this cell from appending the same
# entry to sys.path again.
notebooks_dir = '/content/drive/MyDrive/MIDS/chemical_patent_cer_ee/notebooks'
if notebooks_dir not in sys.path:
    sys.path.append(notebooks_dir)

import numpy as np
import pandas as pd
import argparse
from time import time
import matplotlib.pyplot as plt

import pickle
from csv import reader

import tensorflow as tf
from transformers import BertTokenizer
import bert

from sre_inputs import *
from train_test import *
from sre_models import *

#### BERT Model
- Load BERT model and tokenizer
- Set max length for inputs

In [None]:
# Import the bert-for-tf2 package under an alias. The name `bert` is rebound to
# the model layer further down (later cells pass `bert` to the model builders),
# so calling the package through `bert.` would fail the second time this cell
# is run. Going through the alias keeps the cell re-runnable.
import bert as bert_for_tf2

# path for bert model
bert_model_dir = f'{path}/bert/bert_mini'
# last path component of the model directory; used in the saved results file name
bert_type = bert_model_dir.split('/')[-1]

# set tokenizer
vocab_file = os.path.join(bert_model_dir, "vocab.txt")
tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=False)

# set BERT model
bert_params = bert_for_tf2.params_from_pretrained_ckpt(bert_model_dir)
bert = bert_for_tf2.BertModelLayer.from_params(bert_params, name="bert")
# NOTE(review): this cell only constructs the layer from the checkpoint's
# parameters; no call that loads the pretrained weights is visible here.
# Confirm the weights are loaded elsewhere (e.g. inside sre_models).

# set max length for inputs
max_length = 512

# set parameters for model type
marker_type = 'ner' # 'em', 'ner', or 'std'
head_type = 'ner' # 'cls', 'start', 'pool', or 'ner'
subsampled = False

#### Data
- Upload preprocessed chemical patent file(s)
- Use `sre_inputs` module to generate inputs for model
- Sample only: split into train/test using `train_test` module
- Labels need to be one-hot encoded before they are used in the model

*NB: Make sure that the preprocessed data being uploaded and the parameters chosen for generating inputs **both** match the type of model they will be used for!*

In [None]:
#### TRAIN/DEV DATASET PROCESSING ####

# paths for preprocessed data
# 'em' and 'std' both read the sre_em files; 'ner' reads the sre_ner files.
if marker_type in ('em', 'std'):
    data_stem = f'{path}/data/sre_em/sre_em'
elif marker_type == 'ner':
    data_stem = f'{path}/data/sre_ner/sre_ner'
else:
    # Fail loudly: otherwise a mistyped marker_type would either raise a
    # NameError further down or silently reuse paths left over from an
    # earlier run of this cell.
    raise ValueError(
        f"Unsupported marker_type {marker_type!r}; expected 'em', 'ner', or 'std'"
    )

# subsampled runs read the '*_subsampled.csv' variants of the same files
file_suffix = '_subsampled' if subsampled else ''
train_path = f'{data_stem}_train{file_suffix}.csv'
dev_path = f'{data_stem}_dev{file_suffix}.csv'

# generate inputs for model
if marker_type == 'std':
    train_lists = generate_standard_inputs(train_path, tokenizer, max_length)
    dev_lists = generate_standard_inputs(dev_path, tokenizer, max_length)
else:
    # 'em' and 'ner' both go through the entity-input generator
    train_lists = generate_entity_inputs(train_path, tokenizer, marker_type, head_type, max_length)
    dev_lists = generate_entity_inputs(dev_path, tokenizer, marker_type, head_type, max_length)

# report only once the files have actually been processed
print(f'Loaded {train_path}')
print(f'Loaded {dev_path}')

# generate inputs and labels
# element 0 holds the model inputs (at most the first five are kept),
# element 1 holds the labels
# one hot encode labels
num_classes = 3

model_inputs_train = list(train_lists[0][:5])
train_labels = train_lists[1]
model_labels_train = tf.one_hot(train_labels, depth=num_classes)

model_inputs_dev = list(dev_lists[0][:5])
dev_labels = dev_lists[1]
model_labels_dev = tf.one_hot(dev_labels, depth=num_classes)

Loaded drive/MyDrive/MIDS/chemical_patent_cer_ee/data/sre_ner/sre_ner_train.csv
Loaded drive/MyDrive/MIDS/chemical_patent_cer_ee/data/sre_ner/sre_ner_dev.csv


In [None]:
#### TEST DATASET PROCESSING ####

# path for preprocessed data
# 'em' and 'std' both read the sre_em test file; 'ner' reads the sre_ner one.
if marker_type in ('em', 'std'):
    test_path = f'{path}/data/sre_em/sre_em_test.csv'
elif marker_type == 'ner':
    test_path = f'{path}/data/sre_ner/sre_ner_test.csv'
else:
    # Fail loudly: otherwise a mistyped marker_type would either raise a
    # NameError below or silently reuse a test_path left over from an
    # earlier run of this cell.
    raise ValueError(
        f"Unsupported marker_type {marker_type!r}; expected 'em', 'ner', or 'std'"
    )

# generate inputs for model
if marker_type == 'std':
    test_lists = generate_standard_inputs(test_path, tokenizer, max_length)
else:
    # 'em' and 'ner' both go through the entity-input generator
    test_lists = generate_entity_inputs(test_path, tokenizer, marker_type, head_type, max_length)

# generate inputs and labels
# element 0 holds the model inputs (at most the first five are kept),
# element 1 holds the labels
# one hot encode labels
model_inputs_test = list(test_lists[0][:5])
test_labels = test_lists[1]
model_labels_test = tf.one_hot(test_labels, depth=3)

#### Run Model(s)

In [None]:
#### TRAIN/DEV RUN ####

tf.keras.backend.clear_session()
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

epochs = 10
batch_size = 32
train_layers = 0

# build the model that matches the selected head
if head_type == 'cls':
    model = sre_cls_model(bert, max_length, train_layers)
elif head_type == 'start':
    model = sre_start_model(bert, max_length, train_layers)
elif head_type in ('pool', 'ner'):
    # NOTE(review): 'ner' is routed to the same builder as 'pool' -- confirm
    # this is intentional and not a missing NER-specific model.
    model = sre_pool_model(bert, max_length, train_layers)
else:
    # Fail loudly: otherwise `model` would be undefined, or worse, a model
    # left over from an earlier run would be trained again without notice.
    raise ValueError(
        f"Unsupported head_type {head_type!r}; expected 'cls', 'start', 'pool', or 'ner'"
    )

# 'std' markers and the 'cls' head are fed only the first three inputs;
# every other combination is fed the full input list.
first_three_only = marker_type == 'std' or head_type == 'cls'
fit_inputs_train = model_inputs_train[:3] if first_three_only else model_inputs_train
fit_inputs_dev = model_inputs_dev[:3] if first_three_only else model_inputs_dev

# the labels are keyed by "sre", the name of the model's output layer
history = model.fit(
    fit_inputs_train,
    {"sre": model_labels_train},
    validation_data=(fit_inputs_dev, {"sre": model_labels_dev}),
    epochs=epochs,
    batch_size=batch_size
)


=== SRE Max Pool Model ===
BERT layer output: KerasTensor(type_spec=TensorSpec(shape=(None, 512, 256), dtype=tf.float32, name=None), name='bert/encoder/layer_3/output/LayerNorm/add_1:0', description="created by layer 'bert'")
Prediction: KerasTensor(type_spec=TensorSpec(shape=(None, 3), dtype=tf.float32, name=None), name='sre/Softmax:0', description="created by layer 'sre'")

Model: "sre_pool"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
e1_mask 

In [None]:
# evaluate model on test data
print('Evaluate on test data')

# 'std' markers and the 'cls' head are fed only the first three inputs;
# every other combination is fed the full input list.
first_three_only = marker_type == 'std' or head_type == 'cls'
eval_inputs = model_inputs_test[:3] if first_three_only else model_inputs_test

# `results` is whatever model.evaluate returns for this model; which metrics
# it contains depends on how the model was compiled (not visible in this notebook).
results = model.evaluate(eval_inputs, model_labels_test, batch_size=batch_size)

# generate predictions on new data (probabilities -- the output of the last layer)
print("Generate predictions for new samples")
predictions = model.predict(eval_inputs)

print("predictions shape:", predictions.shape)

# save stuff
# the file name encodes the BERT variant, marker type and head type,
# with a '_sub' suffix for subsampled runs
model_name = f'SRE_{bert_type}_{marker_type}_{head_type}'
if subsampled:
    model_name += '_sub'

# save results and predictions
# Create the results folder first so a missing directory cannot make the
# write fail after training and evaluation have already been paid for.
results_dir = f'{path}/results'
os.makedirs(results_dir, exist_ok=True)

outputs = [results, predictions]
with open(f'{results_dir}/{model_name}.pickle', "wb") as f:
    pickle.dump(outputs, f)

# save model
# model.save(f'{path}/models/{model_name}')

Evaluate on test data
Generate predictions for new samples
predictions shape: (18515, 3)


In [None]:
# # how to open saved file
# with open(f'{path}/results/{model_name}.pickle', "rb") as f:
#     saved_outputs = pickle.load(f)

In [None]:
# OPTIONAL: visualize model
#tf.keras.utils.plot_model(model, show_shapes=True, dpi=48)