#### Import Libraries

In [1]:
import os
import io
import re
import sys
import sys

import numpy as np
import pandas as pd
import argparse
from time import time
import matplotlib.pyplot as plt

import pickle
from csv import reader

import tensorflow as tf
from transformers import BertTokenizer
import bert

from sre_inputs import *
from train_test import *
from sre_models import *

#### BERT Model
- Load BERT model and tokenizer
- Set max length for inputs

In [2]:
# Root of the project tree; every model/data path in this notebook is built from it.
path = '..'

# path for bert model
bert_model_dir = f'{path}/bert/bert_mini'
# last path component of the model directory (here 'bert_mini')
bert_type = bert_model_dir.split('/')[-1]

# set tokenizer
vocab_file = os.path.join(bert_model_dir, "vocab.txt")
# NOTE(review): do_lower_case=False keeps the input casing as-is -- confirm this
# matches how the checkpoint's vocab.txt was built.
tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=False)

# set BERT model
# The name `bert` is rebound below from the imported module to the layer
# instance. Re-importing the module under an alias keeps this cell re-runnable:
# without it, a second run would look up `params_from_pretrained_ckpt` on the
# layer object and fail with AttributeError.
import bert as bert_lib
bert_params = bert_lib.params_from_pretrained_ckpt(bert_model_dir)
bert = bert_lib.BertModelLayer.from_params(bert_params, name="bert")

# set max length for inputs
max_length = 512

# set parameters for model type
marker_type = 'std' # 'em', 'ner', or 'std'
head_type = 'cls' # 'cls', 'start', 'pool', or 'ner'
subsampled = False

#### Data
- Load preprocessed chemical patent file(s)
- Use `sre_inputs` module to generate inputs for model
- Sample only: split into train/test using `train_test` module
- Labels must be one-hot encoded before they are passed to the model

*NB: Make sure that the preprocessed data being loaded and the parameters chosen for generating inputs **both** match the type of model they will be used for!*

In [3]:
#### SAMPLE PROCESSING ####
# Builds model inputs from the sample file, splits them into train/test, and
# one-hot encodes the labels.

# Fail fast on an unsupported marker type. Without this check the branches
# below would leave `sample_path` / `all_lists` unbound and the cell would die
# later with an unrelated-looking NameError.
if marker_type not in ('em', 'ner', 'std'):
    raise ValueError(f"marker_type must be 'em', 'ner', or 'std'; got {marker_type!r}")

# path for preprocessed data ('std' reads the same file as 'em')
if marker_type == 'ner':
    sample_path = f'{path}/data/sre_ner/sre_ner_sample.csv'
else:
    sample_path = f'{path}/data/sre_em/sre_em_sample.csv'

# generate inputs for model
if marker_type == 'std':
    all_lists = generate_standard_inputs(sample_path, tokenizer, max_length)
else:
    all_lists = generate_entity_inputs(sample_path, tokenizer, marker_type, head_type, max_length)

# SAMPLE ONLY: split into train/test
train_all, test_all = train_test_split(all_lists)

# generate inputs and labels
# Element 0 of each split supplies the inputs (only its first five entries are
# kept); element 1 supplies the labels, which are one-hot encoded.
# depth=3 gives one column per label class -- NOTE(review): assumes the sample
# uses the same three labels (0, 1, 2) as the full datasets; confirm.
sample_inputs_train = list(train_all[0][:5])
sample_labels_train = tf.one_hot(train_all[1], depth=3)

sample_inputs_test = list(test_all[0][:5])
sample_labels_test = tf.one_hot(test_all[1], depth=3)

In [4]:
#### TRAIN/DEV DATASET PROCESSING ####
# Builds model inputs and one-hot labels for the train and dev splits.

# Fail fast on an unsupported marker type. Without this check none of the
# branches would match, `train_path` would stay unbound, and the cell would die
# with a NameError.
if marker_type not in ('em', 'ner', 'std'):
    raise ValueError(f"marker_type must be 'em', 'ner', or 'std'; got {marker_type!r}")

# paths for preprocessed data
# 'std' reads the same files as 'em'; subsampled runs use the '_subsampled' files.
data_subdir = 'sre_ner' if marker_type == 'ner' else 'sre_em'
file_suffix = '_subsampled' if subsampled else ''
train_path = f'{path}/data/{data_subdir}/{data_subdir}_train{file_suffix}.csv'
dev_path = f'{path}/data/{data_subdir}/{data_subdir}_dev{file_suffix}.csv'

# generate inputs for model
if marker_type == 'std':
    train_lists = generate_standard_inputs(train_path, tokenizer, max_length)
    dev_lists = generate_standard_inputs(dev_path, tokenizer, max_length)
else:
    train_lists = generate_entity_inputs(train_path, tokenizer, marker_type, head_type, max_length)
    dev_lists = generate_entity_inputs(dev_path, tokenizer, marker_type, head_type, max_length)

# Report the files only once they have actually been processed, so the message
# is never printed for a file that failed to load.
print(f'Loaded {train_path}')
print(f'Loaded {dev_path}')

# generate inputs and labels
# Element 0 of each result supplies the inputs (only its first five entries are
# kept); element 1 supplies the integer labels. The raw labels stay available as
# `train_labels` / `dev_labels` for the EDA cells further down.
# depth=3 gives one column per label class (labels in this data are 0, 1, 2).
model_inputs_train = list(train_lists[0][:5])
train_labels = train_lists[1]
model_labels_train = tf.one_hot(train_labels, depth=3)

model_inputs_dev = list(dev_lists[0][:5])
dev_labels = dev_lists[1]
model_labels_dev = tf.one_hot(dev_labels, depth=3)

Loaded ../data/sre_em/sre_em_train.csv
Loaded ../data/sre_em/sre_em_dev.csv


In [5]:
#### TEST DATASET PROCESSING ####
# Builds model inputs and one-hot labels for the test split.

# Fail fast on an unsupported marker type. Without this check the branches
# below would leave `test_path` / `test_lists` unbound and the cell would die
# later with an unrelated-looking NameError.
if marker_type not in ('em', 'ner', 'std'):
    raise ValueError(f"marker_type must be 'em', 'ner', or 'std'; got {marker_type!r}")

# path for preprocessed data ('std' reads the same file as 'em')
if marker_type == 'ner':
    test_path = f'{path}/data/sre_ner/sre_ner_test.csv'
else:
    test_path = f'{path}/data/sre_em/sre_em_test.csv'

# generate inputs for model
if marker_type == 'std':
    test_lists = generate_standard_inputs(test_path, tokenizer, max_length)
else:
    test_lists = generate_entity_inputs(test_path, tokenizer, marker_type, head_type, max_length)

# generate inputs and labels
# Element 0 supplies the inputs (only its first five entries are kept);
# element 1 supplies the integer labels, kept as `test_labels` for the EDA cells.
# depth=3 gives one column per label class (labels in this data are 0, 1, 2).
model_inputs_test = list(test_lists[0][:5])
test_labels = test_lists[1]
model_labels_test = tf.one_hot(test_labels, depth=3)

#### Train/Dev/Test Data EDA

In [6]:
# Snippet count per split, taken as the length of each split's first input list.
for split_name, split_inputs in (
    ('Train', model_inputs_train),
    ('Dev', model_inputs_dev),
    ('Test', model_inputs_test),
):
    print(f'{split_name}: {len(split_inputs[0])}')

Train: 45805
Dev: 10673
Test: 18488


In [7]:
# Train split: distinct label values and how often each occurs.
uniqueValues, occurCount = np.unique(train_labels, return_counts=True)
for label_caption, label_stats in (
    ("Unique Values : ", uniqueValues),
    ("Occurrence Count : ", occurCount),
):
    print(label_caption, label_stats)

Unique Values :  [0 1 2]
Occurrence Count :  [31516  9692  4597]


In [8]:
# Dev split: distinct label values and how often each occurs.
uniqueValues, occurCount = np.unique(dev_labels, return_counts=True)
for label_caption, label_stats in (
    ("Unique Values : ", uniqueValues),
    ("Occurrence Count : ", occurCount),
):
    print(label_caption, label_stats)

Unique Values :  [0 1 2]
Occurrence Count :  [7341 2247 1085]


In [9]:
# Test split: distinct label values and how often each occurs.
uniqueValues, occurCount = np.unique(test_labels, return_counts=True)
for label_caption, label_stats in (
    ("Unique Values : ", uniqueValues),
    ("Occurrence Count : ", occurCount),
):
    print(label_caption, label_stats)

Unique Values :  [0 1 2]
Occurrence Count :  [12712  3896  1880]


In [10]:
# Train split: summary statistics of the values in train_lists[2][1].
# NOTE(review): presumably per-snippet lengths produced by sre_inputs -- confirm.
train_snippet_lengths = train_lists[2][1]
print(f'Max: {max(train_snippet_lengths)}')
print(f'Min: {min(train_snippet_lengths)}')
print(f'Mean: {np.mean(train_snippet_lengths):.2f}')
print(f'Median: {np.median(train_snippet_lengths)}')

Max: 287
Min: 6
Mean: 58.02
Median: 51.0


In [11]:
# Dev split: summary statistics of the values in dev_lists[2][1].
# NOTE(review): presumably per-snippet lengths produced by sre_inputs -- confirm.
dev_snippet_lengths = dev_lists[2][1]
print(f'Max: {max(dev_snippet_lengths)}')
print(f'Min: {min(dev_snippet_lengths)}')
print(f'Mean: {np.mean(dev_snippet_lengths):.2f}')
print(f'Median: {np.median(dev_snippet_lengths)}')

Max: 287
Min: 6
Mean: 57.47
Median: 50.0


In [12]:
# Test split: summary statistics of the values in test_lists[2][1].
# NOTE(review): presumably per-snippet lengths produced by sre_inputs -- confirm.
test_snippet_lengths = test_lists[2][1]
print(f'Max: {max(test_snippet_lengths)}')
print(f'Min: {min(test_snippet_lengths)}')
print(f'Mean: {np.mean(test_snippet_lengths):.2f}')
print(f'Median: {np.median(test_snippet_lengths)}')

Max: 245
Min: 6
Mean: 57.87
Median: 51.0


In [13]:
# Display train_lists[2][2]; it was an empty list in the recorded run.
# NOTE(review): what this field holds is not visible in this notebook --
# confirm its meaning against the sre_inputs module.
train_lists[2][2]

[]