## Experiments with Prometheus BERT models

In [None]:
################################################################################
# Strings and constants
################################################################################

# Machine specific local variables.
# Define these as a class, like the below.
# Select which local variables to use by assigning
# a class to LOCALS.

class ColabLocals:
    """Machine specific local variables to use when running on colab."""

    # Device name handed to torch.device() when a model is moved
    TORCH_TARGET_DEVICE = 'cpu'

class CuspVmLocals:
    """Machine specific local variables to use when running on cusp_vm."""

    # Device name handed to torch.device() when a model is moved
    TORCH_TARGET_DEVICE = 'cpu'
    
class IcatLocals:
    """Machine specific local variables to use when running on icat."""

    # Device name handed to torch.device() when a model is moved
    TORCH_TARGET_DEVICE = 'cpu'

# Dataset descriptions
# Define these as a class, like the below
# Select which dataset to use by assigning
# a class to DATA

class ImdbDescription:
    """Description of the Imdb movie reviews dataset.

    A standard sentiment classification task containing around 5000 rows.
    Note that on colab you might need to limit the data slice to 1000 rows
    or fewer.
    """

    # Location of the csv file, and the columns holding text and label
    PATH = './sentence_classifier/sentence_classifier/imdb_5k_reviews.csv'
    TEXT_COL = 'review'
    LABEL_COL = 'sentiment'

    # A few sentences to run through a trained model
    TEST_TEXTS = [
        'hello my name is link i am in love with princess zelda',
        'this is just a test sentence',
        'project',
    ]

    # Row range of the dataset to use. These are used as slice bounds,
    # so None leaves that end of the range open.
    SLICE_FROM = None
    SLICE_TO = 100

    # Character range to keep from each entry of the text column. These are
    # used as slice bounds, so None leaves that end of the range open.
    TEXT_FROM = None
    TEXT_TO = 1000

# Model related parameters
# BERT_BASE is the model name passed as BERT_tokenizer to the
# Prometheus training and inference functions.
# Alternative base model: 'bert-base-uncased'
BERT_BASE = 'monologg/biobert_v1.0_pubmed_pmc'
EPOCHS = 3
# Proportion to hold out in test
TEST_HOLDOUT = 0.2
# Number of folds in a kfold cross validation
FOLDS = 5

# Classifier results and scoring
# Names of the prediction and probability columns in the classifier output
PREDICTION_LABEL = 'preds'
PROBABILITY_LABEL = 'probs'
# A prediction is kept only if its probability is strictly above this value
THRESHOLD = 0.0


def thresholder(x):
    """Return the label of ``x`` if its probability exceeds THRESHOLD, else 0.

    Args:
        x: a pair of a probability and a label value, e.g. (0.9864, 1).
            Any iterable of exactly two items is accepted, so this works
            both for plain tuples and for a two-column DataFrame row as
            produced by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The label value when the probability is strictly greater than
        THRESHOLD, otherwise 0.
    """
    # Unpack by iteration rather than x[0] / x[1]: integer keys on a
    # Series with a string index are not positional in recent pandas.
    probability, label = x
    return label if probability > THRESHOLD else 0

In [None]:
################################################################################
# Imports
################################################################################

# these try blocks are for imports that are sometimes problematic
# because they are not installed on some of the machines being used

# transformers is needed by BERT.py
try:
  import transformers
except ImportError as e:
  !pip install transformers

# nltk is used by NLP_Utils.py, which is imported by BERT.py
try:
  import nltk
except ImportError as e:
  !pip install nltk

# spacy is used by NLP_Utils.py, which is imported by BERT.py
try:
  import spacy
except ImportError as e:
  !pip install spacy

# Prometheus classes
from sentence_classifier.sentence_classifier.BERT import train_BERT
from sentence_classifier.sentence_classifier.BERT import load_and_run_BERT
from sentence_classifier.sentence_classifier.BERT import BERT_KFOLD
from sentence_classifier.sentence_classifier.NLP_utils import convert_to_cat

# Other imports
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support as scorer
import torch

In [None]:
################################################################################
# Load test data and run Prometheus sentence classifier
################################################################################

# Select which locals and dataset to use
LOCALS = IcatLocals
DATA = ImdbDescription

# Read in some data for training
# NOTE(review): header=1 makes pandas take the column names from the second
# line of the file and discard the first line - confirm this matches the csv.
df = pd.read_csv(DATA.PATH, header=1)

# Keep only the configured range of rows. The explicit copy means the
# column assignment below works on an independent frame rather than on a
# slice of the frame that was read in (avoids SettingWithCopyWarning).
df = df[DATA.SLICE_FROM:DATA.SLICE_TO].copy()

# Normalise the text: strip surrounding whitespace, lowercase, then
# keep only the configured range of characters
df[DATA.TEXT_COL] = df[DATA.TEXT_COL].apply(
    lambda x: x.strip().lower()[DATA.TEXT_FROM:DATA.TEXT_TO]
    )
df.tail()

In [None]:
# Train a model on the text and label columns of the loaded data,
# then show the 'stats' entry of what train_BERT returned
bert_model = train_BERT(
    sentences=df[DATA.TEXT_COL],
    labels=df[DATA.LABEL_COL],
    BERT_tokenizer=BERT_BASE,
    n_epochs=EPOCHS,
    test_size=TEST_HOLDOUT,
    output_dir=None,
)
bert_model['stats']

In [None]:
# Run the trained model on the dataset's test sentences.
# The model first has to be moved to the device selected in LOCALS.
model = bert_model['model']
target_device = torch.device(LOCALS.TORCH_TARGET_DEVICE)
model.to(target_device)

load_and_run_BERT(
    sentences=DATA.TEST_TEXTS,
    trained_bert_model=model,
    BERT_tokenizer=BERT_BASE,
)

In [None]:
#########################################################################################
# Training and testing CRIS violence models - kfold trained, with best fold saved
#########################################################################################

# Training annotations: workbook, sheet, text column and label columns
annots_train_file = '/home/icat/angus/annotations/violence_adjudicated_final-amended.xlsx'
train_sheet = 'adjudication'
text_col = 'clean_text'
labels = [
    'physical',
    'sexual',
    'domestic',
    'status_binary',
    'perpetrator',
    'victim',
]

# Directory under which one model directory per label is kept
model_top_dir = '/home/icat/angus/models/from-am-brc-drive/'

# Test annotations: workbook and sheet
annots_test_file = '/home/icat/angus/annotations/violence_test_set_riley-possible-1411.xlsx'
test_sheet = 'adjudicated-only'

# Test the first rows_to_test rows, set to None to test all
rows_to_test = None
# File name used when writing out test results
results_file = 'test-results-no-threshold-biobert.csv'

# List of tuples giving the sklearn method for f1/p/r averaging and, where
# appropriate, the label to evaluate (None if a label does not need to be
# specified, or for all labels)
eval_methods = [
    ('binary', 0),
    ('binary', 1),
    ('micro', None),
    ('macro', None),
    ('weighted', None),
]
# Columns for evaluation output
eval_headers = ('class', 'method', 'label', 'p', 'r', 'f1')

In [None]:
# Load the training annotations, keeping only the text column
# and the label columns
annots = pd.read_excel(
    annots_train_file,
    sheet_name=train_sheet,
    header=0,
    usecols=[text_col, *labels],
)

print("Read in", len(annots), "annotations")

In [None]:
# Run kfold cross validation over the labels and
# save the best model

# Setting CUDA_LAUNCH_BLOCKING = 1 gives more debug info if the GPU crashes
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Train a model for each label, with output going to a directory
# named after the label under model_top_dir
for label in labels:
    kf_best = BERT_KFOLD(
        sentences=annots[text_col],
        labels=annots[label],
        BERT_tokenizer=BERT_BASE,
        n_splits=FOLDS,
        n_epochs=EPOCHS,
        random_state=666,
        output_dir=f'{model_top_dir}{label}',
    )

In [None]:
# Load the test annotations, keeping only the text column and the
# label columns, and reading at most rows_to_test rows
test = pd.read_excel(
    annots_test_file,
    sheet_name=test_sheet,
    header=0,
    usecols=[text_col, *labels],
    nrows=rows_to_test,
)

print("Read in", len(test), "annotations")

In [None]:
# Load each model and test against test file

# Evaluation rows, one per (label, method, class) combination. They are
# collected in a plain list and turned into a dataframe once at the end,
# because DataFrame.append was removed in pandas 2.0.
score_rows = []

# iterate over the labels
for label in labels:
    response = load_and_run_BERT(sentences=test[text_col],
                                 trained_bert_model=model_top_dir + label,
                                 BERT_tokenizer=BERT_BASE)
    result = pd.merge(response, test, left_index=True, right_index=True)

    # write out raw results for this label
    result.to_csv(model_top_dir + label + '/' + results_file, header=True, index=False)

    # convert label column to a categorical, using same method as
    # used when building model from labelled data
    result[label] = convert_to_cat(result[label])['labels']

    # apply thresholding
    # Each row is handed over as a plain (probability, prediction) tuple,
    # so the thresholder never indexes a label-indexed Series by position.
    result[PREDICTION_LABEL] = result[[PROBABILITY_LABEL, PREDICTION_LABEL]].apply(
        lambda row: thresholder(tuple(row)), axis=1
    )

    # evaluate the prediction response series against
    # the reference standard key series for a
    # number of metrics and methods
    for method, cls in eval_methods:
        score = scorer(result[label], result[PREDICTION_LABEL], pos_label=cls, average=method)
        # keep precision, recall and f1 only, rounded to two places
        score = tuple(round(x, 2) for x in score[0:3])
        score_rows.append((label, method, cls) + score)

# dataframe to store the results
# dtype=object keeps every value exactly as collected (e.g. a class of 0
# next to a class of None), as when rows were appended one at a time.
all_scores = pd.DataFrame(score_rows, columns=list(eval_headers), dtype=object)

# write out the results
all_scores.to_csv(model_top_dir + results_file, header=True, index=False)

In [None]:
#########################################################################################
# Run a model over Jyoti's annotations, and write out as csv for
# visual inspection and analysis
#########################################################################################

# Annotations to test: workbook, sheet, and the text and label columns
file_to_test = '/home/icat/angus/annotations/jyotis_sexual_violence_annotations.xls'
sheet = 'output'
text_column = 'text'
label_column = 'output_JJ'

# Directory holding the models, and the model to run
# Alternative model directory: '/home/icat/angus/models/'
model_top_dir = '/home/icat/angus/models/from-am-brc-drive/'
model_to_test = 'sexual'

# File name for the csv that is written out
output_file = 'jj-test-biobert.csv'


# read in the data, keeping only the text and label columns
test_df = pd.read_excel(
    file_to_test,
    sheet_name=sheet,
    header=0,
    usecols=[text_column, label_column],
)

# lowercase, and collapse every run of whitespace to a single space
test_df[text_column] = test_df[text_column].apply(
    lambda text: " ".join(text.lower().split())
)

# load the model and run it over the text
response = load_and_run_BERT(
    sentences=test_df[text_column],
    trained_bert_model=f'{model_top_dir}{model_to_test}',
    BERT_tokenizer=BERT_BASE,
)

# merge model response in to data, matching rows on their index
result = pd.merge(response, test_df, left_index=True, right_index=True)

# NOTE(review): this is not the last expression in the cell, so a notebook
# will presumably not display it - confirm whether it is still wanted
result.head(50)

# write out raw result
result.to_csv(f'{model_top_dir}{model_to_test}/{output_file}', header=True, index=False)

In [None]:
#########################################################################################
# Testing with different versions of models
#########################################################################################

# Every model directory is tried with every base model
model_dirs = [
    '/home/icat/angus/models/',
    '/home/icat/angus/models/from-am-brc-drive/',
]
model_to_test = 'sexual'

base_models = [
    'bert-base-uncased',
    'bert-base-cased',
]

# these words fail if the wrong tokeniser is used with the models
sents_to_test = ['rubbish', 'relapse']

# Print one line per combination saying whether the run
# completed or raised an IndexError
for model_dir in model_dirs:
    for base in base_models:
        try:
            response = load_and_run_BERT(
                sentences=sents_to_test,
                trained_bert_model=model_dir + model_to_test,
                BERT_tokenizer=base,
            )
        except IndexError:
            status = 'IndexError'
        else:
            status = 'OK'
        print(model_dir.ljust(50), base.ljust(20), status)
    


In [None]:
#########################################################################################
# Testing problematic words
#########################################################################################

# Model to run, and the directory it is kept in
model_dir = '/home/icat/angus/models/from-am-brc-drive/'
model_to_test = 'sexual'

# these words fail if the wrong tokeniser is used with the models
sents_to_test = ['rubbish', 'relapse']

# Run the model over the words with the configured base model,
# then show the first rows of the response
response = load_and_run_BERT(
    sentences=sents_to_test,
    trained_bert_model=f'{model_dir}{model_to_test}',
    BERT_tokenizer=BERT_BASE,
)

response.head()