## Experiments with Prometheus BERT models

In [None]:
################################################################################
# Strings and constants
################################################################################

# Machine-specific local settings.
# Define each machine's settings as a class, like the ones below.
# Select which settings to use by assigning
# one of these classes to LOCALS.

# Locals when on colab
class ColabLocals:
  """Settings used when the notebook runs on Colab."""

  # Device name handed to torch.device() when the trained model is moved
  # before running it on the test sentences.
  TORCH_TARGET_DEVICE = 'cpu'

# Locals when on cusp_vm
class CuspVmLocals:
  """Settings used when the notebook runs on cusp_vm."""

  # Device name handed to torch.device() when the trained model is moved
  # before running it on the test sentences.
  TORCH_TARGET_DEVICE = 'cpu'
    
# Locals when on icat
class IcatLocals:
  """Settings used when the notebook runs on icat."""

  # Device name handed to torch.device() when the trained model is moved
  # before running it on the test sentences.
  TORCH_TARGET_DEVICE = 'cpu'

# Dataset descriptions.
# Define each dataset as a class, like the one below.
# Select which dataset to use by assigning
# one of these classes to DATA.

# Imdb movie reviews dataset
# Standard sentiment classification task
# Contains around 5000 rows
# Note that on colab, you might need to limit data slice to 1000 or less
class ImdbDescription:
  """Description of the IMDB movie-review sentiment CSV.

  Holds the file location, the names of the text and label columns, a few
  sample sentences to run through a trained model, and the slice bounds
  applied to the rows and to each review's text when the data is loaded.
  """

  # Where the CSV lives and which columns hold the text and the label.
  PATH = './sentence_classifier/sentence_classifier/imdb_5k_reviews.csv'
  TEXT_COL = 'review'
  LABEL_COL = 'sentiment'

  # Sentences passed to the trained model as a quick smoke test.
  TEST_TEXTS = [
      'hello my name is link i am in love with princess zelda',
      'this is just a test sentence',
      'project',
  ]

  # Row range of the dataset to use, applied as rows[SLICE_FROM:SLICE_TO].
  # None leaves that side unbounded (start or end of the dataset).
  SLICE_FROM = None
  SLICE_TO = 100

  # Character range kept from each text, applied as text[TEXT_FROM:TEXT_TO].
  # None leaves that side unbounded (start or end of the text).
  TEXT_FROM = None
  TEXT_TO = 1000

# Model related parameters
# Training settings, passed to train_BERT as n_epochs and test_size.
# test_size is presumably the fraction of rows held out for evaluation;
# confirm against train_BERT.
TRAINING_EPOCHS = 1
TRAINING_TEST_SIZE = 0.2

# Tokenizer name, passed as BERT_tokenizer for both training and inference.
TOKENIZER = 'bert-base-uncased'

In [None]:
################################################################################
# Imports
################################################################################

# These try blocks guard imports that are sometimes problematic
# because the packages are not installed on every machine being used

# transformers is needed by BERT.py
try:
  import transformers
except ImportError as e:
  !pip install transformers

# nltk is used by NLP_Utils.py, which is imported by BERT.py
try:
  import nltk
except ImportError as e:
  !pip install nltk

# spacy is used by NLP_Utils.py, which is imported by BERT.py
try:
  import spacy
except ImportError as e:
  !pip install spacy

# Prometheus classes
from sentence_classifier.sentence_classifier.BERT import train_BERT
from sentence_classifier.sentence_classifier.BERT import load_and_run_BERT

# Other imports
import pandas as pd
import torch

In [None]:
################################################################################
# Load data and run Prometheus sentence classifier
################################################################################

# Select which machine settings and which dataset description to use
LOCALS = IcatLocals
DATA = ImdbDescription

# Read in some data for training.
# NOTE(review): header=1 tells pandas to take the SECOND line of the file as
# the column names and to discard the first line. Confirm the CSV really has
# an extra line above its header row; if the column names are on the first
# line this should be header=0 (otherwise TEXT_COL / LABEL_COL lookups fail).
df = pd.read_csv(DATA.PATH, header=1)

# Keep only the requested rows. Take an explicit copy so the column
# assignment below writes to an independent frame instead of a slice of the
# parsed one (this is what triggers pandas' SettingWithCopyWarning).
df = df[DATA.SLICE_FROM:DATA.SLICE_TO].copy()

# Normalise each text: trim surrounding whitespace, lowercase it, then keep
# only the configured character range.
df[DATA.TEXT_COL] = df[DATA.TEXT_COL].apply(
    lambda text: text.strip().lower()[DATA.TEXT_FROM:DATA.TEXT_TO]
)

# Last expression of the cell: display the final rows as a sanity check.
df.tail()

In [None]:
# Train the classifier on the prepared text and label columns.
# output_dir=None is forwarded as-is; presumably this means nothing is
# written to disk - confirm against train_BERT.
bert_model = train_BERT(
    sentences=df[DATA.TEXT_COL],
    labels=df[DATA.LABEL_COL],
    BERT_tokenizer=TOKENIZER,
    test_size=TRAINING_TEST_SIZE,
    n_epochs=TRAINING_EPOCHS,
    output_dir=None,
)

# Last expression of the cell: display the 'stats' entry of the result.
bert_model['stats']

In [None]:
# Try the trained model on the dataset's sample sentences.
# The model must first be moved to the device selected in LOCALS.
model = bert_model['model']
model.to(torch.device(LOCALS.TORCH_TARGET_DEVICE))

# Last expression of the cell: display whatever load_and_run_BERT returns.
load_and_run_BERT(
    sentences=DATA.TEST_TEXTS,
    trained_bert_model=model,
    BERT_tokenizer=TOKENIZER,
)