## Install Classes

In [1]:
%%capture
!pip install wandb
!pip install simpletransformers

In [2]:
%%writefile setup.sh

git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

Overwriting setup.sh


In [3]:
%%capture
!sh setup.sh

In [4]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize

ln: /usr/bin/nvidia-smi: Operation not permitted


## Import Classes

In [11]:
from google.colab import drive
from sqlite3 import Cursor, Connection, connect
from simpletransformers.classification import ClassificationModel
import pandas as pd
import wandb
import logging
import sklearn
from sklearn.metrics import f1_score, precision_score, recall_score
import GPUtil as GPU
import psutil
import humanize
import os
from os import listdir



## Setup GPU

In [13]:
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " |     Proc size: " + humanize.naturalsize(process.memory_info().rss))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total     {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

[]


IndexError: list index out of range

## Connect to database

In [None]:
#Mount Google Drive for persistent storage
drive.mount('/content/drive', force_remount=True)

# Initialise DB connection
_data_base: Connection = connect('/content/drive/My Drive/TAR/DB.db')
_cursor: Cursor = _data_base.cursor()

## Data Preprocessing

In [None]:
def build_dataset_term_label_classifier(table: str):
    query = "SELECT Text, Type FROM " + table + " ORDER BY Type, Text;"
    table_content = list(_cursor.execute(query))
    X = list(map(lambda entry: entry[0], table_content))
    Y = list(map(lambda entry: entry[1], table_content))
    return X, Y
    

In [None]:
term_label_train_X, term_label_train_Y = build_dataset_term_label_classifier(table="Train_TermLabel")
term_label_dev_X, term_label_dev_Y = build_dataset_term_label_classifier(table="Dev_TermLabel")
term_label_test_X, term_label_test_Y = build_dataset_term_label_classifier(table="Test_TermLabel")

In [None]:
print(f'Term label train size: {len(term_label_train_X)}')
print(f'Term label dev size: {len(term_label_dev_X)}')
print(f'Term label test size: {len(term_label_test_X)}')

In [None]:
print(f'[Train] Action labels: {len(list(filter(lambda entry: entry == "Action", term_label_train_Y)))}')
print(f'[Train] Modifier labels: {len(list(filter(lambda entry: entry == "Modifier", term_label_train_Y)))}')
print(f'[Train] Object labels: {len(list(filter(lambda entry: entry == "Object", term_label_train_Y)))}')
print(f'[Train] Subject labels: {len(list(filter(lambda entry: entry == "Subject", term_label_train_Y)))}')
print()

print(f'[Dev] Action labels: {len(list(filter(lambda entry: entry == "Action", term_label_dev_Y)))}')
print(f'[Dev] Modifier labels: {len(list(filter(lambda entry: entry == "Modifier", term_label_dev_Y)))}')
print(f'[Dev] Object labels: {len(list(filter(lambda entry: entry == "Object", term_label_dev_Y)))}')
print(f'[Dev] Subject labels: {len(list(filter(lambda entry: entry == "Subject", term_label_dev_Y)))}')
print()

print(f'[Test] Action labels: {len(list(filter(lambda entry: entry == "Action", term_label_test_Y)))}')
print(f'[Test] Modifier labels: {len(list(filter(lambda entry: entry == "Modifier", term_label_test_Y)))}')
print(f'[Test] Object labels: {len(list(filter(lambda entry: entry == "Object", term_label_test_Y)))}')
print(f'[Test] Subject labels: {len(list(filter(lambda entry: entry == "Subject", term_label_test_Y)))}')

In [None]:
term_label_train = {'text':term_label_train_X, 'labels':term_label_train_Y}
term_label_dev = {'text':term_label_dev_X, 'labels':term_label_dev_Y}
term_label_test = {'text':term_label_test_X, 'labels':term_label_test_Y}

term_label_train_df = pd.DataFrame(term_label_train)
term_label_dev_df = pd.DataFrame(term_label_dev)
term_label_test_df = pd.DataFrame(term_label_test)

## Relevance Classifier

In [None]:
print('term_label_train_df balance:')
print(term_label_train_df['labels'].value_counts())

term_label_train_df['labels'].value_counts().plot(kind='bar', title='Count (target)')

In [None]:
term_label_train_df['labels'] = term_label_train_df['labels'].replace(['Action', 'Modifier', 'Object', 'Subject'], [0, 1, 2, 3]).astype(int)
term_label_test_df['labels'] = term_label_test_df['labels'].replace(['Action', 'Modifier', 'Object', 'Subject'], [0, 1, 2, 3]).astype(int)
term_label_dev_df['labels'] = term_label_dev_df['labels'].replace(['Action', 'Modifier', 'Object', 'Subject'], [0, 1, 2, 3]).astype(int)
print(term_label_train_df['labels'].value_counts())

In [None]:
term_label_train_df['labels'] = term_label_train_df['labels'].astype(int)
term_label_test_df['labels'] = term_label_test_df['labels'].astype(int)
term_label_dev_df['labels'] = term_label_dev_df['labels'].astype(int)


term_label_test_df['labels'].describe()

# term_label_train_df['labels'] = term_label_train_df['labels'].apply(lambda x: list(map(int, x)))
# term_label_test_df['labels'] = term_label_test_df['labels'].apply(lambda x: list(map(int, x)))
# term_label_dev_df['labels'] = term_label_dev_df['labels'].apply(lambda x: list(map(int, x)))

## Model Comparison

In [None]:
!wandb login 

In [None]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

models_dict = {
  'type': ["bert", "roberta", "distilbert", "roberta", "electra", "electra"],
  'name': ["bert-base-cased", "roberta-base", "distilbert-base-cased", "distilroberta-base", "google/electra-base-discriminator", "google/electra-small-discriminator"]
}

models_df = pd.DataFrame(data=models_dict)

# Same args for all the models
train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "use_cached_eval_features": True,
    "evaluate_during_training": True,
    "max_seq_length": 128,
    "num_train_epochs": 3,
    "evaluate_during_training_steps": 1000,
    "wandb_project": "tar-type-model-comparison",
    "wandb_kwargs": {"name": 'test'},
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "train_batch_size": 64,
    "eval_batch_size": 64,
}

In [None]:
def f1_multiclass(labels, preds):
  return f1_score(labels, preds, average='micro')

def precision_multiclass(labels, preds):
  return precision_score(labels, preds, average='micro')

def recall_multiclass(labels, preds):
  return recall_score(labels, preds, average='micro')

In [None]:
for _, model in models_df.iterrows():
  train_args["output_dir"] = f'outputs/{model["type"]}'
  train_args["best_model_dir"] = f'outputs/{model["type"]}/best_model'
  train_args["wandb_kwargs"] = {"name": f'1/{model["name"]}'}

  class_model = ClassificationModel(
    model["type"], 
    model["name"], 
    num_labels = 4,
    args = train_args
  )

  print(f'>>>Training model {model["name"]}')

  class_model.train_model(
    term_label_train_df, 
    eval_df=term_label_test_df, 
    acc=sklearn.metrics.accuracy_score, 
    f1=f1_multiclass, 
    precision=precision_multiclass, 
    recall=recall_multiclass
  )
