## Install Dependencies

In [1]:
%%capture
!pip install wandb
!pip install simpletransformers

In [2]:
%%writefile setup.sh
# Clone NVIDIA apex and install it with its C++ and CUDA extensions.
# Stop at the first failing command instead of carrying on to the next step.
set -e

# Clone only when there is no checkout yet, so re-running the script does not
# fail on "git clone" because ./apex already exists.
[ -d apex ] || git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

Overwriting setup.sh


In [3]:
%%capture
# Run setup.sh with sh; %%capture keeps the command output out of the notebook.
!sh setup.sh

In [4]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize

ln: /usr/bin/nvidia-smi: Operation not permitted


## Import Libraries

In [5]:
from google.colab import drive
from sqlite3 import Cursor, Connection, connect
from simpletransformers.classification import ClassificationModel
import pandas as pd
import wandb
import logging
import sklearn
from sklearn.metrics import f1_score, precision_score, recall_score
import GPUtil as GPU
import psutil
import humanize
import os
from os import listdir

ModuleNotFoundError: No module named 'google.colab'

## Setup GPU

In [None]:
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and it isn't guaranteed, so fall back to None
# instead of failing with an IndexError when no GPU is visible.
gpu = GPUs[0] if GPUs else None
def printm():
    """Print free host RAM, the size of this process and, if a GPU was found, its memory stats."""
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " |     Proc size: " + humanize.naturalsize(process.memory_info().rss))
    if gpu is None:
        # No GPU object to read the memory figures from.
        print("GPU RAM Free: n/a (no GPU detected)")
        return
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total     {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

## Connect to database

In [None]:
#Mount Google Drive for persistent storage
drive.mount('/content/drive', force_remount=True)

# Initialise DB connection
# sqlite3's connect() creates a new, empty database when the file does not
# exist, which would only surface later as "no such table". Check for the
# file first and fail with a clear message instead.
_db_path = '/content/drive/My Drive/TAR/DB.db'
if not os.path.isfile(_db_path):
    raise FileNotFoundError(f'SQLite database not found: {_db_path}')
_data_base: Connection = connect(_db_path)
_cursor: Cursor = _data_base.cursor()

## Data Preprocessing

In [None]:
def build_dataset_relevance_classifier(table: str, cursor: "Cursor | None" = None):
    """Build sentence-level relevance data from a token table.

    The rows of ``table`` are read as ``(Word, TokenSet)`` pairs ordered by
    ``FileName, Line``. Consecutive words are joined into one sentence; a row
    whose ``Word`` is empty (or NULL) ends the current sentence. A sentence is
    labelled 1 when at least one of its words has a ``TokenSet`` different
    from ``'O'`` and 0 otherwise.

    Args:
        table: Name of the table to read. It is concatenated into the SQL text
            (a table name cannot be bound as a query parameter), so it must be
            a trusted identifier.
        cursor: Cursor to run the query on. When omitted, the notebook-level
            ``_cursor`` is used.

    Returns:
        A tuple ``(X, Y)`` of two equally long lists: the sentences and their
        0/1 relevance labels.
    """
    if cursor is None:
        cursor = _cursor

    X = []
    Y = []
    sentence = ''
    relevance = 0
    query = "SELECT Word,TokenSet FROM " + table + " Order BY FileName, Line"
    for word, token_set in cursor.execute(query):
        if word is None or len(word) == 0:
            # Separator row: close the sentence collected so far, if any.
            if sentence != '':
                X.append(sentence)
                Y.append(relevance)
                sentence = ''
                relevance = 0
            continue

        # Every tag other than 'O' marks the sentence as relevant; this
        # includes an empty tag.
        # NOTE(review): confirm that an empty TokenSet is meant to count as relevant.
        if token_set != 'O':
            relevance = 1

        # These punctuation marks are attached to the preceding word without a space.
        if word in (",", "-", "."):
            sentence = (sentence + word).strip()
        else:
            sentence = (sentence + ' ' + word).strip()

    # The last sentence has no separator row after it when the table does not
    # end with an empty word, so it has to be stored here.
    if sentence != '':
        X.append(sentence)
        Y.append(relevance)
    return X, Y

In [None]:
# Build the (sentences, labels) pair of every split, in train / dev / test order.
(
    (relevance_train_X, relevance_train_Y),
    (relevance_dev_X, relevance_dev_Y),
    (relevance_test_X, relevance_test_Y),
) = (
    build_dataset_relevance_classifier(table=split_table)
    for split_table in ("Train_Token", "Dev_Token", "Test_Token")
)

In [None]:
# Number of sentences per split, one line each.
print(
    f'Relevance train size: {len(relevance_train_X)}',
    f'Relevance dev size: {len(relevance_dev_X)}',
    f'Relevance test size: {len(relevance_test_X)}',
    sep='\n',
)

In [None]:
def _print_relevance_stats(split_name, labels):
    """Print the relevant / irrelevant sentence counts of one split and the share of relevant ones.

    Args:
        split_name: Label shown in square brackets at the start of each line.
        labels: List of 0/1 relevance labels of the split.
    """
    relevant = labels.count(1)
    irrelevant = labels.count(0)
    print(f'[{split_name}] relevant sentences: {relevant}')
    print(f'[{split_name}] irrelevant sentences: {irrelevant}')
    if labels:
        print(f'[{split_name}] % relevant: {relevant / len(labels) * 100}')
    else:
        # An empty split has no percentage; avoid dividing by zero.
        print(f'[{split_name}] % relevant: n/a (no sentences)')
    print()


for _split_name, _split_labels in (
    ('Train', relevance_train_Y),
    ('Dev', relevance_dev_Y),
    ('Test', relevance_test_Y),
):
    _print_relevance_stats(_split_name, _split_labels)

In [None]:
# Column layout shared by the three splits: the sentences go under 'text',
# their relevance labels under 'labels'.
_RELEVANCE_COLUMNS = ('text', 'labels')

relevance_train = dict(zip(_RELEVANCE_COLUMNS, (relevance_train_X, relevance_train_Y)))
relevance_dev = dict(zip(_RELEVANCE_COLUMNS, (relevance_dev_X, relevance_dev_Y)))
relevance_test = dict(zip(_RELEVANCE_COLUMNS, (relevance_test_X, relevance_test_Y)))

In [None]:
# Turn each split's column dict into a DataFrame (train, dev, test).
relevance_train_df, relevance_dev_df, relevance_test_df = (
    pd.DataFrame(data=split_columns)
    for split_columns in (relevance_train, relevance_dev, relevance_test)
)

## Data Balancing

In [None]:
# Class distribution of the training labels: counts, bar chart and summary statistics.
train_labels = relevance_train_df['labels']
train_label_counts = train_labels.value_counts()

print('train_df balance:')
print(train_label_counts)

train_label_counts.plot(kind='bar', title='Count (target)')

train_labels.describe()

In [None]:
# NOTE(review): this cell produces the same output as the preceding cell
# (label counts, bar chart, summary statistics of the training labels);
# it looks like a copy and could probably be removed.
labels_column = relevance_train_df['labels']

print('train_df balance:')
print(labels_column.value_counts())

labels_column.value_counts().plot(kind='bar', title='Count (target)')

labels_column.describe()

In [None]:
# Balance dataset: over-sampling

# Class count, looked up by label. value_counts() orders its result by
# frequency, so unpacking it positionally only matches (class 0, class 1) when
# class 0 is the larger class, and fails when only one class is present.
label_counts = relevance_train_df['labels'].value_counts()
count_class_0 = int(label_counts.get(0, 0))
count_class_1 = int(label_counts.get(1, 0))

# Divide by class
df_class_0 = relevance_train_df[relevance_train_df['labels'] == 0]
df_class_1 = relevance_train_df[relevance_train_df['labels'] == 1]

# Re-sample the smaller class with replacement up to the size of the larger
# one. Nothing is re-sampled when the classes already have the same size or
# when one of them is empty (there is nothing to draw from). The fixed
# random_state makes the drawn rows the same on every run.
df_class_0_over = df_class_0
df_class_1_over = df_class_1
if 0 < count_class_1 < count_class_0:
    df_class_1_over = df_class_1.sample(count_class_0, replace=True, random_state=42)
elif 0 < count_class_0 < count_class_1:
    df_class_0_over = df_class_0.sample(count_class_1, replace=True, random_state=42)
train_over = pd.concat([df_class_0_over, df_class_1_over], axis=0)

print('Random over-sampling:')
print(train_over['labels'].value_counts())

train_over['labels'].value_counts().plot(kind='bar', title='Count (labels)')

## Model Comparison

In [None]:
# Log in to Weights & Biases through its command-line client.
!wandb login 

In [None]:
# Logging: INFO for everything, but only warnings and above from the "transformers" logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# (model type, pretrained model name) pairs to compare.
# ("bert", "bert-base-cased") was part of an earlier version of this list and is left out.
_model_specs = [
  ("roberta", "roberta-base"),
  ("distilbert", "distilbert-base-cased"),
  ("roberta", "distilroberta-base"),
  ("electra", "google/electra-base-discriminator"),
  ("electra", "google/electra-small-discriminator"),
]

models_dict = {
  'type': [model_type for model_type, _ in _model_specs],
  'name': [model_name for _, model_name in _model_specs],
}

models_df = pd.DataFrame(data=models_dict)

# Same args for all the models
train_args = {
    # input handling
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "use_cached_eval_features": True,
    # training / evaluation schedule
    "evaluate_during_training": True,
    "max_seq_length": 128,
    "num_train_epochs": 3,
    "evaluate_during_training_steps": 1000,
    # Weights & Biases logging
    "wandb_project": "tar-classification-model-comparison",
    "wandb_kwargs": {"name": 'test'},
    # checkpointing
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    # batch sizes
    "train_batch_size": 64,
    "eval_batch_size": 64,
}

In [None]:
# Load the RoBERTa model saved on the mounted Drive, with CUDA disabled.
# The path is absolute, like the Drive mount point and the database path, so
# the cell does not depend on the current working directory.
class_model = ClassificationModel(
  'roberta', 
  '/content/drive/My Drive/TAR/relevance_outputs/roberta/best_model', 
  args=train_args,
  use_cuda=False
)

In [None]:
# Classify one sample text and show both values returned by predict().
predictions, raw_outputs = class_model.predict(["Double Dragon\nAPT41, a dual espionage and\n cyber crime operation\n APT41"])
# The original print referenced an undefined, garbled name and raised a NameError.
print(predictions)
print(raw_outputs)


In [None]:
# Train every model listed in models_df with the shared arguments.
for _, model in models_df.iterrows():
  # Key the directories by model *name*: two entries have the type "roberta"
  # and two have "electra", so directories keyed by type would be overwritten
  # by the later model. "/" in hub names is replaced to keep one flat folder
  # per model.
  run_dir = f'outputs/{model["name"].replace("/", "_")}'

  # Per-model copy, so the shared train_args dict is not modified by the loop.
  model_args = {
    **train_args,
    "output_dir": run_dir,
    "best_model_dir": f'{run_dir}/best_model',
    "wandb_kwargs": {"name": f'oversampled/{model["name"]}'},
  }

  class_model = ClassificationModel(
    model["type"],
    model["name"],
    args=model_args
  )

  print(f'>>>Training model {model["name"]}')

  # Train on the over-sampled frame (the runs are logged as "oversampled/...")
  # and evaluate during training on the dev split, so the test split stays
  # untouched until the final evaluation.
  class_model.train_model(
    train_over, 
    eval_df=relevance_dev_df, 
    acc=sklearn.metrics.accuracy_score, 
    f1=sklearn.metrics.f1_score, 
    precision=sklearn.metrics.precision_score, 
    recall=sklearn.metrics.recall_score
  )

  # Optional clean-up between models (disabled):
  #%rm -r cache_dir
  #%rm -r outputs 


In [None]:
# Evaluate the model
result, model_outputs, wrong_predictions = class_model.eval_model(
  relevance_test_df, 
  acc=sklearn.metrics.accuracy_score, 
  f1=sklearn.metrics.f1_score, 
  precision=sklearn.metrics.precision_score, 
  recall=sklearn.metrics.recall_score

)

In [None]:
# Show `result`, the first of the three values returned by eval_model.
print(result)
# Printing of the second returned value (model_outputs) is disabled:
#print(model_outputs)


In [None]:
# Recursively archive /content/outputs into /content/relevance_balanced.zip.
!zip -r /content/relevance_balanced.zip /content/outputs


In [None]:
# Prints an empty line.
# NOTE(review): looks like a leftover placeholder cell — confirm it can be removed.
print("")