### Install needed libraries

In [None]:

# Colab setup: install the PyTorch / Hugging Face stack used by this notebook.
# Versions are not pinned; the captured output of this cell shows that
# transformers 4.5.1 was resolved the last time it was run.
!pip install torch torchvision
!pip install transformers 
!pip install seqeval
!pip install tensorboardx
!pip install simpletransformers 

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 5.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 17.4MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 24.6MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1
Collecting seqeval
[?25l  Downloading https://files.pythonhosted.

### Import needed libraries

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np

import gc
import requests
import os

from simpletransformers.classification import ClassificationModel
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, recall_score, precision_score, precision_recall_curve, auc

import torch

# Report whether a CUDA device is visible to this kernel, then the PyTorch build.
if torch.cuda.is_available():
    print("Cuda available")
else:
    print("CPU")
print("PyTorch version: ", torch.__version__)

Cuda available
PyTorch version:  1.8.1+cu101


In [None]:
# Mount Google Drive at /content/drive so the labeled data and the model output
# directory (both referenced by /content/drive/... paths in this notebook) are
# reachable. `google.colab` is only importable on Colab; skip this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Import Labeled Tweets

In [None]:
# Location of the labeled-tweets CSV on the mounted Drive. Despite the variable
# name, `url` is a local filesystem path, not a web address.
url = '/content/drive/MyDrive/Data/reconciled_labes.csv'
# error_bad_lines=False asks pandas to drop malformed rows instead of raising.
# NOTE(review): this keyword was deprecated in pandas 1.3 in favour of
# on_bad_lines='skip' and is rejected by pandas 2.x — confirm the runtime's
# pandas version before re-running on a fresh environment.
df = pd.read_csv(url, error_bad_lines=False)

(2000, 2)


Unnamed: 0,text,label
0,"#BidensACoward A coward indeed, Biden wants a...",1
1,@PandemicMy about how people they dislike 'use...,0
2,EXCLUSIVE: Workers and their worried spouses r...,0
3,We need direct Pandemic Relief Money Now! @Tra...,0
4,Refer to the thread here for explanation of th...,0


### Split into training and test sets

In [None]:
# Draw a reproducible 80% sample for training (the sample keeps df's row labels),
# use every row whose label was not sampled as the test set, then renumber both.
train = df.sample(frac=0.8, random_state=42)
test = df.loc[~df.index.isin(train.index)].reset_index(drop=True)
train = train.reset_index(drop=True)

### Train the classifier

In [None]:
# Settings passed as `args` to ClassificationModel in the training step below.
args = dict(
    # Where outputs and cached features are written.
    output_dir='/content/drive/MyDrive/Colab Notebooks/covid_electra',
    cache_dir='cache/',

    # Precision, sequence length and optimisation.
    fp16=False,
    fp16_opt_level='O1',
    max_seq_length=256,
    train_batch_size=16,
    eval_batch_size=8,
    gradient_accumulation_steps=1,
    num_train_epochs=5,
    weight_decay=0,
    learning_rate=2.1608519231372816e-05,  # alternative value kept for reference: 4.203713852453231e-05
    # adam_epsilon (1e-6) and warmup_ratio (0.06) are candidate overrides that
    # are currently disabled, so they are not part of this dict.
    warmup_steps=0,
    max_grad_norm=1.0,

    # Logging, periodic evaluation and checkpointing.
    logging_steps=100,
    evaluate_during_training=True,
    evaluate_during_training_steps=100,
    save_steps=2000,
    eval_all_checkpoints=True,
    use_tensorboard=True,

    # Always start from a clean slate when the cell is re-run.
    overwrite_output_dir=True,
    reprocess_input_data=True,
)

# Keep only the two columns the classifier consumes. The explicit .copy() makes
# each frame independent of `train` / `test`, so the column assignment below
# does not raise pandas' SettingWithCopyWarning and never touches the originals.
train_df = train[['text', 'label']].copy()
eval_df = test[['text', 'label']].copy()

# Lower-case the tweet text in both frames so training and evaluation inputs
# are normalised the same way.
train_df['text'] = train_df['text'].str.lower()
eval_df['text'] = eval_df['text'].str.lower()

# Build an ELECTRA-base (discriminator) sequence classifier configured by `args`.
model = ClassificationModel(
    "electra",
    "google/electra-base-discriminator",
    use_cuda=True,
    args=args,
)

# Fine-tune on train_df. The evaluation that runs during training uses
# `eval_df` — the lower-cased frame that eval_model() scores afterwards —
# rather than the raw `test` frame, so periodic evaluation sees text that was
# preprocessed exactly like the training text.
model.train_model(train_df, eval_df=eval_df)

# Score the held-out frame with the fine-tuned model.
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

# For every row of model_outputs take the column index of the largest value
# and store it on `test` as the predicted label.
test['Electra_label'] = np.argmax(model_outputs, axis=1)

# Drop the references created above, then ask Python and PyTorch to release
# the memory they were holding.
del model, result, model_outputs, wrong_predictions
gc.collect()
torch.cuda.empty_cache()

### Results

In [None]:
from sklearn.metrics import precision_recall_curve, auc

In [None]:
def report_results(A, B):
    """Print classification metrics for candidate values ``A`` against ground truth ``B``.

    Parameters
    ----------
    A : pandas.Series or array-like
        Candidate (predicted) values.
    B : pandas.Series or array-like
        Ground-truth labels.

    Returns
    -------
    None
        The metrics are printed, not returned.

    Notes
    -----
    * ``A`` and ``B`` are combined into one DataFrame (Series are aligned on
      their index) and rows with a missing value on either side are dropped
      before scoring.
    * Precision, recall and F1 use ``average='weighted'``.
    * ROC AUC and AUPRC are computed from the values in ``A`` as given. If
      ``A`` holds hard class labels rather than scores, the curves are built
      from those label values only; pass scores or probabilities as ``A`` when
      threshold-swept curves are wanted.
    """
    # Inputs without a usable ``name`` (unnamed Series, lists, arrays) get a
    # placeholder so the header can always be printed; the f-string below also
    # copes with non-string names such as integer column labels.
    a_name = getattr(A, 'name', None)
    b_name = getattr(B, 'name', None)
    if a_name is None:
        a_name = '<unnamed>'
    if b_name is None:
        b_name = '<unnamed>'

    # Pair the inputs and discard incomplete rows.
    paired = pd.DataFrame({'A': A, 'B': B}).dropna()
    candidate = paired['A']
    truth = paired['B']

    acc = accuracy_score(truth, candidate)
    f1 = f1_score(truth, candidate, average='weighted')
    prec = precision_score(truth, candidate, average='weighted')
    rec = recall_score(truth, candidate, average='weighted')
    ROC = roc_auc_score(truth, candidate, average='weighted')

    # Only the curve points are needed for the area; thresholds are discarded.
    precision, recall, _ = precision_recall_curve(truth, candidate)
    AUPRC = auc(recall, precision)

    print(f'Candidate: {a_name} | Ground Truth: {b_name}\n')
    print('accuracy: %0.2f \nprecision: %0.2f \nrecall: %0.2f \nF1 score: %0.2f \nROC AUC: %0.2f \nAUPRC: %0.2f \n' % (acc, prec, rec, f1, ROC, AUPRC))
    

In [None]:
# Compare the predictions stored in test['Electra_label'] with the `label` column.
report_results(test['Electra_label'], test['label'])

Candidate: Electra_label | Ground Truth: label

accuracy: 0.95 
precision: 0.94 
recall: 0.95 
F1 score: 0.94 
ROC AUC: 0.69 
AUPRC: 0.53 



# W&B Sweep

This sweep is used to identify the best hyperparameters (number of training epochs and learning rate) for the final training run of the model.

In [None]:
# Imports needed for the Weights & Biases sweep, followed by the W&B login.
# NOTE(review): wandb.login() may prompt interactively for an API key when no
# credentials are stored in this runtime — confirm before running unattended.
import wandb
import logging
from simpletransformers.classification import ClassificationArgs
wandb.login()

In [None]:
# Search definition for the W&B sweep: Bayesian optimisation that maximises the
# logged "auprc" metric over the number of epochs and the learning rate.
sweep_config = dict(
    method="bayes",  # other options: grid, random
    metric=dict(name="auprc", goal="maximize"),
    parameters=dict(
        num_train_epochs=dict(min=5, max=10),
        learning_rate=dict(min=0, max=5.4441040703663804e-05),
    ),
    # Hyperband early termination is currently disabled:
    # early_terminate=dict(type="hyperband", min_iter=6),
)

# Register the sweep in the "Covid Classifier2" project and keep the returned id.
sweep_id = wandb.sweep(sweep_config, project="Covid Classifier2")

# Logging: INFO by default, but only warnings and above from the logger
# named "transformers".
logging.basicConfig(level="INFO")
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel("WARNING")

# Load the labeled tweets from the mounted Drive; malformed rows are dropped
# instead of raising.
url = '/content/drive/MyDrive/Data/reconciled_labes.csv'
df = pd.read_csv(url, error_bad_lines=False)

# Optional down-sampling of the negative class (disabled):
#   t = df[df['label'] == 1]
#   f = df[df['label'] == 0].sample(n = 500)
#   df = pd.concat([t,f])

# Reproducible 80/20 split. The sample keeps df's row labels, so the test set
# is every row whose label was not drawn. Both frames are then renumbered and
# their text is lower-cased.
train_df = df.sample(frac=0.8, random_state=907)
test_df = (
    df.loc[~df.index.isin(train_df.index)]
    .reset_index(drop=True)
    .assign(text=lambda frame: frame['text'].str.lower())
)
train_df = (
    train_df
    .reset_index(drop=True)
    .assign(text=lambda frame: frame['text'].str.lower())
)

# Fixed simpletransformers settings shared by every sweep run. learning_rate
# and num_train_epochs are intentionally not set here: they are the two
# parameters listed in sweep_config.
model_args = ClassificationArgs()

# Reproducibility
model_args.manual_seed = 907

# Input handling
model_args.max_seq_length = 256
model_args.reprocess_input_data = True
model_args.no_cache = True

# Batch sizes
model_args.train_batch_size = 16
model_args.eval_batch_size = 16

# Evaluation while training
model_args.evaluate_during_training = True
model_args.evaluate_during_training_silent = False
model_args.evaluate_during_training_steps = 10

# Output handling
model_args.no_save = True
model_args.overwrite_output_dir = True

# Training scope and experiment tracking
model_args.train_custom_parameters_only = False
model_args.wandb_project = "Covid Classifier2"

# training function
def train():
    """Run one sweep trial.

    Starts (or resumes) a wandb run, builds a DeBERTa-base classifier from the
    shared ``model_args`` plus the hyperparameters in ``wandb.config``, trains
    it on ``train_df`` while evaluating on ``test_df``, and finally calls
    ``wandb.join()``.
    """
    wandb.init(resume=True)

    def _rounded_accuracy(truth, predictions):
        # Round each prediction to the nearest integer before scoring.
        return accuracy_score(truth, [round(p) for p in predictions])

    # Alternative checkpoint kept for reference:
    #   "electra", "google/electra-base-discriminator"
    classifier = ClassificationModel(
        "deberta",
        "microsoft/deberta-base",
        use_cuda=True,
        args=model_args,
        sweep_config=wandb.config,
    )

    # `accuracy` is passed as an extra metric alongside the library's own.
    classifier.train_model(train_df, eval_df=test_df, accuracy=_rounded_accuracy)

    wandb.join()

# Launch the sweep agent on the sweep that wandb.sweep(...) registered earlier
# in this cell. Passing the returned `sweep_id` (rather than a hard-coded id
# from a previous session) guarantees the agent explores the search space
# defined in `sweep_config`, and means the newly created sweep is actually used.
# To attach to an already existing sweep instead, pass that sweep's id here and
# do not create a new one.
wandb.agent(sweep_id, train)