# Text Classification with Bio-med RoBERTa


In [None]:
# Install the packages this notebook needs on a fresh Colab runtime.
# The commented-out lines are optional packages that this cell does not install.
#!pip install tensorflow
#!pip install transformers
!pip install datasets
!pip install transformers[torch]
#!pip install wandb
#!pip install text-preprocessing
#!pip install numba
#!pip install scikit-learn

In [None]:
import pandas as pd
import datasets
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os

## Prepare dataset


In [None]:
# Mount Google Drive so that the paths under /content/drive used below resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
dataset = 'new'  # 'new' -> NSCLC screening spreadsheets, 'old' -> pre-split hpv1 CSV files

DATA_DIR = '/content/drive/MyDrive/Bio-med Roberta/dataset'

# Size of the training split for the 'new' dataset: the first N rows of each class
# are used for training and every remaining row goes to the test split.
N_TRAIN_EXCLUDE = 30
N_TRAIN_INCLUDE = 10


def _load_screening_sheet(filename, label_col, label_map):
    """Read one screening spreadsheet and normalise it to Title / Abstract / Include/Exclude.

    Args:
        filename: Excel file name inside DATA_DIR.
        label_col: Name of the column holding the screening decision in that file.
        label_map: Mapping from the file's decision strings to 0 (exclude) / 1 (include).

    Returns:
        DataFrame with the decision strings replaced via `label_map`, rows containing
        missing values dropped, and `label_col` renamed to 'Include/Exclude'.
    """
    sheet = pd.read_excel(os.path.join(DATA_DIR, filename), header=0,
                          usecols=['Title', 'Abstract', label_col])
    return sheet.replace(label_map).dropna().rename(columns={label_col: 'Include/Exclude'})


def _load_hpv_split(filename):
    """Read one pre-split hpv1 CSV file and rename its columns to Title / Abstract / Include/Exclude."""
    split = pd.read_csv(os.path.join(DATA_DIR, filename), header=0,
                        usecols=['title', 'abstract', 'level1_labels'])
    return split.dropna().rename(
        columns={'title': 'Title', 'abstract': 'Abstract', 'level1_labels': 'Include/Exclude'})


def _with_info(frame):
    """Return a copy of `frame` with an 'info' column holding "<Title> <Abstract>".

    The single space keeps the last word of the title from being fused with the
    first word of the abstract before tokenization.
    """
    return frame.assign(info=frame['Title'].astype(str) + ' ' + frame['Abstract'].astype(str))


if dataset == 'new':
    include_exclude_map = {'Exclude': 0, 'Include': 1}
    first_pass_map = {
        'E1 - Review/editorial': 0, 'E3 - Study design': 0, 'E4 - Intervention': 0,
        'E5 - Disease (non-NSCLC)': 0, 'E6 - Population (non-RET+ NSCLC)': 0,
        'E7 - Animal/in vitro': 0, 'I1 - Include clinical': 1, 'I2 - Include EE': 1,
        'I3 - Include HSUV': 1, 'I4 - Include cost': 1}

    sheets = [
        _load_screening_sheet('01. NSCLC - Sent to CapeStart.xlsx', 'Include/Exclude',
                              include_exclude_map),
        _load_screening_sheet('09. NSCLC - Sent to CapeStart.xlsx', 'Decision',
                              include_exclude_map),
        _load_screening_sheet('12. NSCLC - Sent to CapeStart.xlsx', 'First pass final decision',
                              first_pass_map),
        _load_screening_sheet('A. NSCLC - Sent to CapeStart.xlsx', 'Accept or Reject Code',
                              {'Reject': 0, 'Accept': 1}),
    ]
    # ignore_index avoids duplicate row labels coming from the four separate sheets.
    df = _with_info(pd.concat(sheets, axis=0, ignore_index=True))

    # Rows whose decision string was not covered by a mapping match neither filter
    # and are therefore left out of both splits.
    df0 = df[df['Include/Exclude'] == 0]
    df1 = df[df['Include/Exclude'] == 1]

    # Train/test split: first N_TRAIN_EXCLUDE excluded + first N_TRAIN_INCLUDE included
    # rows for training, everything else for testing.
    # The label column is cast to int so it converts cleanly to a tensor even if a
    # sheet left it with object dtype.
    df_train = pd.concat([df0.iloc[:N_TRAIN_EXCLUDE, :], df1.iloc[:N_TRAIN_INCLUDE, :]],
                         axis=0).astype({'Include/Exclude': int})
    df_test = pd.concat([df0.iloc[N_TRAIN_EXCLUDE:, :], df1.iloc[N_TRAIN_INCLUDE:, :]],
                        axis=0).astype({'Include/Exclude': int})

elif dataset == 'old':
    df_train = _with_info(_load_hpv_split('hpv1_train.csv'))
    X_train = df_train['info'].to_numpy().reshape(-1)
    y_train = df_train['Include/Exclude'].to_numpy().reshape(-1)

    df_test = _with_info(_load_hpv_split('hpv1_test.csv'))
    X_test = df_test['info'].to_numpy().reshape(-1)
    y_test = df_test['Include/Exclude'].to_numpy().reshape(-1)

else:
    # Fail here rather than with a NameError in a later cell.
    raise ValueError(f"Unknown dataset {dataset!r}; expected 'new' or 'old'")

# Train/Test Split

In [None]:
#print(df1.shape, df2.shape, df.shape)

(200, 4) (195, 4) (395, 4)


In [None]:
# Both dataset variants leave `df_train` / `df_test` with an 'info' text column and an
# 'Include/Exclude' label column, so features and labels are derived the same way for
# either one. (The previous 'old' branch read `X_data` / `y_data`, which are never
# defined in this notebook, and raised a NameError.)
X_train = df_train['info'].to_numpy().reshape(-1)
y_train = df_train['Include/Exclude'].to_numpy().reshape(-1)
X_test = df_test['info'].to_numpy().reshape(-1)
y_test = df_test['Include/Exclude'].to_numpy().reshape(-1)

# The tokenizer is called with plain Python lists of strings.
X_train = X_train.reshape((-1,)).tolist()
X_test = X_test.reshape((-1,)).tolist()

assert len(X_train) == len(y_train), "train texts and labels are misaligned"
assert len(X_test) == len(y_test), "test texts and labels are misaligned"

In [None]:
# Display the training labels as a quick sanity check of the split.
y_train

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

# Load Model

In [None]:
MODEL_NAME = 'allenai/biomed_roberta_base'
MAX_LEN = 512  # maximum number of tokens per encoded sequence

# Pretrained encoder with a freshly initialised 2-class classification head.
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# `model_max_length` is the tokenizer setting that `truncation=True` falls back to when
# no explicit `max_length` is given at call time; a `max_length` keyword passed to
# `from_pretrained` does not set that limit.
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME, model_max_length=MAX_LEN)

Downloading (…)lve/main/config.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/656M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at allenai/biomed_roberta_base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

# Tokenize & encode

In [None]:
def roberta_encoder(tokenizer, data, label, max_length=512):
  """Tokenize texts and bundle them with their labels into a `datasets.Dataset`.

  Args:
    tokenizer: Tokenizer callable accepting `padding`, `truncation`, `max_length`
      and `return_tensors` keyword arguments.
    data: List of input strings.
    label: Sequence of class labels aligned one-to-one with `data`.
    max_length: Maximum number of tokens kept per text; longer texts are truncated.
      Passed explicitly so truncation does not depend on the tokenizer's own
      configured limit.

  Returns:
    A `datasets.Dataset` with 'input_ids', 'attention_mask' and 'label' columns.

  Raises:
    ValueError: If `data` and `label` have different lengths.
  """
  if len(data) != len(label):
    raise ValueError(
        f"data and label must have the same length, got {len(data)} and {len(label)}")

  # padding=True pads every text to the longest sequence in `data`.
  encoded_inputs = tokenizer(data, padding=True, truncation=True,
                             max_length=max_length, return_tensors='pt')

  dataset_dict = {
    'input_ids': encoded_inputs['input_ids'],
    'attention_mask': encoded_inputs['attention_mask'],
    'label': torch.tensor(label)}

  encoded_dataset = datasets.Dataset.from_dict(dataset_dict)

  return encoded_dataset


# Encode both splits with the same tokenizer.
train_data = roberta_encoder(tokenizer, X_train, y_train)
test_data = roberta_encoder(tokenizer, X_test, y_test)

# Compute Metrics

In [None]:
def compute_metrics(pred):
    """Compute evaluation metrics for the binary include/exclude classifier.

    Args:
        pred: Prediction object exposing `label_ids` (true labels) and
            `predictions` (per-class scores; argmax over the last axis gives the
            predicted class).

    Returns:
        Dict with overall 'accuracy', 'f1' of class 1, per-class precision and
        recall ('precision_0', 'recall_0', 'precision_1', 'recall_1') and
        'adhoc_1', a recall-weighted score for class 1 (0.8 * recall + 0.2 * precision).
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # One call yields per-class scores in the fixed order [class 0, class 1].
    # `labels=[0, 1]` keeps both entries present even if a class never occurs, and
    # `zero_division=0` reports 0.0 for undefined scores without emitting a warning
    # on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, labels=[0, 1], average=None, zero_division=0)

    precision_0, precision_1 = float(precision[0]), float(precision[1])
    recall_0, recall_1 = float(recall[0]), float(recall[1])

    # Recall of the "include" class matters most, hence the 80/20 weighting.
    ad_hoc_1 = recall_1 * .8 + precision_1 * .2
    acc = accuracy_score(labels, preds)
    return {
            'accuracy': acc,
            'f1': float(f1[1]),
            'precision_0': precision_0,
            'recall_0': recall_0,
            'precision_1': precision_1,
            'recall_1': recall_1,
            'adhoc_1': ad_hoc_1
            }

# Training Arguments

In [None]:
# Training configuration. Checkpoints are written to Google Drive once per epoch and
# only the most recent one is kept (save_total_limit=1).
training_args = TrainingArguments(
                                  output_dir = '/content/drive/MyDrive/Bio-med Roberta',
                                  overwrite_output_dir= True,
                                  num_train_epochs=10,
                                  per_device_train_batch_size = 15,
                                  # gradient_accumulation_steps = 16,
                                  per_device_eval_batch_size= 15,
                                  evaluation_strategy = "epoch",
                                  save_strategy = 'epoch',
                                  disable_tqdm = False,
                                  load_best_model_at_end=True,
                                  # Warm-up is expressed as a fraction of the run. The previous
                                  # fixed warmup_steps=500 was larger than the whole run (the
                                  # recorded run finished at global_step=30), so the learning
                                  # rate never left warm-up.
                                  warmup_ratio=0.1,
                                  weight_decay=0.01,
                                  logging_steps = 8,
                                  fp16 = False,
                                  report_to='none',
                                  save_total_limit=1,
                                  )


# NOTE(review): the test split is also the evaluation set, so the checkpoint chosen by
# load_best_model_at_end is selected on test data — consider a separate validation split.
trainer = Trainer(
                  model=model,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=train_data,
                  eval_dataset=test_data
                  )

# Not passed to the Trainer in this cell; kept for any later code that reads it.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Train model

In [None]:
# Run fine-tuning without resuming from a previously saved checkpoint.
trainer.train(resume_from_checkpoint=False)

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision 0,Recall 0,Precision 1,Recall 1,Adhoc 1
1,No log,0.757224,0.045735,0.08747,0.0,0.0,0.045735,1.0,0.809147
2,No log,0.75462,0.045735,0.08747,0.0,0.0,0.045735,1.0,0.809147
3,0.733200,0.750444,0.045735,0.08747,0.0,0.0,0.045735,1.0,0.809147


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision 0,Recall 0,Precision 1,Recall 1,Adhoc 1
1,No log,0.757224,0.045735,0.08747,0.0,0.0,0.045735,1.0,0.809147
2,No log,0.75462,0.045735,0.08747,0.0,0.0,0.045735,1.0,0.809147
3,0.733200,0.750444,0.045735,0.08747,0.0,0.0,0.045735,1.0,0.809147
4,0.733200,0.744388,0.045735,0.08747,0.0,0.0,0.045735,1.0,0.809147
5,0.733200,0.736561,0.045735,0.08747,0.0,0.0,0.045735,1.0,0.809147
6,0.726800,0.727256,0.045735,0.08747,0.0,0.0,0.045735,1.0,0.809147
7,0.726800,0.716445,0.045735,0.08747,0.0,0.0,0.045735,1.0,0.809147
8,0.705100,0.704008,0.065019,0.088675,0.9875,0.020466,0.046406,0.994595,0.804957
9,0.705100,0.689723,0.716934,0.041841,0.947282,0.744819,0.024752,0.135135,0.113059
10,0.705100,0.674526,0.942151,0.040984,0.954842,0.98601,0.084746,0.027027,0.038571


TrainOutput(global_step=30, training_loss=0.7115946451822917, metrics={'train_runtime': 1677.435, 'train_samples_per_second': 0.238, 'train_steps_per_second': 0.018, 'total_flos': 105244422144000.0, 'train_loss': 0.7115946451822917, 'epoch': 10.0})

In [None]:
# `Trainer` has no `eval()` method (that call raised AttributeError); `evaluate()`
# runs the configured eval_dataset and returns the metrics dict.
trainer.evaluate()

AttributeError: ignored

relevant=10
irrelevant=30

In [None]:
# Score the test split and print the per-class report for the arg-max predictions.
res = trainer.predict(test_data)
predicted_classes = np.argmax(res.predictions, axis=1)
print(classification_report(res.label_ids, predicted_classes))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      3860
           1       0.08      0.03      0.04       185

    accuracy                           0.94      4045
   macro avg       0.52      0.51      0.51      4045
weighted avg       0.92      0.94      0.93      4045



## Evaluation

In a confusion matrix, we can see how many samples of each category are classified correctly and how many are misclassified.

In [None]:
# Precision, recall and F-score of the positive class on the test split
# (the full per-class report is printed in the previous cell).
positive_class_scores = precision_recall_fscore_support(
    res.label_ids,
    np.argmax(res.predictions, axis=1),
    average='binary',
)
print(positive_class_scores)

(0.0847457627118644, 0.02702702702702703, 0.04098360655737705, None)
