In [1]:
import numpy as np
import pandas as pd
from simpletransformers.classification import MultiLabelClassificationModel, MultiLabelClassificationArgs

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

In [3]:
train_data = pd.read_csv('BC7-LitCovid-Train.csv')

In [4]:
train_data.head()

Unnamed: 0,pmid,journal,title,abstract,keywords,pub_type,authors,doi,label
0,32519164,J Thromb Thrombolysis,Potential role for tissue factor in the pathog...,"In December 2019, a new and highly contagious ...",covid-19;il-6;sars-cov-2;tnf-alpha;thrombosis;...,Journal Article;Review,"Bautista-Vargas, Mario;Bonilla-Abadia, Fabio;C...",10.1007/s11239-020-02172-x,Treatment;Mechanism
1,32691006,J Tradit Complement Med,Dietary therapy and herbal medicine for COVID-...,"A novel coronavirus disease (COVID-19), transm...",covid-19;coronavirus;dietary therapy;herbal me...,Journal Article;Review,"Panyod, Suraphan;Ho, Chi-Tang;Sheen, Lee-Yan",10.1016/j.jtcme.2020.05.004,Treatment;Prevention
2,32858315,J Affect Disord,First report of manic-like symptoms in a COVID...,"BACKGROUND: In December 2019, the novel corona...",cerebrospinal fluid;igg;manic-like symptoms;sa...,Case Reports;Journal Article,"Lu, Shaojia;Wei, Ning;Jiang, Jiajun;Wu, Lingli...",10.1016/j.jad.2020.08.031,Case Report
3,32985329,J Dent Res,Epidemiological Investigation of OHCWs with CO...,During the coronavirus disease 2019 (COVID-19)...,dental education;dental public health;infectio...,"Journal Article;Research Support, Non-U.S. Gov't","Meng, L;Ma, B;Cheng, Y;Bian, Z",10.1177/0022034520962087,Prevention
4,32812051,J Antimicrob Chemother,The impact of sofosbuvir/daclatasvir or ribavi...,OBJECTIVES: Sofosbuvir and daclatasvir are dir...,,Journal Article;Randomized Controlled Trial;Re...,"Eslami, Gholamali;Mousaviasl, Sajedeh;Radmanes...",10.1093/jac/dkaa331,Treatment


In [5]:
train_data.shape

(24960, 9)

In [6]:
processed_train_data = pd.DataFrame()
def split_semicolon(string):
    return str(string).split(';')

processed_train_data['labels']=train_data['label'].apply(split_semicolon)
processed_train_data['text']=train_data['abstract']

In [7]:
processed_train_data.head()

Unnamed: 0,labels,text
0,"[Treatment, Mechanism]","In December 2019, a new and highly contagious ..."
1,"[Treatment, Prevention]","A novel coronavirus disease (COVID-19), transm..."
2,[Case Report],"BACKGROUND: In December 2019, the novel corona..."
3,[Prevention],During the coronavirus disease 2019 (COVID-19)...
4,[Treatment],OBJECTIVES: Sofosbuvir and daclatasvir are dir...


In [8]:
label_mlb = MultiLabelBinarizer()
label_mle = label_mlb.fit_transform(processed_train_data['labels'])
print(label_mle.shape)
print(label_mlb.classes_)

(24960, 7)
['Case Report' 'Diagnosis' 'Epidemic Forecasting' 'Mechanism' 'Prevention'
 'Transmission' 'Treatment']


In [9]:
label_mle.sum(axis=0)

array([ 2063,  6193,   645,  4438, 11102,  1088,  8717])

In [10]:
processed_train_data['labels'] = label_mle.tolist()
processed_train_data.head()

Unnamed: 0,labels,text
0,"[0, 0, 0, 1, 0, 0, 1]","In December 2019, a new and highly contagious ..."
1,"[0, 0, 0, 0, 1, 0, 1]","A novel coronavirus disease (COVID-19), transm..."
2,"[1, 0, 0, 0, 0, 0, 0]","BACKGROUND: In December 2019, the novel corona..."
3,"[0, 0, 0, 0, 1, 0, 0]",During the coronavirus disease 2019 (COVID-19)...
4,"[0, 0, 0, 0, 0, 0, 1]",OBJECTIVES: Sofosbuvir and daclatasvir are dir...


In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(processed_train_data['text'], processed_train_data['labels'],test_size=1000)

In [12]:
train = pd.DataFrame()
train['text'] = X_train
train['labels'] = Y_train

test = pd.DataFrame()
test['text'] = X_test
test['labels'] = Y_test

train.head()

Unnamed: 0,text,labels
4210,"Recently, 6 percent of COVID-19 patients requi...","[0, 0, 0, 0, 1, 0, 0]"
17493,The COVID-19 pandemic has caused many Veterans...,"[0, 0, 0, 0, 1, 0, 0]"
19075,We sought to provide a clinical practice proto...,"[0, 0, 0, 0, 1, 0, 0]"
1608,Severe acute respiratory syndrome coronavirus ...,"[0, 1, 0, 1, 1, 1, 1]"
9403,This study aimed to develop risk scores based ...,"[0, 1, 0, 0, 0, 0, 1]"


In [13]:
# train.to_csv('train_data.csv')
# test.to_csv('test_data.csv')

In [14]:
from sklearn.metrics import accuracy_score

In [15]:
def weighted_f1(labels, preds, threshold=0.5):
  """ Converts probabilities to labels using the [threshold] and calculates metrics. 
  Parameters ---------- labels preds threshold 
  Returns ------- """ 
  preds[preds > threshold] = 1
  preds[preds <= threshold] = 0 

  scores = f1_score(labels, preds, average='weighted') 

  #print("Scores: " ,scores)
  return scores

In [23]:
model_args = MultiLabelClassificationArgs(
    num_train_epochs=40,
    evaluate_each_epoch=True,
    overwrite_output_dir= True,
    # evaluate_during_training=True,
    save_model_every_epoch = True
)

In [24]:
model = MultiLabelClassificationModel(
    "bert",
    "bert-base-uncased",
    num_labels=7,
    use_cuda=True,
    args=model_args
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultiLabelSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultiLabelSequenceClassification were not 

In [25]:
# temp_train = train.iloc[:10,:]
# temp_test = test.iloc[:10,:]

In [26]:
model.train_model(train, eval_df = test)

  0%|          | 0/23960 [00:00<?, ?it/s]

Epoch:   0%|          | 0/40 [00:00<?, ?it/s]

Running Epoch 0 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 1 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm


Running Epoch 2 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 3 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 4 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 5 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 6 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 7 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 8 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 9 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 10 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 11 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 12 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 13 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 14 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 15 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 16 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 17 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 18 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 19 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 20 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 21 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 22 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 23 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 24 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 25 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 26 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 27 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 28 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 29 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 30 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 31 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 32 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 33 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 34 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 35 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 36 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 37 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 38 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 39 of 40:   0%|          | 0/2995 [00:00<?, ?it/s]

(119800, 0.033227868420262666)

In [27]:
model.save_model('Baseline_Model')

In [28]:
result, model_outputs, wrong_predictions = model.eval_model(test,metrics=weighted_f1)

  0%|          | 0/1000 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/125 [00:00<?, ?it/s]

In [29]:
result

{'LRAP': 0.8393779761904755,
 'metrics': 0.8284641065786332,
 'eval_loss': 0.5571619349184767}