In [1]:
import numpy as np
import pandas as pd
from simpletransformers.classification import MultiLabelClassificationModel, MultiLabelClassificationArgs

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

In [3]:
# Read the LitCovid training CSV from the current working directory.
# Earlier Colab/Drive location of the same file:
#   '/content/drive/MyDrive/Information Retrieval/BC7-LitCovid-Train.csv'
train_csv_path = 'BC7-LitCovid-Train.csv'
train_data = pd.read_csv(train_csv_path)

In [4]:
processed_train_data = pd.DataFrame()


def split(string):
    """Split a ';'-delimited label string into a list of label names.

    Parameters
    ----------
    string : str or missing value
        Raw label cell, e.g. ``"Treatment;Mechanism"``.

    Returns
    -------
    list of str
        Label names with surrounding whitespace removed. Missing cells
        (``None`` / NaN) and empty fragments produce no labels, so they can
        never turn into phantom classes such as ``'nan'`` or ``''``.
    """
    # NaN is the only float that is not equal to itself.
    if string is None or (isinstance(string, float) and string != string):
        return []
    return [part.strip() for part in str(string).split(';') if part.strip()]

# Populate the working frame from the raw data: parsed label lists first,
# then the abstract text (column order: labels, text).
processed_train_data = processed_train_data.assign(
    labels=train_data['label'].apply(split),
    text=train_data['abstract'],
)



In [5]:
# Preview the first five rows: label lists alongside the abstract text.
processed_train_data.head(n=5)

Unnamed: 0,labels,text
0,"[Treatment, Mechanism]","In December 2019, a new and highly contagious ..."
1,"[Treatment, Prevention]","A novel coronavirus disease (COVID-19), transm..."
2,[Case Report],"BACKGROUND: In December 2019, the novel corona..."
3,[Prevention],During the coronavirus disease 2019 (COVID-19)...
4,[Treatment],OBJECTIVES: Sofosbuvir and daclatasvir are dir...


In [6]:
# Fit the binarizer on label lists parsed straight from the raw `label`
# column instead of processed_train_data['labels']. That column is replaced
# by the binary vectors further down, so fitting on it made this cell learn
# 0/1 as the "classes" whenever it was re-run after that replacement.
label_mlb = MultiLabelBinarizer()
label_mle = label_mlb.fit_transform(train_data['label'].apply(split))
print(label_mle.shape)
print(label_mlb.classes_)

(24960, 7)
['Case Report' 'Diagnosis' 'Epidemic Forecasting' 'Mechanism' 'Prevention'
 'Transmission' 'Treatment']


In [7]:
# Swap the label-name lists for their binary indicator rows, stored as plain
# Python lists (one list per abstract).
processed_train_data['labels'] = [indicator_row.tolist() for indicator_row in label_mle]

In [8]:
# Preview the first five rows again, now with binary label vectors.
processed_train_data.head(n=5)

Unnamed: 0,labels,text
0,"[0, 0, 0, 1, 0, 0, 1]","In December 2019, a new and highly contagious ..."
1,"[0, 0, 0, 0, 1, 0, 1]","A novel coronavirus disease (COVID-19), transm..."
2,"[1, 0, 0, 0, 0, 0, 0]","BACKGROUND: In December 2019, the novel corona..."
3,"[0, 0, 0, 0, 1, 0, 0]",During the coronavirus disease 2019 (COVID-19)...
4,"[0, 0, 0, 0, 0, 0, 1]",OBJECTIVES: Sofosbuvir and daclatasvir are dir...


In [9]:
# Hold out 1000 abstracts for evaluation. The fixed random_state keeps the
# split identical across kernel restarts, so scores from different runs are
# computed on the same held-out rows.
X_train, X_test, Y_train, Y_test = train_test_split(
    processed_train_data['text'],
    processed_train_data['labels'],
    test_size=1000,
    random_state=42,
)

In [10]:
# Inspect the held-out abstracts (the long Series is shown truncated).
X_test


7224     Objective: To study the early dynamics of the ...
11111    The coronavirus disease-19 (COVID-19) global p...
5201     The novel Coronavirus (CoVid-19) outbreak is n...
16642    COVID-19 has led to disruption in routine immu...
7388     The coronavirus SARS-CoV-2 was identified as t...
                               ...                        
5202     COVID-19 emerged in late 2019 and has rapidly ...
20200    PURPOSE: To study propensity of aerosol and dr...
24773    Several months into the ongoing novel coronavi...
7684     The clinical and laboratory features of COVID-...
8439     The clinical spectrum of COVID-19 is still not...
Name: text, Length: 1000, dtype: object

In [11]:
# Re-assemble the split Series into two-column frames (text, labels); the
# original row index of each split is carried over.
train = pd.DataFrame({'text': X_train, 'labels': Y_train})
test = pd.DataFrame({'text': X_test, 'labels': Y_test})

In [12]:
# First five rows of the training frame.
train.head(n=5)

Unnamed: 0,text,labels
23481,The COVID-19 pandemic exacerbates existing hea...,"[0, 0, 0, 0, 1, 0, 0]"
19195,The COVID-19 pandemic will present a range of ...,"[0, 0, 0, 0, 1, 0, 0]"
3055,The World Health Organization has declared nov...,"[1, 0, 0, 0, 0, 0, 0]"
6063,Identifying drugs effective in the new coronav...,"[0, 0, 0, 1, 0, 0, 1]"
16755,We present a three-dimensional (3D) approach f...,"[0, 1, 0, 0, 0, 0, 0]"


In [13]:
# First five rows of the held-out frame.
test.head(n=5)

Unnamed: 0,text,labels
7224,Objective: To study the early dynamics of the ...,"[0, 0, 1, 0, 0, 0, 0]"
11111,The coronavirus disease-19 (COVID-19) global p...,"[0, 0, 0, 0, 1, 0, 0]"
5201,The novel Coronavirus (CoVid-19) outbreak is n...,"[0, 0, 0, 0, 1, 0, 0]"
16642,COVID-19 has led to disruption in routine immu...,"[0, 0, 0, 0, 1, 0, 0]"
7388,The coronavirus SARS-CoV-2 was identified as t...,"[0, 1, 0, 1, 0, 0, 1]"


In [14]:
from sklearn.metrics import accuracy_score

In [15]:
def weighted_f1(labels, preds, threshold=0.5):
    """Compute the weighted F1 score of thresholded multi-label predictions.

    Scores strictly greater than ``threshold`` become 1 and all others become
    0. The thresholding is done on a new array, so the caller's ``preds`` is
    left untouched (the previous version overwrote it in place).

    Parameters
    ----------
    labels : array-like
        Ground-truth binary indicators, passed unchanged to ``f1_score``.
    preds : array-like
        Predicted scores laid out like ``labels``; lists are accepted.
    threshold : float, default 0.5
        Cut-off above which a score counts as a positive prediction.

    Returns
    -------
    The value of ``f1_score(labels, binarized_preds, average='weighted')``.
    The score is also printed.
    """
    # Binarize without mutating the input array.
    binary_preds = (np.asarray(preds) > threshold).astype(int)

    scores = f1_score(labels, binary_preds, average='weighted')
    print(scores)
    return scores

In [16]:
# Configuration for the first run: 3 epochs, batch size 4, evaluation enabled
# during training and at each epoch, existing output directory overwritten.
model_args = MultiLabelClassificationArgs(
    num_train_epochs=3,
    train_batch_size=4,
    overwrite_output_dir=True,
    evaluate_during_training=True,
    evaluate_each_epoch=True,
)

In [17]:
# BERT-base (uncased) with a 7-way multi-label head, one output per label
# class; use_cuda=False for this run.
model = MultiLabelClassificationModel(
    model_type="bert",
    model_name="bert-base-uncased",
    num_labels=7,
    args=model_args,
    use_cuda=False,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultiLabelSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultiLabelSequenceClassification were not 

In [19]:
# Fine-tune on the first 2000 training rows. Only the first five held-out
# rows (test.head()) are used for evaluation during training, scored with
# weighted_f1 under the metric name "f1".
model.train_model(
    train_df=train.head(2000),
    eval_df=test.head(),
    f1=weighted_f1,
)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [21]:
# Evaluate on the complete held-out frame and unpack the three returned values.
eval_outputs = model.eval_model(test, f1=weighted_f1)
result, model_outputs, wrong_predictions = eval_outputs

  0%|          | 2/1000 [00:04<41:06,  2.47s/it]  
Running Evaluation: 100%|██████████| 125/125 [01:09<00:00,  1.80it/s]


In [16]:
# Second configuration, written to its own output directory and created with
# use_cuda=True.
# NOTE(review): evaluate_during_training is left disabled here;
# evaluate_each_epoch presumably only matters when it is enabled. Confirm
# against the simpletransformers docs.
# NOTE(review): the directory name says "100epoch" but num_train_epochs is 3.
model_args_full = MultiLabelClassificationArgs(
    output_dir='all_100epoch',
    overwrite_output_dir=True,
    num_train_epochs=3,
    evaluate_each_epoch=True,
)

model_full = MultiLabelClassificationModel(
    model_type="bert",
    model_name="bert-base-uncased",
    num_labels=7,
    args=model_args_full,
    use_cuda=True,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultiLabelSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultiLabelSequenceClassification were not 

In [17]:
# Train the second model on the first 2000 training rows, passing the whole
# held-out frame as eval_df and weighted_f1 as the "f1" metric.
model_full.train_model(
    train_df=train.head(2000),
    eval_df=test,
    f1=weighted_f1,
)

  0%|          | 1/2000 [00:04<2:45:31,  4.97s/it]

In [25]:
# Third value returned by the earlier model.eval_model(...) call.
# NOTE(review): presumably the misclassified examples. Confirm the exact
# format against the simpletransformers documentation.
wrong_predictions

[[[0, 0, 0, 0, 1, 0, 0],
  [0, 1, 0, 0, 0, 0, 1],
  [1, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 1, 0, 0],
  [0, 1, 0, 0, 0, 0, 0]]]

In [None]:
# Rebuild `model` from the weights stored in the 'outputs' directory instead
# of the pretrained checkpoint.
# NOTE(review): presumably 'outputs' is where the first run saved its model,
# since model_args sets no output_dir. Confirm.
model = MultiLabelClassificationModel(
    model_type="bert",
    model_name='outputs',
    num_labels=7,
    args=model_args,
    use_cuda=False,
)