In [None]:
!pip3 install simpletransformers 
!pip3 install wandb

In [None]:
# Importing essential Libraries
import numpy as np
import pandas as pd

from simpletransformers.classification import MultiLabelClassificationModel, MultiLabelClassificationArgs
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

### Helper functions


In [None]:
def split(string):
    """Break a ';'-delimited label field into a list of label names.

    The argument is coerced with ``str`` first, so non-string values are
    split on their string representation.
    """
    label_field = str(string)
    return label_field.split(';')


def preprocess_data(df, val_size=0.5, is_test_data=False, classes=None, random_state=None):
    """Build the frame of model inputs: combined text plus one-hot labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``title``, ``abstract`` and ``label``.
        ``label`` holds the ';'-separated label names of each row.
    val_size : float or int, default 0.5
        Forwarded to ``train_test_split`` as ``test_size``. Ignored when
        ``is_test_data`` is true.
    is_test_data : bool, default False
        When true, no split is performed and the whole processed frame is
        returned.
    classes : sequence of str, optional
        Fixed label order for the one-hot columns, forwarded to
        ``MultiLabelBinarizer(classes=...)``. Pass the classes printed for
        the training data when processing another file so both share the
        same column order. ``None`` (default) derives the classes from
        ``df`` itself, as before.
    random_state : int, optional
        Seed forwarded to ``train_test_split`` for a reproducible split.
        ``None`` (default) keeps the split unseeded, as before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame) or pandas.DataFrame
        ``(train, val)`` when ``is_test_data`` is false, otherwise the single
        processed frame. Each frame has the columns ``labels`` (a list of
        0/1 ints per row) and ``text``.
    """
    # A missing title/abstract is read by pandas as NaN (a float); adding it
    # to a str raises TypeError, so blanks are substituted before joining.
    titles = df['title'].fillna('').astype(str)
    abstracts = df['abstract'].fillna('').astype(str)

    processed_data = pd.DataFrame(index=df.index)
    processed_data['labels'] = df['label'].apply(split)
    # Column-wise concatenation of title and abstract around a [SEP] marker
    processed_data['text'] = titles + ' [SEP] ' + abstracts

    # Converting labels to One-Hot Encoded list
    label_mlb = MultiLabelBinarizer(classes=classes)
    label_mle = label_mlb.fit_transform(processed_data['labels'])
    print("Label classes:",label_mlb.classes_)
    processed_data['labels'] = label_mle.tolist()

    if is_test_data:
        return processed_data

    # Splitting the data into training and validation set
    train, val = train_test_split(
        processed_data, test_size=val_size, random_state=random_state
    )
    return train, val

In [None]:
def weighted_f1(labels, preds, threshold=0.5):
    """Weighted F1 score of thresholded multi-label predictions.

    Parameters
    ----------
    labels : array-like
        Ground-truth 0/1 label indicators.
    preds : array-like
        Predicted scores; an entry strictly greater than ``threshold``
        counts as a positive label, anything else as negative.
    threshold : float, default 0.5
        Decision cut-off applied to ``preds``.

    Returns
    -------
    score
        The value of ``f1_score(labels, binarised_preds, average='weighted')``.
    """
    # Threshold into a NEW array. Writing the 0/1 values back into `preds`
    # would overwrite the caller's scores, which it may still need for
    # other metrics computed from the same array.
    binary_preds = (np.asarray(preds) > threshold).astype(int)

    return f1_score(labels, binary_preds, average='weighted')

### Preprocessing Train Data

In [None]:
# Mount Google Drive at /content/drive; the data and model paths used in the
# cells below all live under this mount point.
# force_remount=True is passed so the mount is redone even if one already
# exists — presumably to pick up a fresh session; confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [None]:
# Read the training CSV from Drive and turn it into text / one-hot-label
# frames, split into a training part and a validation part.
# val_size=1000 is forwarded to train_test_split as test_size; an integer
# test_size is an absolute number of rows held out for validation.
train_data = pd.read_csv('/content/drive/MyDrive/Information Retrieval/BC7-LitCovid-Train.csv')
train, val = preprocess_data(train_data,val_size=1000)

### Initializing model

In [None]:
# Training configuration handed to the Simple Transformers model:
# 40 epochs, a checkpoint saved after every epoch, and all outputs written
# to (and allowed to overwrite) the Drive folder below.
# NOTE(review): evaluate_each_epoch presumably only takes effect when
# evaluate_during_training is also enabled, which is not set here — confirm
# against the Simple Transformers docs.
model_args = MultiLabelClassificationArgs(
    num_train_epochs=40,
    evaluate_each_epoch = True,
    overwrite_output_dir= True,
    output_dir='/content/drive/MyDrive/Information Retrieval/output',
    save_model_every_epoch = True
)

In [None]:
# Build a BERT multi-label classifier starting from the pretrained
# `bert-base-uncased` weights, configured with `model_args`.
# num_labels=7 has to equal the number of one-hot label columns produced by
# preprocess_data (see its "Label classes:" printout) — TODO confirm it is 7
# for this dataset.
# use_cuda=True presumably requires a GPU runtime — verify before running.
model = MultiLabelClassificationModel(
    "bert",
    "bert-base-uncased",
    num_labels=7,
    use_cuda=True,
    args=model_args,
)

### Training Model

In [None]:
# Train on the training split and validate on the held-out `val` split
# using weighted F1.
# Evaluation during training needs both pieces supplied here: the
# validation frame (`eval_df`) and the `evaluate_during_training` option,
# which is switched on through the per-call `args` override. The extra
# keyword `f1=weighted_f1` registers weighted_f1 as an additional metric
# reported for each evaluation.
model.train_model(
    train,
    eval_df=val,
    args={"evaluate_during_training": True},
    f1=weighted_f1,
)

  0%|          | 0/23960 [00:00<?, ?it/s]

Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

Running Epoch 0 of 30:   0%|          | 0/2995 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm


Running Epoch 1 of 30:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 2 of 30:   0%|          | 0/2995 [00:00<?, ?it/s]

Running Epoch 3 of 30:   0%|          | 0/2995 [00:00<?, ?it/s]

### Preprocessing Test Data

In [None]:
# Read the Dev CSV from Drive and preprocess it as test data: with
# is_test_data=True the whole processed frame is returned, without a split.
# NOTE(review): preprocess_data fits a new label binarizer on this file, so
# the one-hot column order matches the training data only if both files
# contain the same set of labels — check the printed "Label classes:".
test_data = pd.read_csv('/content/drive/MyDrive/Information Retrieval/BC7-LitCovid-Dev.csv')
test = preprocess_data(test_data,is_test_data=True)

### Predictions and Evaluations on test data

In [None]:
# Load the trained model if needed.
# This rebinds `model` to the weights stored in the Drive folder below
# (instead of `bert-base-uncased`), so every later cell uses the saved model
# rather than the one trained in this session.
model = MultiLabelClassificationModel(
    "bert",
    "/content/drive/MyDrive/Information Retrieval/GCP model + extras/Baseline model",
    num_labels=7,
    use_cuda=True,
    args=model_args,  # Model args are the same as training args
)

In [None]:
# Evaluate the model on the preprocessed test frame; weighted_f1 is passed
# as an extra metric and shows up under the key 'f1' in `result`.
result, model_outputs, wrong_predictions = model.eval_model(test,f1=weighted_f1)

  0%|          | 0/6239 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/780 [00:00<?, ?it/s]

In [None]:
# Display the metrics dictionary returned by eval_model.
result

{'LRAP': 0.8403232864444642,
 'eval_loss': 0.5424484101206546,
 'f1': 0.8324095895507378}

Results when the title is concatenated with the abstract (`title [SEP] abstract`):<br>
```
{'LRAP': 0.8508616752786522,
 'f1': 0.841965364434017,
 'eval_loss': 0.5241743215936863}
```



### Further Analysis

In [None]:
# Getting outputs and raw predictions for the presentation:
# run the model on one example abstract; `predictions` receives the
# predicted label vector(s) and `raw_outputs` the per-label scores, both
# displayed in the following cells.
# NOTE(review): this input is the abstract only, whereas preprocess_data
# builds the text as `title [SEP] abstract` — confirm this matches how the
# loaded model was trained.
predictions, raw_outputs = model.predict(['The current coronavirus pandemic is an ongoing global health crisis due to COVID-19, caused by severe acute respiratory syndrome coronavirus 2. Although COVID-19 leads to little or mild flu-like symptoms in the majority of affected patients, the disease may cause severe, frequently lethal complications such as progressive pneumonia, acute respiratory distress syndrome and organ failure driven by hyperinflammation and a cytokine storm syndrome. This situation causes various major challenges for gastroenterology. In the context of IBD, several key questions arise. For instance, it is an important question to understand whether patients with IBD (eg, due to intestinal ACE2 expression) might be particularly susceptible to COVID-19 and the cytokine release syndrome associated with lung injury and fatal outcomes. Another highly relevant question is how to deal with immunosuppression and immunomodulation during the current pandemic in patients with IBD and whether immunosuppression affects the progress of COVID-19. Here, the current understanding of the pathophysiology of COVID-19 is reviewed with special reference to immune cell activation. Moreover, the potential implications of these new insights for immunomodulation and biological therapy in IBD are discussed.'])

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Display the predicted 0/1 label vector for the example abstract.
predictions

[[0, 0, 0, 0, 0, 0, 1]]

In [None]:
# Display the raw per-label scores behind the prediction above.
raw_outputs

array([[4.03523445e-05, 7.65919685e-05, 7.74860382e-07, 5.51939011e-05,
        9.91821289e-05, 3.51667404e-06, 1.00000000e+00]])

### References

Simple Transformers Docs: https://simpletransformers.ai/docs/multi-label-classification/