In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
import re

In [2]:
# Static configuration: the text columns fed to the model, the label names,
# and the maximum token length used when encoding.
features = ['Title', 'abstractText', 'meshMajor']
# NOTE(review): presumably these 14 names match the CSV columns from position 6
# onward, in the same order -- confirm against the dataset header.
targets_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'Z']
MAX_LEN = 512

# Read the raw dataset from the working directory.
data = 'Multi-Label Text Classification Dataset.csv'
df = pd.read_csv(data)

# Collapse every column from position 6 onward into one per-row list stored
# under 'labels', then keep only the columns used downstream.
df['labels'] = df.iloc[:, 6:].values.tolist()
selected_columns = ['Title', 'abstractText', 'meshMajor', 'labels']
df = df[selected_columns]

# Training (70%), Testing (15%), Validation (15%): carve off 30% first, then
# halve that remainder. Both splits use the same fixed seed.
train_data, test_data = train_test_split(df, test_size=0.3, random_state=42)
test_data, val_data = train_test_split(test_data, test_size=0.5, random_state=42)

# Give each split a fresh 0..n-1 index.
train_data, test_data, val_data = (
    split.reset_index(drop=True)
    for split in (train_data, test_data, val_data)
)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
class modelArchitecture(nn.Module):
    """BERT encoder followed by dropout and a linear multi-label head.

    The submodule attribute names (``bert``, ``dropout``, ``layer``) are kept
    unchanged because a saved ``state_dict`` is keyed by them.

    Args:
        num_labels: Number of output logits (one per label). Defaults to 14.
        dropout_prob: Dropout probability applied to the pooled output.
        pretrained_name: Name or path passed to ``BertModel.from_pretrained``.
    """
    def __init__(self, num_labels=14, dropout_prob=0.3, pretrained_name='bert-base-uncased'):
        super(modelArchitecture, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_name, return_dict = True)
        self.dropout = nn.Dropout(dropout_prob)
        # Size the head from the encoder's config instead of hard-coding 768,
        # so a different checkpoint width does not silently mismatch.
        self.layer = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return raw (pre-sigmoid) logits for the given encoded inputs.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attn_mask: Attention mask tensor passed as ``attention_mask``.
            token_type_ids: Segment id tensor passed to the encoder.

        Returns:
            The linear head's output computed from the encoder's
            ``pooler_output`` after dropout.
        """
        output = self.bert(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        output = self.dropout(output.pooler_output)
        output = self.layer(output)
        return output

In [5]:
# Checkpoint location and target device (GPU when available, otherwise CPU).
model_path = 'model.pth'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the network, restore its weights (remapped onto `device`), and move
# the module itself to that device.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source.
model = modelArchitecture()
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)

# Switch to evaluation mode; as the cell's last expression this also displays
# the module structure.
model.eval()

modelArchitecture(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

In [6]:
def _model_device(model):
    """Return the device holding ``model``'s first parameter.

    Falls back to the notebook-level ``device`` when ``model`` exposes no
    parameters (for example, a plain callable).
    """
    parameters = getattr(model, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    return first_param.device if first_param is not None else device


def infer(x, tokenizer, model, max_len=None):
    """Predict the 0/1 label vector for a single record.

    The record's fields are flattened into one string of the form
    ``Title: <words> Abstract: <words> Terms: <terms>``, encoded with
    ``tokenizer``, and passed through ``model``. Sigmoid outputs are rounded
    to 0 or 1.

    Args:
        x: Mapping-like record (dict or pandas Series) with the keys
            'Title', 'abstractText' and 'meshMajor'.
        tokenizer: Object exposing ``encode_plus`` that returns 'input_ids',
            'attention_mask' and 'token_type_ids' tensors.
        model: Callable taking ``(input_ids, attention_mask, token_type_ids)``
            and returning logits.
        max_len: Maximum token length for truncation/padding. ``None`` (the
            default) uses the notebook-level ``MAX_LEN``.

    Returns:
        A list of integer 0/1 predictions, one per model output.
    """
    if max_len is None:
        max_len = MAX_LEN

    # Terms are taken as the single-quoted substrings of the field's string
    # form. Left unchanged so the input text matches what the model has seen;
    # NOTE(review): entries rendered with double quotes would be skipped.
    mesh_terms = re.findall(r"'(.*?)'", str(x['meshMajor']))
    wordList = [
        'Title:', *str(x['Title']).split(),
        'Abstract:', *str(x['abstractText']).split(),
        'Terms:', *mesh_terms,
    ]
    txt = " ".join(wordList)

    encodedText = tokenizer.encode_plus(
        txt,
        None,
        add_special_tokens=True,
        max_length=max_len,
        truncation = True,
        padding = "max_length",
        return_token_type_ids=True,
        return_tensors='pt'
    )

    # Send inputs to wherever the model actually lives, rather than trusting a
    # global that may have drifted from the model's placement.
    target_device = _model_device(model)
    ids = encodedText['input_ids'].to(target_device)
    mask = encodedText['attention_mask'].to(target_device)
    token_type_ids = encodedText["token_type_ids"].to(target_device)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(ids, mask, token_type_ids)
        probabilities = torch.sigmoid(logits).cpu()

    output = probabilities.flatten().round().numpy().astype(int)
    return list(output)

In [7]:
# Draw up to five test records. The fixed seed (same value as the split above)
# makes this cell print the same examples on every run, and clamping n avoids
# an error when the test split holds fewer than five rows.
random_samples = test_data.sample(
    n = min(5, len(test_data)),
    random_state = 42,
).reset_index(drop = True)

for idx, row in random_samples.iterrows():
    print(idx+1)
    print("Title:", row['Title'])
    print("Abstract:", row['abstractText'])
    print("Meshmajor:", row['meshMajor'])
    predictions = infer(row[features], tokenizer, model)
    labels = row['labels']
    # Keep the label names whose corresponding 0/1 flag is set.
    predicted_targets = [target for target, flag in zip(targets_list, predictions) if flag == 1]
    actual_targets = [target for target, flag in zip(targets_list, labels) if flag == 1]
    print("Predicted Labels:", predicted_targets)
    print("Actual Labels:", actual_targets)

1
Title: Hydroxychloroquine in steroid dependent asthma.
Abstract: A recent case report suggested that hydroxychloroquine had a steroid sparing effect in a patient with severe chronic asthma. We have studied the effect of hydroxychloroquine in a group of nine steroid dependent adult asthmatic patients using a randomised double blind crossover comparison of hydroxychloroquine and placebo. Each patient received hydroxychloroquine (400 mg/day) or placebo for 2 month periods. The effect of hydroxychloroquine or placebo on asthma control was assessed by change in steroid dosage, visual analogue symptom scores, response to beta 2 agonist and peak expiratory flow rate (PFR) measurement. The dose of prednisolone required during hydroxychloroquine treatment did not differ from that during placebo treatment or in pre-trial period. There was no significant change in symptom scores of PFR measurement. In this study an 8 week treatment with hydroxychloroquine was of no benefit to patients with chro