In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset/Multi-Label Text Classification Dataset.csv


In [2]:
import ast
import re

import torch
import torch.nn as nn
import tqdm.notebook as tq
import transformers
from sklearn import metrics
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel

# DATA LOADING AND PREPROCESSING

In [3]:
# Path of the dataset CSV inside the Kaggle input directory.
data = '/kaggle/input/dataset/Multi-Label Text Classification Dataset.csv'
# Read the whole CSV into a DataFrame.
df = pd.read_csv(data)
# Last expression of the cell: shows the first five rows as the cell output.
df.head()

Unnamed: 0,Title,abstractText,meshMajor,pmid,meshid,meshroot,A,B,C,D,E,F,G,H,I,J,L,M,N,Z
0,Expression of p53 and coexistence of HPV in pr...,Fifty-four paraffin embedded tissue sections f...,"['DNA Probes, HPV', 'DNA, Viral', 'Female', 'H...",8549602,"[['D13.444.600.223.555', 'D27.505.259.750.600....","['Chemicals and Drugs [D]', 'Organisms [B]', '...",0,1,1,1,1,0,0,1,0,0,0,0,0,0
1,Vitamin D status in pregnant Indian women acro...,The present cross-sectional study was conducte...,"['Adult', 'Alkaline Phosphatase', 'Breast Feed...",21736816,"[['M01.060.116'], ['D08.811.277.352.650.035'],...","['Named Groups [M]', 'Chemicals and Drugs [D]'...",0,1,1,1,1,1,1,0,1,1,0,1,1,1
2,[Identification of a functionally important di...,The occurrence of individual amino acids and d...,"['Amino Acid Sequence', 'Analgesics, Opioid', ...",19060934,"[['G02.111.570.060', 'L01.453.245.667.060'], [...","['Phenomena and Processes [G]', 'Information S...",1,1,0,1,1,0,1,0,0,0,1,0,0,0
3,Multilayer capsules: a promising microencapsul...,"In 1980, Lim and Sun introduced a microcapsule...","['Acrylic Resins', 'Alginates', 'Animals', 'Bi...",11426874,"[['D05.750.716.822.111', 'D25.720.716.822.111'...","['Chemicals and Drugs [D]', 'Technology, Indus...",1,1,1,1,1,0,1,0,0,1,0,0,0,0
4,"Nanohydrogel with N,N'-bis(acryloyl)cystine cr...",Substantially improved hydrogel particles base...,"['Antineoplastic Agents', 'Cell Proliferation'...",28323099,"[['D27.505.954.248'], ['G04.161.750', 'G07.345...","['Chemicals and Drugs [D]', 'Phenomena and Pro...",1,1,0,1,1,0,1,0,0,1,0,0,0,0


In [4]:
from sklearn.model_selection import train_test_split

# The 14 one-hot label columns, named explicitly. A positional slice such as
# `df.columns[6:]` silently selects the wrong columns if the CSV layout changes,
# and yields empty label lists if this cell is re-run after `df` has already been
# reduced to the four selected columns below.
LABEL_COLUMNS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'Z']
missing_label_columns = [col for col in LABEL_COLUMNS if col not in df.columns]
if missing_label_columns:
    raise KeyError(
        f"df is missing label columns {missing_label_columns}; "
        "re-run the data loading cell before this one."
    )

# One multi-hot list per row, ordered like LABEL_COLUMNS.
df['labels'] = df[LABEL_COLUMNS].values.tolist()
selected_columns = ['Title', 'abstractText', 'meshMajor', 'labels']
df = df[selected_columns]

# Training (70%), Testing (15%), Validation (15%):
# first hold out 30% of the rows, then split that hold-out in half.
train_data, holdout_data = train_test_split(df, test_size=0.3, random_state=42)
test_data, val_data = train_test_split(holdout_data, test_size=0.5, random_state=42)

# Give every split a fresh 0..n-1 index.
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

In [5]:
# Token length every example is padded / truncated to (passed to the dataset as `maxlen`).
MAX_LEN = 512
# Batch sizes used by the train / validation / test data loaders.
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 8
# Number of passes over the training loader.
EPOCHS = 3
# Learning rate handed to the optimizer.
LEARNING_RATE = 1e-05
# Tokenizer matching the 'bert-base-uncased' checkpoint that the model loads.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [6]:
class myDataset(Dataset):
    """Torch dataset that turns one dataframe row into tokenized inputs plus labels.

    Each item concatenates the row's title, abstract and MeSH terms into a single
    string of the form ``"Title: ... Abstract: ... Terms: ..."``, tokenizes it to a
    fixed length and pairs it with the row's multi-hot label list.

    NOTE(review): the MeSH terms are fed to the model as input text. If the labels
    are derived from those same terms this leaks the target into the features --
    confirm that this is intended.

    Args:
        dataframe: frame with ``Title``, ``abstractText``, ``meshMajor`` and
            ``labels`` columns. Rows are addressed by position, so the frame's
            index does not need to be 0..n-1.
        tokenizer: object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        maxlen: length every token sequence is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, maxlen):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.targets = dataframe.labels
        self.max_len = maxlen

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.data)

    @staticmethod
    def _parse_mesh_terms(mesh_text):
        """Extract the individual terms from a stringified list of MeSH terms.

        The text is parsed as a Python literal first, so terms containing an
        apostrophe (which ``repr`` wraps in double quotes) are kept intact. When
        the text is not a list/tuple literal, fall back to collecting every
        single-quoted substring.
        """
        try:
            parsed = ast.literal_eval(mesh_text)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return [str(term) for term in parsed]
        return re.findall(r"'(.*?)'", mesh_text)

    def __getitem__(self, index):
        """Build the model inputs for the row at position ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors taken
            from the tokenizer output), ``targets`` (float tensor of the row's
            labels) and ``combinedText`` (the string that was tokenized).
        """
        # Positional (.iloc) access: a plain `series[index]` is a label lookup and
        # breaks, or returns the wrong row, when the index is not 0..n-1.
        title_text = str(self.data.Title.iloc[index])
        abstract_text = str(self.data.abstractText.iloc[index])
        mesh_text = str(self.data.meshMajor.iloc[index])

        wordList = ['Title:']
        wordList.extend(title_text.split())
        wordList.append('Abstract:')
        wordList.extend(abstract_text.split())
        wordList.append('Terms:')
        wordList.extend(self._parse_mesh_terms(mesh_text))

        x = " ".join(wordList)

        inputs = self.tokenizer.encode_plus(
            x,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_token_type_ids=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float),
            'combinedText': x
        }

In [7]:
# Wrap each split in a dataset that tokenizes rows on access.
train_dataset = myDataset(train_data, tokenizer, MAX_LEN)
test_dataset = myDataset(test_data, tokenizer, MAX_LEN)
val_dataset = myDataset(val_data, tokenizer, MAX_LEN)

# Only the training loader shuffles; validation and test batches keep row order.
train_data_loader = DataLoader(
    dataset=train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)
val_data_loader = DataLoader(
    dataset=val_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)
test_data_loader = DataLoader(
    dataset=test_dataset,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

# MODEL ARCHITECTURE

In [8]:
class modelArchitecture(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear multi-label head.

    The head emits raw logits (no sigmoid), one per label.

    Args:
        n_labels: number of output logits. Defaults to 14.
        dropout: dropout probability applied to the pooled encoder output.
            Defaults to 0.3.
        pretrained_name: checkpoint name passed to ``BertModel.from_pretrained``.
            Defaults to 'bert-base-uncased'.
    """

    def __init__(self, n_labels=14, dropout=0.3, pretrained_name='bert-base-uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name, return_dict=True)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's own config rather than hard-coding 768,
        # so a checkpoint with a different hidden size still yields a matching layer.
        self.layer = nn.Linear(self.bert.config.hidden_size, n_labels)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return the logits for a batch of tokenized inputs.

        Args:
            input_ids: token ids forwarded to the encoder.
            attn_mask: attention mask forwarded to the encoder.
            token_type_ids: segment ids forwarded to the encoder.
        """
        output = self.bert(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        # Classify from the encoder's pooled output.
        output = self.dropout(output.pooler_output)
        output = self.layer(output)
        return output

In [9]:
# Instantiate the classifier with its default configuration.
model = modelArchitecture()

# Uncomment the two lines below to stop gradient updates to the BERT encoder:
#for param in model.bert.parameters():
#    param.requires_grad = False

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

modelArchitecture(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

In [10]:
# Multi-label loss computed directly on the model's raw logits.
criterion = nn.BCEWithLogitsLoss()
# Adam over all of the model's parameters, using LEARNING_RATE as the learning rate.
optimizer = torch.optim.Adam(model.parameters(), LEARNING_RATE)

# MODEL TRAINING

In [11]:
def trainModel(train_dataloader, model, optimizer, criterion):
    """Run one pass over the training loader, updating the model after every batch.

    Args:
        train_dataloader: sized iterable of batches; each batch is a dict with
            'ids', 'mask', 'token_type_ids' and 'targets' tensors.
        model: module called as ``model(ids, attn_mask, token_type_ids)``.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(outputs, targets)``.

    Returns:
        (model, losses): the same model object and the list of per-batch loss
        values as Python floats.
    """
    # Move batches to wherever the model's weights live instead of relying on a
    # notebook-level `device` variable being defined and in sync with the model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    losses = []
    model.train()
    loop = tq.tqdm(enumerate(train_dataloader), total=len(train_dataloader),
                   leave=True, colour='steelblue')
    for batch_idx, data in loop:
        ids = data['ids'].to(device, dtype=torch.long)
        attn_mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        # forward
        outputs = model(ids, attn_mask, token_type_ids)
        loss = criterion(outputs, targets)
        losses.append(loss.item())

        # backward: clear stale gradients, backpropagate, then update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return model, losses

def evalModel(val_dataloader, model, optimizer, criterion):
    """Compute the per-batch loss over a loader without updating the model.

    The model is switched to eval mode and left in that mode on return.

    Args:
        val_dataloader: iterable of batches; each batch is a dict with 'ids',
            'mask', 'token_type_ids' and 'targets' tensors.
        model: module called as ``model(ids, attn_mask, token_type_ids)``.
        optimizer: unused; kept so existing call sites keep working.
        criterion: loss called as ``criterion(outputs, targets)``.

    Returns:
        List of per-batch loss values as Python floats (empty for an empty loader).
    """
    # Move batches to wherever the model's weights live instead of relying on a
    # notebook-level `device` variable being defined and in sync with the model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    losses = []
    model.eval()

    with torch.no_grad():
        for batch_idx, data in enumerate(val_dataloader):
            ids = data['ids'].to(device, dtype=torch.long)
            attn_mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            outputs = model(ids, attn_mask, token_type_ids)
            loss = criterion(outputs, targets)
            losses.append(loss.item())

    return losses

In [12]:
# Lowest validation loss seen so far. Starting at +inf guarantees that the first
# epoch always writes a checkpoint, whatever the scale of the loss.
bestLoss = float('inf')
model_save_path = '/kaggle/working/model.pth'
for epoch in range(EPOCHS):
    print(f'Epoch: {epoch}')
    model, train_losses = trainModel(train_data_loader, model, optimizer, criterion)
    val_losses = evalModel(val_data_loader, model, optimizer, criterion)
    train_loss = np.mean(train_losses)
    val_loss = np.mean(val_losses)
    print(f'train_loss={train_loss:.4f}, val_loss={val_loss:.4f}')

    # Select the checkpoint on validation loss: training loss keeps falling even
    # when the model starts to overfit, so it cannot identify the best epoch.
    if val_loss < bestLoss:
        torch.save(model.state_dict(), model_save_path)
        bestLoss = val_loss

Epoch: 0


  0%|          | 0/2188 [00:00<?, ?it/s]

train_loss=0.2848, val_loss=0.1750
Epoch: 1


  0%|          | 0/2188 [00:00<?, ?it/s]

train_loss=0.1445, val_loss=0.1156
Epoch: 2


  0%|          | 0/2188 [00:00<?, ?it/s]

train_loss=0.1001, val_loss=0.0959


# MODEL EVALUATION

In [16]:
# Rebuild the architecture and restore the checkpoint written by the training loop.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = modelArchitecture()
# map_location remaps the stored tensors onto the current device, so a checkpoint
# saved on a GPU can still be loaded in a CPU-only session.
model.load_state_dict(torch.load(model_save_path, map_location=device))
model = model.to(device)

# Switch to inference mode; as the last expression this also shows the model summary.
model.eval()

modelArchitecture(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

In [19]:
# Per-batch losses on the test loader; evalModel does not use the optimizer
# argument, it is passed only because the signature requires it.
test_losses = evalModel(test_data_loader, model, optimizer, criterion)
# Unweighted mean of the per-batch losses.
print(np.mean(test_losses))

0.09406954222031112


In [22]:
from sklearn.metrics import classification_report
def get_predictions(model, data_loader):
    """Run the model over a data loader and collect thresholded predictions.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)`` that returns
            raw logits; it is switched to eval mode.
        data_loader: iterable of batches; each batch is a dict with 'ids', 'mask',
            'token_type_ids' and 'targets' tensors.

    Outputs:
      predictions - sigmoid outputs rounded to 0/1, one row per sample
      prediction_probs - sigmoid outputs, one row per sample
      target_values - the targets as float tensors, one row per sample

    All three are CPU tensors, concatenated over batches in loader order. If the
    loader yields no batches, three empty tensors are returned.
    """
    model = model.eval()

    # Move batches to wherever the model's weights live instead of relying on a
    # notebook-level `device` variable being defined and in sync with the model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    batch_predictions = []
    batch_probs = []
    batch_targets = []

    with torch.no_grad():
        for data in data_loader:
            ids = data["ids"].to(device, dtype=torch.long)
            mask = data["mask"].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)

            outputs = model(ids, mask, token_type_ids)
            # add sigmoid, for the training sigmoid is in BCEWithLogitsLoss
            probs = torch.sigmoid(outputs).detach().cpu()

            batch_probs.append(probs)
            # thresholding at 0.5
            batch_predictions.append(probs.round())
            # Targets are only collected, never fed to the model, so they stay on the CPU.
            batch_targets.append(data["targets"].to(dtype=torch.float).detach().cpu())

    if not batch_probs:
        return torch.empty(0), torch.empty(0), torch.empty(0)

    # One concatenation per output instead of stacking individual rows.
    predictions = torch.cat(batch_predictions)
    prediction_probs = torch.cat(batch_probs)
    target_values = torch.cat(batch_targets)

    return predictions, prediction_probs, target_values

In [23]:
# Collect thresholded predictions, sigmoid probabilities and targets for the test loader.
predictions, prediction_probs, target_values = get_predictions(model, test_data_loader)

In [28]:
# Label names in the same order as the entries of each label vector.
target_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'Z']
# zero_division=0 reports undefined precision/recall as 0 without emitting an
# undefined-metric warning; the reported numbers are the same as with the default.
print(classification_report(target_values, predictions, target_names=target_list,
                            zero_division=0))

              precision    recall  f1-score   support

           A       0.97      0.95      0.96      3488
           B       0.99      1.00      0.99      6957
           C       0.97      0.97      0.97      3910
           D       0.98      0.99      0.98      4651
           E       0.96      0.97      0.97      5909
           F       0.94      0.93      0.94      1337
           G       0.96      0.96      0.96      5059
           H       0.94      0.87      0.91       967
           I       0.92      0.84      0.88       873
           J       0.96      0.78      0.86       873
           L       0.94      0.94      0.94      1161
           M       0.98      0.99      0.99      3131
           N       0.93      0.96      0.95      3396
           Z       0.98      0.96      0.97      1221

   micro avg       0.97      0.96      0.96     42933
   macro avg       0.96      0.94      0.95     42933
weighted avg       0.97      0.96      0.96     42933
 samples avg       0.97   

  _warn_prf(average, modifier, msg_start, len(result))
