In [1]:
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re
%matplotlib inline

# Preprocessing Data

In [2]:
#Read in text, create inclusion and exclusion columns, and clean
df = pd.read_csv("data_tier-1_sample.csv")
df.reset_index(inplace = True)
df.columns = ["query", "cohort", "intent"]
df["cohort"] = df["cohort"].apply(json.loads) 

# Convert labels to number
label_values = list(set(df["intent"].values))
labels2idx = {t:i for i, t in enumerate(label_values)}
df["intent"] = df["intent"].apply(labels2idx.get)

df.head(5)

Unnamed: 0,query,cohort,intent
0,what are the risks of high risk pregnancy due ...,{'inclusion': ['high risk pregnancy due to his...,2
1,what are the most effective treatments for pat...,{'inclusion': ['cardiac arrest with successful...,6
2,what is the spectrum of illness severity and c...,"{'inclusion': [], 'exclusion': ['deep venous t...",4
3,How can disorder of eye region care be modifie...,"{'inclusion': ['disorder of eye region'], 'exc...",2
4,Should 5-bromo-3-(pyrrolidin-1-ylsulfonyl)-1h-...,{'inclusion': ['5-bromo-3-(pyrrolidin-1-ylsulf...,6


# BERT 

In [3]:
import torch
from torch import nn

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from transformers import BertModel, BertForSequenceClassification
from transformers.optimization import AdamW

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from seqeval.metrics import f1_score, accuracy_score
from sklearn.metrics import f1_score as f1

import copy

Using TensorFlow backend.


In [4]:
device = torch.device("cuda")

In [5]:
msk = np.random.rand(len(df)) < 0.7
train = df[msk]
val = df[~msk]

In [6]:
# Creating function to tokenize inputs, create masks, and output data as tensor dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)
def bert_load(data, inputs, labels, max_len):
    '''
    Load in data
    Return BERT's preprocessed inputs including token_id, mask, label
    '''
    token_ids = []
    attention_masks = []
    for row in data[str(inputs)]:
        encoded_dict = tokenizer.encode_plus(row,
                                            add_special_tokens= True, #add [CLS], [SEP]
                                            max_length = max_len,  
                                            pad_to_max_length = True, #pad and truncate
                                            return_attention_mask = True, #construct attention mask
                                            return_tensors = 'pt') #return pytorch tensor
        
        token_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    token_ids = torch.cat(token_ids,dim=0)
    attention_masks = torch.cat(attention_masks,dim=0)
    labels = torch.tensor(data[str(labels)].values)
    data_out = TensorDataset(token_ids, attention_masks, labels)
    return data_out
        
max_len = max([len(tokenizer.tokenize(query)) for query in df["query"]])
datatrain = bert_load(train, "query", "intent", max_len)   
dataval = bert_load(val, "query", "intent", max_len)

In [7]:
BATCH_SIZE = 32
trainloader = DataLoader(datatrain,
                           batch_size=BATCH_SIZE,
                           shuffle=True)

valloader = DataLoader(dataval,
                           batch_size=BATCH_SIZE,
                           shuffle=True)
                           

In [8]:
# Driver function
def trainBERT(model, trainloader, val_loader, num_epoch=20):
    
    # Training steps
    loss_fn = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=2e-5, eps= 1e-8) 
    
    train_loss = []
    train_acc = []
    val_loss = []
    val_acc = []
    best_f1 = 0.

    all_f1 = []
    all_pred = []
    all_label = []

    for epoch in range(num_epoch):
        model.train()
        #Initialize
        correct = 0
        total = 0
        total_loss = 0
        pred_list = []
        labels_list = []
        f1_scores = []

        for i, (data, mask, labels) in enumerate(trainloader):
            data, mask, labels = data.to(device), mask.to(device), labels.to(device)
            model.zero_grad()

            loss, outputs = model(data, token_type_ids = None,
                                  attention_mask= mask,
                                  labels =labels)

            loss.backward()
            optimizer.step()
            label_cpu = labels.squeeze().to('cpu').numpy()
            pred = outputs.data.max(-1)[1].to('cpu').numpy()
            
            # For accuracy
            total += labels.size(0)
            correct += float(sum((pred ==label_cpu)))
            total_loss += loss.item()
            
            # For F1
            f1_scores.append(f1(list(pred), list(label_cpu), average = "weighted"))
            
            all_f1.append(f1(list(pred), list(label_cpu), average = "weighted"))
            all_pred.append(pred)
            all_label.append(label_cpu)
            
        acc = correct/total
        t_loss = total_loss/total
        train_loss.append(t_loss)
        train_acc.append(acc)
        
        
        # report performance 
        print("Train Loss: {}".format(t_loss))
        print("Train Accuracy: {}".format(acc))
        print("Train F1-Score: {}".format(sum(f1_scores)/len(f1_scores)))
        print()
    
    # Evaluate after every epoch
        #Reset the initialization
        correct = 0
        total = 0
        total_loss = 0
        model.eval()
        
        predictions =[]
        truths= []
        val_f1_scores = []

        with torch.no_grad():
            for i, (data, mask, labels) in enumerate(val_loader):
                data, mask, labels = data.to(device), mask.to(device), labels.to(device)
                model.zero_grad()

                va_loss, outputs = model(data, token_type_ids = None,
                                      attention_mask= mask,
                                      labels =labels)

                label_cpu = labels.squeeze().to('cpu').numpy()
                
                pred = outputs.data.max(-1)[1].to('cpu').numpy()
                total += labels.size(0)
                correct += float(sum((pred == label_cpu)))
                total_loss += va_loss.item()
                
                predictions += list(pred)
                truths += list(label_cpu)
                
                #F1 scores calculation
                val_f1_scores.append(f1(list(pred), list(label_cpu), average = "weighted"))
                       
            v_acc = correct/total
            v_loss = total_loss/total
            val_loss.append(v_loss)
            val_acc.append(v_acc)
            v_f1 = sum(val_f1_scores)/len(val_f1_scores)
            
            print("Validation Loss: {}".format(v_loss))
            print("Validation Accuracy: {}".format(v_acc))
            print("Validation F1-Score: {}".format(v_f1))
            print()
             
    # Sanity check
    counter = 0
    number = 2000

    for row in range(number):
        query = df["query"][row]
        label = df["intent"][row]
        query_ids = tokenizer.encode(query)
        query_ids = torch.tensor(tokenizer.encode(query)).unsqueeze(0)
        with torch.no_grad():
            inference_output = model(torch.tensor(query_ids).to(device))
    #     print("Prediction: ", inference_output[0].data.max(-1)[1].to('cpu').numpy()[0], "\n")
    #     print("Label: ", label)
        if inference_output[0].data.max(-1)[1].to('cpu').numpy()[0] == label:
            counter += 1
    print(counter/number)

In [9]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(labels2idx))
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [10]:
import warnings
warnings.filterwarnings('ignore')

train_loss, train_acc, val_loss, val_acc, val_f1, model, all_f1, all_pred, all_label = trainBERT(model, trainloader, valloader, num_epoch=6)

Train Loss: 0.018523494029316415
Train Accuracy: 0.8375777020252657
Train F1-Score: 0.8678720102274695

Validation Loss: 0.0014558453424263474
Validation Accuracy: 1.0
Validation F1-Score: 1.0

Train Loss: 0.0009712120422845807
Train Accuracy: 1.0
Train F1-Score: 1.0

Validation Loss: 0.0003642549113758915
Validation Accuracy: 1.0
Validation F1-Score: 1.0

Train Loss: 0.00037032398816361705
Train Accuracy: 1.0
Train F1-Score: 1.0

Validation Loss: 0.00019248230828892253
Validation Accuracy: 1.0
Validation F1-Score: 1.0

Train Loss: 0.00021730203072069666
Train Accuracy: 1.0
Train F1-Score: 1.0

Validation Loss: 0.0001251649368643572
Validation Accuracy: 1.0
Validation F1-Score: 1.0

Train Loss: 0.00014880258753949542
Train Accuracy: 1.0
Train F1-Score: 1.0

Validation Loss: 9.027210586496838e-05
Validation Accuracy: 1.0
Validation F1-Score: 1.0

Train Loss: 0.0001094197920322454
Train Accuracy: 1.0
Train F1-Score: 1.0

Validation Loss: 6.792639589237644e-05
Validation Accuracy: 1.0
Val

TypeError: cannot unpack non-iterable NoneType object

In [None]:
torch.save(model, "Intent.pth")

# Inference

In [9]:
model = torch.load("Intent.pth")
model.eval()
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [27]:
counter = 0
number = 2000

for row in range(number):
    query = df["query"][row]
    label = df["intent"][row]
    query_ids = tokenizer.encode(query)
    query_ids = torch.tensor(tokenizer.encode(query)).unsqueeze(0)
    with torch.no_grad():
        inference_output = model(torch.tensor(query_ids).to(device))
#     print("Prediction: ", inference_output[0].data.max(-1)[1].to('cpu').numpy()[0], "\n")
#     print("Label: ", label)
    if inference_output[0].data.max(-1)[1].to('cpu').numpy()[0] == label:
        counter += 1
print(counter/number)

1.0


In [36]:
row = 6

query = torch.tensor(tokenizer.encode(df["query"][row])).unsqueeze(0)
with torch.no_grad():
    inference_output = model(torch.tensor(query).to(device))
    
print(inference_output[0].data.max(-1)[1].to('cpu').numpy()[0], df["intent"][row])
# label_indices = np.argmax(inference_output[0].to('cpu').numpy(), axis=2)

6 6
