In [4]:
!pip install -q transformers datasets

In [5]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig, AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoConfig

# from google.colab import drive
# drive.mount('/content/gdrive')

PATH_NAME = "./"
# %cd {PATH_NAME}

  from .autonotebook import tqdm as notebook_tqdm


## Data Preprocessing

In [6]:

## Sections of config

# Defining key variables for dataLoader, Training
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
LEARNING_RATE = 1e-05

checkpoint = "trueto/medbert-base-wwm-chinese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer.model_max_len=512
EPOCHS=5
FILE_NAME = "3-5-medical-bert.bin"

In [7]:
df = pd.read_csv("patient_data.csv")
#df['list'] = df[df.columns[1:3]].values.tolist()
new_df = df[['id', 'doctor_faculty', 'description']].copy()
# new_df.head()

#map each department to an index 
departments = new_df['doctor_faculty'].unique()
d2ind = {departments[i]:i for i in range(len(departments))}
ind2d = {i:departments[i] for i in range(len(departments))}

ground_truths = []
for i, row in new_df.iterrows():
  dept_name = row['doctor_faculty']
  dept_ind = d2ind[dept_name]
  one_hot = np.zeros(len(departments))
  one_hot[dept_ind] = 1
  ground_truths.append(one_hot)
new_df['one_hot'] = list(ground_truths)

new_df.drop(['id', 'doctor_faculty'], axis='columns', inplace=True)

new_df.head()

Unnamed: 0,description,one_hot
0,疾病： 孕期不舒服病情描述： 近两天肚子有下坠感，不痛，心跳比较快，其他没有不舒服希望获得的...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,疾病： 指甲上突起的横纹病情描述： 上周发现拇指指甲上突起一道横纹！不疼不痒，突起来的横纹！...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,疾病： hpv疫苗病情描述： 我每两年做一次宫颈癌全套筛查都是阴性，无任何感染。tct阴，去...,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,疾病： 宝宝眼角红红的，严重时轻微溃烂。病情描述： 宝宝眼角红红的氧，用小手挠，严重时轻微溃...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,疾病： 宝宝奶粉过敏、湿疹病情描述： 两个礼拜前喂了普通的飞鹤奶粉，是用勺子喂的，喝完一点点...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [8]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.description = dataframe.description
        self.targets = self.data.one_hot
        self.max_len = max_len

    def __len__(self):
        return len(self.description)

    def __getitem__(self, index):
        description = str(self.description[index])
        description = " ".join(description.split())

        inputs = self.tokenizer.encode_plus(
            description,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [9]:
# Creating the dataset and dataloader for the neural network
train_size = 0.8
train_dataset=new_df.sample(frac=train_size,random_state=200)

test_dataset=new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

FULL Dataset: (25930, 2)
TRAIN Dataset: (20744, 2)
TEST Dataset: (5186, 2)


##Training

In [10]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [11]:
from torch import nn
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput

class CustomModel(torch.nn.Module):
  def __init__(self,checkpoint,num_labels): 
    super(CustomModel,self).__init__() 
    self.num_labels = num_labels 
    self.projection_dim = 256
    self.temperature = 0.5

    #Load Model with given checkpoint and extract its body
    myConfig = AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True)
    myConfig.problem_type = "multi_label_classification"
    self.model = model = AutoModel.from_pretrained(checkpoint,config=myConfig)

    # Freezing paramaters
    # for param in self.model.parameters():
    #         param.requires_grad = False
    self.projection_head = nn.Sequential(
            nn.Linear(self.model.config.hidden_size, self.projection_dim),
            nn.ReLU(),
            nn.Linear(self.projection_dim, self.projection_dim),
        )
    self.dropout = torch.nn.Dropout(0.1) 
    self.classifier = torch.nn.Linear(self.projection_dim,num_labels) # load and initialize weights
    self.criterion = torch.nn.CrossEntropyLoss() # define loss function

  def forward(self, input_ids=None, attention_mask=None,labels=None):
    #Extract outputs from the body

    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

    # # Project the embeddings
    # projected = self.projection_head(outputs)

    # # Normalize the projected embeddings
    # projected = nn.functional.normalize(projected, p=2, dim=1)

    #Add custom layers
    sequence_output = self.dropout(outputs[0]) #outputs[0]=last hidden state
    #sequence_output.shape torch.Size([8, 200, 768]) sequence_output[:,0,:].shape torch.Size([8, 768]) sequence_output[:,0,:].view(-1,768).shape torch.Size([8, 768])

    hidden = self.projection_head(sequence_output[:,0,:])
    z = torch.nn.functional.normalize(hidden, dim=1) # L2 normalize the projected output
    sloss = self.contrastive_loss(hidden, labels)

    # logits = self.classifier(sequence_output[:,0,:].view(-1,768)) # calculate losses # NOT SURE WHY WE ARE ONLY GRABBING THE FIRST TOKEN
    logits = self.classifier(z) #predict the labels based on the projected output
    mloss = self.criterion(logits, labels)
    
    # newLoss = TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions).loss
    # this returns the same loss for some reason
    loss = mloss + sloss
    return loss

  def contrastive_loss(self, hidden, labels):
        # Calculate supervised contrastive loss
        device = hidden.device

        # Get batch size
        N = hidden.shape[0]
        # Calculate cosine similarity between all pairs of embeddings
        similarities = torch.matmul(hidden, hidden.T) / self.temperature

        labels = torch.argmax(labels, dim=1)
        print("hidden.shape", hidden.shape)
        print("labels", labels.shape, labels)
        # Generate positive pairs
        # mask = torch.eq(labels.view(N, 1), labels.view(1, N)).to(device)
        # pos_mask = mask.logical_and(torch.eye(N).bool().to(device))
        # pos_similarities = similarities[pos_mask].view(N, -1)
        mask = torch.eq(labels.view(N, 1), labels.view(1, N)).to(device)
        pos_mask = mask.logical_and(torch.eye(N).bool().to(device)).float()
        pos_similarities = similarities * pos_mask
        pos_similarities = torch.sum(pos_similarities, dim=1, keepdim=True)

        print('pos_similarities.shape', pos_similarities.shape)
        print('pos_similarities', pos_similarities)

        # Generate negative pairs
        neg_mask =  (~mask).triu(diagonal=-1).bool().to(device)
        print("neg_mask.shape", neg_mask.shape)
        print("similarities.shape", similarities.shape)
        print("simlarities[neg_mask].shape", similarities[neg_mask].shape)
        neg_similarities = similarities[neg_mask].view(N, -1)

        # Concatenate positive and negative similarities
        similarities = torch.cat([pos_similarities, neg_similarities], dim=1)

        # Calculate supervised contrastive loss
        targets = torch.arange(N).to(device)
        loss = torch.nn.functional.cross_entropy(similarities, targets)


        # print("hidden.shape, labels.shape", hidden.shape, labels.shape)
        # # Calculate cosine similarity between all pairs of embeddings
        # similarities = torch.matmul(hidden, hidden.T) / self.temperature
        # print("similarities.shape", similarities.shape)

        # labels = torch.argmax(labels, dim=1)
        # # labels.shape = [8,10] N = 8
        # print("labels.shape", labels.shape, "N", N)
        # # Generate positive pairs
        # mask = torch.eq(labels.view(N, 1), labels.view(1, N)).to(device)
        # pos_mask = mask.logical_and(torch.eye(N).bool().to(device))
        # pos_similarities = similarities[pos_mask].view(N, -1)

        # # Generate negative pairs
        
        # neg_mask =  (~mask) & (1 - torch.eye(N)).bool().to(device)
        # print("neg_mask.shape", neg_mask.shape, "N", N, "similarities.shape", similarities.shape, "similarities[neg_mask].shape", similarities[neg_mask].shape)
        # neg_similarities = similarities[neg_mask].view(N, -1)

        # # Concatenate positive and negative similarities
        # similarities = torch.cat([pos_similarities, neg_similarities], dim=1)

        # # Calculate supervised contrastive loss
        # targets = torch.arange(N).to(device)
        # loss = torch.nn.functional.cross_entropy(similarities, targets)

        # return loss
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model=CustomModel(checkpoint=checkpoint,num_labels=10).to(device)

Some weights of the model checkpoint at trueto/medbert-base-wwm-chinese were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
from tqdm.auto import tqdm
num_training_steps = EPOCHS * len(training_loader)
from datasets import load_metric
metric = load_metric("f1")

progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(EPOCHS * len(testing_loader)))
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

best_loss = float('inf')
best_accuracy = float('inf')

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

for epoch in range(EPOCHS):
  model.train()
  total_loss_current_epoch = 0
  for _,data in enumerate(training_loader, 0):

      ids = data['ids'].to(device, dtype = torch.long)
      mask = data['mask'].to(device, dtype = torch.long)
      token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
      targets = data['targets'].to(device, dtype = torch.float)

      #Calling the model
      outputs = model(ids, mask, targets)

      loss = outputs
      loss.backward()
      optimizer.step()
      # lr_scheduler.step()
      optimizer.zero_grad()
      progress_bar_train.update(1)
      if _%1000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
      
      #Calculating total loss during this EPOCH
      total_loss_current_epoch += loss.item()

  outputs, targets = validation()
  outputs = np.array(outputs) >= 0.5
  accuracy = metrics.accuracy_score(targets, outputs)

  if accuracy < best_accuracy:
      best_accuracy = accuracy

      torch.save(model.classifier.state_dict(), "{PATH_NAME}/{FILE_NAME}".format(PATH_NAME=PATH_NAME, FILE_NAME = "classification_head_weights.bin"))
      #torch.save(model.state_dict(), "{PATH_NAME}/{FILE_NAME}".format(PATH_NAME=PATH_NAME, FILE_NAME = FILE_NAME))
  print(f"Epoch {epoch} - Validation Accuracy: {accuracy}")

  epoch_loss = total_loss_current_epoch / len(training_loader)
  print(f"Epoch {epoch} - Validation Loss: {epoch_loss}")


  metric = load_metric("f1")
  0%|          | 0/12965 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


hidden.shape torch.Size([8, 256])
labels torch.Size([8]) tensor([0, 2, 0, 1, 5, 4, 8, 1], device='cuda:0')
pos_similarities.shape torch.Size([8, 1])
pos_similarities tensor([[19.9176],
        [18.7996],
        [17.3792],
        [20.4328],
        [22.7981],
        [19.8031],
        [20.0491],
        [ 9.4517]], device='cuda:0', grad_fn=<SumBackward1>)
neg_mask.shape torch.Size([8, 8])
similarities.shape torch.Size([8, 8])
simlarities[neg_mask].shape torch.Size([33])




RuntimeError: shape '[8, -1]' is invalid for input of size 33

## Validation 

In [13]:

def validation(model_weights_path = None):

    if model_weights_path:
      state_dict = torch.load(PATH_NAME+model_weights_path)
      model.load_state_dict(state_dict)
      # model.classifier.load_state_dict(torch.load(PATH_NAME+'classification_head_weights.bin'))

    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, targets)
            sigmoids = torch.sigmoid(outputs.logits).cpu().detach().numpy().tolist()
            idx = np.argmax(sigmoids, axis=-1)
            sigmoids = np.zeros( (len(sigmoids), len(sigmoids[1])) )
            sigmoids[ np.arange(len(sigmoids)), idx] = 1
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(sigmoids)
    return fin_outputs, fin_targets

In [14]:
from sklearn import metrics

# Define additional metrics
def precision(outputs, targets):
    return metrics.precision_score(targets, outputs, average = 'weighted')

def recall(outputs, targets):
    return metrics.recall_score(targets, outputs, average = 'weighted')

In [16]:
for epoch in range(1):
    outputs, targets = validation()
    print('outputs', outputs)
    print('targets', targets)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    precision_score = precision(outputs, targets)
    recall_score = recall(outputs, targets)
      
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    print(f"Precision Score = {precision_score}")
    print(f"Recall Score = {recall_score}")

hidden.shape torch.Size([4, 256])
labels torch.Size([4]) tensor([9, 7, 1, 1], device='cuda:0')
pos_similarities.shape torch.Size([4, 1])
pos_similarities tensor([[15.9112],
        [17.0189],
        [19.9760],
        [17.6130]], device='cuda:0')
neg_mask.shape torch.Size([4, 4])
similarities.shape torch.Size([4, 4])
simlarities[neg_mask].shape torch.Size([7])


RuntimeError: shape '[4, -1]' is invalid for input of size 7

In [None]:
torch.save(model.state_dict(), "{PATH_NAME}/roberta-base-additional-classification-layer-bert-medical.bin".format(PATH_NAME=PATH_NAME))