In [None]:
!pip install -q transformers datasets

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig, AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoConfig

PATH_NAME = "./"
# %cd {PATH_NAME}

## Data Preprocessing

In [None]:

## Sections of config

# Defining key variables for dataLoader, Training
MAX_LEN = 200
BATCH_SIZE = 8
LEARNING_RATE = 1e-05

checkpoint = "trueto/medbert-base-wwm-chinese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer.model_max_len=512
EPOCHS=5
FILE_NAME = "3-5-medical-bert.bin"

In [None]:
df = pd.read_csv("patient_data.csv")
#df['list'] = df[df.columns[1:3]].values.tolist()
new_df = df[['id', 'doctor_faculty', 'description']].copy()
# new_df.head()

#map each department to an index 
departments = new_df['doctor_faculty'].unique()
d2ind = {departments[i]:i for i in range(len(departments))}
ind2d = {i:departments[i] for i in range(len(departments))}

ground_truths = []
for i, row in new_df.iterrows():
  dept_name = row['doctor_faculty']
  dept_ind = d2ind[dept_name]
  one_hot = np.zeros(len(departments))
  one_hot[dept_ind] = 1
  ground_truths.append(one_hot)
new_df['one_hot'] = list(ground_truths)

new_df.drop(['id', 'doctor_faculty'], axis='columns', inplace=True)

new_df.head()

In [None]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.description = dataframe.description
        self.targets = self.data.one_hot
        self.max_len = max_len

    def __len__(self):
        return len(self.description)

    def __getitem__(self, index):
        description = str(self.description[index])
        description = " ".join(description.split())

        inputs = self.tokenizer.encode_plus(
            description,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
# Creating the dataset and dataloader for the neural network
train_size = 0.8
train_dataset=new_df.sample(frac=train_size,random_state=200)

test_dataset=new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 2
                }

test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 2
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

##Training

In [None]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [None]:
from torch import nn
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput

class CustomModel(torch.nn.Module):
  def __init__(self,checkpoint,num_labels,temperature=0.5, dropout_rate = 0.1): 
    super(CustomModel,self).__init__() 
    self.num_labels = num_labels 
    self.projection_dim = 256
    self.temperature = temperature
    self.dropout_rate = dropout_rate

    #Load Model with given checkpoint and extract its body
    myConfig = AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True)
    myConfig.problem_type = "multi_label_classification"
    myConfig.temperature = self.temperature

    self.model = model = AutoModel.from_pretrained(checkpoint,config=myConfig)

    # Freezing paramaters
    # for param in self.model.parameters():
    #         param.requires_grad = False

    self.dropout = torch.nn.Dropout(self.dropout_rate) 
    self.classifier = torch.nn.Linear(self.model.config.hidden_size,num_labels) # load and initialize weights
    self.criterion = torch.nn.CrossEntropyLoss() # define loss function

  def forward(self, input_ids=None, attention_mask=None,labels=None):
    #Extract outputs from the body

    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
    sequence_output = self.dropout(outputs[0])
    logits = self.classifier(sequence_output[:,0,:]) #predict the labels based on the projected output
    loss = self.criterion(logits, labels)
    
    #sequence_output.shape torch.Size([8, 200, 768]) sequence_output[:,0,:].shape torch.Size([8, 768]) sequence_output[:,0,:].view(-1,768).shape torch.Size([8, 768])
    return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model=CustomModel(checkpoint=checkpoint,num_labels=10).to(device)
print(model)

In [None]:
from tqdm.auto import tqdm
num_training_steps = EPOCHS * len(training_loader)
from datasets import load_metric
metric = load_metric("f1")

def train(optimizer, model, training_loader, testing_loader, device, num_epochs, learning_rate = 0.1):
    best_accuracy = float('inf')

    print("Training Started with hyperparameters: batch_{BATCH}_lr_{LEARNING_RATE}_dropout_{DROPOUT}_temp_{TEMPERATURE}_weights.bin".format(BATCH=training_loader.batch_size, LEARNING_RATE=learning_rate, DROPOUT=model.dropout_rate, TEMPERATURE=model.temperature))
    
    for epoch in range(num_epochs):
        model.train()
        total_loss_current_epoch = 0
        for _,data in enumerate(training_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            #Calling the model
            outputs = model(ids, mask, targets)

            loss = outputs.loss
            loss.backward()
            optimizer.step()
            # lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar_train.update(1)
            if _%5000==0:
                print(f'Epoch: {epoch}, Loss:  {loss.item()}')
            
            #Calculating total loss during this EPOCH
            total_loss_current_epoch += loss.item()

        outputs, targets = validation(model, testing_loader,device)
        outputs = np.array(outputs) >= 0.5
        accuracy = round(metrics.accuracy_score(targets, outputs),3)

        if accuracy < best_accuracy:
            best_accuracy = accuracy

            torch.save(model.classifier.state_dict(), "{PATH_NAME}/{FILE_NAME}".format(PATH_NAME=PATH_NAME, FILE_NAME = "3_8_accuracy_{ACCURACY}_batch_{BATCH}_lr_{LEARNING_RATE}_dropout_{DROPOUT}_temp_{TEMPERATURE}_weights.bin".format(ACCURACY=accuracy, BATCH=training_loader.batch_size, LEARNING_RATE=learning_rate, DROPOUT=model.dropout_rate, TEMPERATURE=model.temperature)))
            #torch.save(model.state_dict(), "{PATH_NAME}/{FILE_NAME}".format(PATH_NAME=PATH_NAME, FILE_NAME = FILE_NAME))
        print(f"Epoch {epoch} - Validation Accuracy: {accuracy}")

        epoch_loss = total_loss_current_epoch / len(training_loader)
        print(f"Epoch {epoch} - Validation Loss: {epoch_loss}")
        print("#"*50)

progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(EPOCHS * len(testing_loader)))
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

train(optimizer, model, training_loader, testing_loader, device, EPOCHS, LEARNING_RATE)

# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'




## Validation 

In [None]:

def validation( model, testing_loader, device, model_weights_path = None):

    if model_weights_path:
      state_dict = torch.load(PATH_NAME+model_weights_path)
      model.load_state_dict(state_dict)
      # model.classifier.load_state_dict(torch.load(PATH_NAME+'classification_head_weights.bin'))

    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            outputs = model(ids, mask, targets)
            
            sigmoids = torch.sigmoid(outputs.logits).cpu().detach().numpy().tolist()
            idx = np.argmax(sigmoids, axis=-1)
            sigmoids = np.zeros( (len(sigmoids), len(sigmoids[1])) )
            sigmoids[ np.arange(len(sigmoids)), idx] = 1
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(sigmoids)
    return fin_outputs, fin_targets

## Grid Search

In [None]:
param_grid = {'dropout': [0.1, 0.2],
              'learning_rate': [0.0001,0.001, 0.01],
              'batch_size': [32, 64],
              'temperature': [0.9, 0.7]}

from sklearn.metrics import accuracy_score
import torch.optim as optim

for batch_size in param_grid['batch_size']:
    train_params = {'batch_size': batch_size,
                'shuffle': True,
                'num_workers': 2
                }

    test_params = {'batch_size': batch_size,
                'shuffle': True,
                'num_workers': 2
                }

    training_loader = DataLoader(training_set, **train_params)
    testing_loader = DataLoader(testing_set, **test_params)

    for learning_rate in param_grid['learning_rate']:
        for dropout_rate in param_grid['dropout']:
            for temperature in param_grid['temperature']:
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                model=CustomModel(checkpoint=checkpoint, num_labels=10, temperature=temperature, dropout_rate=dropout_rate).to(device)
                optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

                train(optimizer, model, training_loader, testing_loader, device, EPOCHS, LEARNING_RATE)

                

                


In [11]:
from sklearn import metrics

# Define additional metrics
def precision(outputs, targets):
    return metrics.precision_score(targets, outputs, average = 'weighted')

def recall(outputs, targets):
    return metrics.recall_score(targets, outputs, average = 'weighted')

In [13]:
for epoch in range(1):
    outputs, targets = validation()
    print('outputs', outputs)
    print('targets', targets)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    precision_score = precision(outputs, targets)
    recall_score = recall(outputs, targets)
      
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    print(f"Precision Score = {precision_score}")
    print(f"Recall Score = {recall_score}")

TypeError: validation() missing 3 required positional arguments: 'model', 'testing_loader', and 'device'

In [None]:
torch.save(model.state_dict(), "{PATH_NAME}/roberta-base-additional-classification-layer-bert-medical.bin".format(PATH_NAME=PATH_NAME))