In [1]:
# Put the parent directory first on the import path so that the project-local
# packages imported further down (common, model, dataLoader) can be found.
# NOTE(review): presumably those packages live one level above this notebook;
# this must run before the project imports.
import sys 
sys.path.insert(0, '../')

In [4]:
%pip install torch pytorch_pretrained_bert pandas scikit-learn numpy #installing missing packages (In Laurence's anaconda environment)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### Load Packages

In [6]:
from common.common import create_folder
from common.pytorch import load_model
import pytorch_pretrained_bert as Bert
from model.utils import age_vocab
from common.common import load_obj
from dataLoader.MLM import MLMLoader
from torch.utils.data import DataLoader
import pandas as pd
from model.MLM import BertForMaskedLM
from model.optimiser import adam
import sklearn.metrics as skm
import numpy as np
import torch
import time
import torch.nn as nn
import os

### Initialize the BERT model configuration from a dictionary of values. `TrainConfig` holds the training settings.

In [7]:
class BertConfig(Bert.modeling.BertConfig):
    """BERT configuration populated from a plain dictionary.

    'hidden_size' is mandatory (a missing key raises KeyError); every other
    key is optional and is forwarded as None when absent. Two additional
    values, 'seg_vocab_size' and 'age_vocab_size', are not passed to the
    parent class but stored directly on the instance.
    """

    def __init__(self, config):
        # Arguments understood by the parent BertConfig, keyed by its
        # parameter names. Note that the maximum sequence length is read
        # from the singular key 'max_position_embedding'.
        parent_kwargs = {
            'vocab_size_or_config_json_file': config.get('vocab_size'),  # token vocabulary size
            'hidden_size': config['hidden_size'],  # width of each hidden layer
            'num_hidden_layers': config.get('num_hidden_layers'),  # transformer layer count
            'num_attention_heads': config.get('num_attention_heads'),  # heads per layer
            'intermediate_size': config.get('intermediate_size'),  # feed-forward layer width
            'hidden_act': config.get('hidden_act'),  # hidden-layer activation
            'hidden_dropout_prob': config.get('hidden_dropout_prob'),  # hidden-layer dropout
            'attention_probs_dropout_prob': config.get('attention_probs_dropout_prob'),  # attention dropout
            'max_position_embeddings': config.get('max_position_embedding'),  # longest supported sequence
            'initializer_range': config.get('initializer_range'),  # weight initialisation range
        }
        super().__init__(**parent_kwargs)

        # Vocabulary sizes for the extra segment and age embeddings.
        self.seg_vocab_size = config.get('seg_vocab_size')
        self.age_vocab_size = config.get('age_vocab_size')
        
class TrainConfig(object):
    """Training settings read from a dictionary.

    Each attribute has the same name as the dictionary key it is read from;
    keys that are absent from ``config`` yield None.
    """

    def __init__(self, config):
        setting_names = (
            'batch_size',            # samples per training batch
            'use_cuda',              # whether a CUDA GPU should be used
            'max_len_seq',           # maximum input sequence length
            'train_loader_workers',  # worker processes for the training loader
            'test_loader_workers',   # worker processes for the test loader
            'device',                # device string such as 'cpu' or 'cuda'
            'output_dir',            # output directory
            'output_name',           # output file name
            'best_name',             # file name for the best-performing model
        )
        for setting in setting_names:
            setattr(self, setting, config.get(setting))

### File Configuration - All file paths are specified here. The corresponding folders and files have been created, but double-check that every path is correct before running.

In [22]:
# Locations of every input and output artefact used by the notebook.
file_config = dict(
    vocab='../data/vocab',  # vocabulary object (idx2token / token2idx mappings)
    data='../data/formatted_data.csv',  # formatted data prepared for model training
    model_path='../models',  # directory where the trained model is saved
    model_name='bert_model_bin',  # file name of the saved model
    file_name='../logs/training.log',  # training log; joined onto model_path when it is opened
)
# create_folder is expected to create the model directory if it is missing.
create_folder(file_config['model_path'])

In [13]:
# Settings shared by data preparation and the training loop.
global_params = {
    'max_seq_len': 64,  # maximum length of an input sequence
    'max_age': 110,  # passed to age_vocab as max_age
    'month': 1,  # passed to age_vocab as mon
    'age_symbol': None,  # passed to age_vocab as symbol
    'min_visit': 5,  # minimum number of visits a patient needs to be kept
    'gradient_accumulation_steps': 1  # batches accumulated per optimiser step
}

# Optimiser settings handed to the project-local adam() helper.
optim_param = {
    'lr': 3e-5,  # learning rate
    'warmup_proportion': 0.1,  # warm-up proportion of the learning-rate schedule
    'weight_decay': 0.01  # regularisation term to prevent overfitting
}

# Fall back to the CPU when no CUDA device is visible (e.g. on a Mac), so
# that moving the model and batches to train_params['device'] does not fail.
_cuda_available = torch.cuda.is_available()

train_params = {
    'batch_size': 256,  # number of samples processed at each training step
    'use_cuda': _cuda_available,  # True only when an NVIDIA GPU is usable
    'max_len_seq': global_params['max_seq_len'],  # maximum sequence length
    'device': 'cuda:0' if _cuda_available else 'cpu'
}

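### Stuck here: loading the vocabulary fails with `EOFError: Ran out of input`, which pickle raises when the file it reads is empty. `data/vocab.pkl` probably needs to be regenerated so that it contains the `idx2token` and `token2idx` mappings (a pickle restores exactly the object that was saved, so it must be written as a dict with those two keys rather than as a tuple of arrays).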

In [23]:
# Load the token vocabulary object; later cells index it with 'token2idx'.
# NOTE(review): the "EOFError: Ran out of input" recorded below this cell is
# the error pickle raises for an empty file — presumably load_obj unpickles
# the file behind file_config['vocab'] and that file has no content; confirm.
BertVocab = load_obj(file_config['vocab'])
# Build the age vocabulary from the global settings; age_vocab returns two
# values and only the first one is kept.
ageVocab, _ = age_vocab(max_age=global_params['max_age'], mon=global_params['month'], symbol=global_params['age_symbol'])

EOFError: Ran out of input

In [None]:
# Load the formatted patient table. The reader is chosen from the file
# extension because file_config['data'] may point at a .csv file, which
# pd.read_parquet cannot parse.
data_path = file_config['data']
if str(data_path).lower().endswith('.csv'):
    data = pd.read_csv(data_path)
else:
    data = pd.read_parquet(data_path)

# The visit count below iterates over each row's code sequence. If the
# sequences came back as plain strings (as happens when list columns are
# round-tripped through CSV) the count would silently be 0 for every patient,
# so fail loudly instead.
if len(data) and isinstance(data['caliber_id'].iloc[0], str):
    raise TypeError(
        "column 'caliber_id' holds strings, not token sequences; "
        "parse it into lists of codes before filtering by visit count"
    )

# remove patients with fewer visits than min_visit; the number of 'SEP'
# tokens in a sequence is used as that patient's visit count
data['length'] = data['caliber_id'].apply(lambda codes: sum(1 for code in codes if code == 'SEP'))
data = data[data['length'] >= global_params['min_visit']]
data = data.reset_index(drop=True)

In [None]:
# Wrap the filtered table in the project's masked-LM dataset, reading the
# code sequences from the 'caliber_id' column.
Dset = MLMLoader(
    data,
    BertVocab['token2idx'],
    ageVocab,
    max_len=train_params['max_len_seq'],
    code='caliber_id',
)
# Shuffled mini-batches for training, loaded by three worker processes.
trainload = DataLoader(
    Dset,
    batch_size=train_params['batch_size'],
    shuffle=True,
    num_workers=3,
)

In [None]:
# Embedding table sizes are derived from the loaded vocabularies.
n_code_tokens = len(BertVocab['token2idx'].keys())
n_age_tokens = len(ageVocab.keys())

# Hyper-parameters consumed by BertConfig.
model_config = dict(
    vocab_size=n_code_tokens,  # number of disease codes + symbols for the word embedding
    hidden_size=288,  # hidden size of the word and segment embeddings
    seg_vocab_size=2,  # vocabulary size of the segment embedding
    age_vocab_size=n_age_tokens,  # vocabulary size of the age embedding
    max_position_embedding=train_params['max_len_seq'],  # maximum number of tokens
    hidden_dropout_prob=0.1,  # dropout rate
    num_hidden_layers=6,  # number of multi-head attention layers
    num_attention_heads=12,  # number of attention heads
    attention_probs_dropout_prob=0.1,  # multi-head attention dropout rate
    intermediate_size=512,  # size of the "intermediate" layer in the transformer encoder
    hidden_act='gelu',  # encoder/pooler activation; "gelu", 'relu', 'swish' are supported
    initializer_range=0.02,  # parameter weight initializer range
)

In [None]:
# Turn the hyper-parameter dictionary into a BertConfig object ...
conf = BertConfig(model_config)
# ... and build the project's masked-language-model network from it.
model = BertForMaskedLM(conf)

In [None]:
# Move the model onto the configured device before the optimiser is built.
model = model.to(train_params['device'])
# The project-local adam() helper receives the model's (name, parameter)
# pairs together with the optimiser settings in optim_param.
optim = adam(params=list(model.named_parameters()), config=optim_param)

In [None]:
def cal_acc(label, pred):
    """Return the fraction of labelled positions whose top-scoring class is correct.

    Args:
        label: 1-D tensor of target class ids; entries equal to -1 mark
            positions that carry no label and are ignored.
        pred: tensor of scores with one row per entry of ``label``
            (assumed shape (len(label), n_classes) — TODO confirm against
            the model output).

    Returns:
        A float in [0, 1]. This is the micro-averaged precision the notebook
        reports, which for one prediction per position is the same number as
        plain accuracy. Returns 0.0 when no position is labelled.
    """
    label = label.detach().cpu().numpy()
    keep = np.where(label != -1)[0]
    if keep.size == 0:
        # Nothing to score in this batch.
        return 0.0

    scores = pred.detach().cpu().numpy()[keep]
    # Log-softmax is monotonic, so the arg-max of the raw scores is already
    # the predicted class; no normalisation is needed.
    guesses = scores.reshape(len(scores), -1).argmax(axis=1)
    return float(np.mean(guesses == label[keep]))

In [None]:
def train(e, loader):
    """Run one training epoch over ``loader`` and save the model weights.

    Args:
        e: epoch index; used only in the progress message.
        loader: iterable of batches. Each batch must unpack into
            (age_ids, input_ids, posi_ids, segment_ids, attMask, masked_label)
            tensors.

    Returns:
        (tr_loss, cost): the sum of the per-batch loss values (after scaling
        for gradient accumulation) and the wall-clock seconds spent on the
        whole epoch, including saving the weights.
    """
    log_every = 200
    accumulation_steps = global_params['gradient_accumulation_steps']

    tr_loss = 0
    window_loss = 0   # loss accumulated since the last progress message
    window_steps = 0  # batches seen since the last progress message
    epoch_start = time.time()
    window_start = epoch_start

    for step, batch in enumerate(loader):
        batch = tuple(t.to(train_params['device']) for t in batch)
        age_ids, input_ids, posi_ids, segment_ids, attMask, masked_label = batch
        loss, pred, label = model(input_ids, age_ids, segment_ids, posi_ids, attention_mask=attMask, masked_lm_labels=masked_label)
        if accumulation_steps > 1:
            # Scale so the accumulated gradient matches one large batch.
            loss = loss / accumulation_steps
        loss.backward()

        batch_loss = loss.item()
        window_loss += batch_loss
        tr_loss += batch_loss
        window_steps += 1

        if step % log_every == 0:
            # Report the mean loss over the batches actually seen since the
            # previous message, and the time those batches took.
            print("epoch: {}\t| cnt: {}\t|Loss: {}\t| precision: {:.4f}\t| time: {:.2f}".format(e, step + 1, window_loss / window_steps, cal_acc(label, pred), time.time() - window_start))
            window_loss = 0
            window_steps = 0
            window_start = time.time()

        if (step + 1) % accumulation_steps == 0:
            optim.step()
            optim.zero_grad()

    print("** ** * Saving fine - tuned model ** ** * ")
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
    create_folder(file_config['model_path'])
    output_model_file = os.path.join(file_config['model_path'], file_config['model_name'])

    torch.save(model_to_save.state_dict(), output_model_file)

    # Measured from the start of the epoch, not from the last progress message.
    cost = time.time() - epoch_start
    return tr_loss, cost

In [None]:
# Train for 50 epochs and write one tab-separated line per epoch to the log:
# epoch index, mean loss per batch, and the time reported by train().
log_path = os.path.join(file_config['model_path'], file_config['file_name'])

# file_name may carry its own directory part, so make sure that directory
# exists before the log is opened.
log_dir = os.path.dirname(log_path)
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

# train() returns the sum of the per-batch losses; dividing by the number of
# batches gives the mean loss per batch. (The name data_len used here before
# is not defined in the visible notebook.)
n_batches = max(len(trainload), 1)

# The context manager closes the log even if training raises part-way.
with open(log_path, "w") as f:
    f.write('{}\t{}\t{}\n'.format('epoch', 'loss', 'time'))
    for e in range(50):
        loss, time_cost = train(e, trainload)
        loss = loss / n_batches
        f.write('{}\t{}\t{}\n'.format(e, loss, time_cost))
        f.flush()  # keep the log readable while later epochs are running