### install packages

In [1]:
!pip install --pre --upgrade torch==1.6.0.dev20200411+cu101 torchvision -f https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html
!pip install --pre --upgrade pytorch-ignite 
!pip install --upgrade pynvml fire

Looking in links: https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html
Collecting torch==1.6.0.dev20200411+cu101
  Downloading https://download.pytorch.org/whl/nightly/cu101/torch-1.6.0.dev20200411%2Bcu101-cp37-cp37m-linux_x86_64.whl (718.1 MB)
[K     |████████████████████████████████| 718.1 MB 160 bytes/s a 0:00:01   |▌                               | 11.0 MB 734 kB/s eta 0:16:03     |████▊                           | 104.8 MB 39.7 MB/s eta 0:00:16     |██████▊                         | 150.0 MB 47.0 MB/s eta 0:00:13     |██████████████████████▏         | 498.0 MB 53.3 MB/s eta 0:00:05     |██████████████████████████████▎ | 680.4 MB 34.9 MB/s eta 0:00:02
[?25hCollecting torchvision
  Downloading https://download.pytorch.org/whl/nightly/cu101/torchvision-0.8.0.dev20200724%2Bcu101-cp37-cp37m-linux_x86_64.whl (9.2 MB)
[K     |████████████████████████████████| 9.2 MB 59.7 MB/s eta 0:00:01
[31mERROR: torchvision 0.8.0.dev20200724+cu101 has requirement torch==1.7.0.dev20200

In [2]:
#!pip install transformers

In [1]:
import torch
import torchvision
import torch.nn as nn
import random
import time
import math

import pandas as pd
import numpy as np
import collections
from collections import Counter

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
import sklearn.metrics
from sklearn.metrics import f1_score

from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizer

from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.optim import Optimizer
import torch.nn.functional as F

from torch.cuda.amp import autocast
from torch.cuda.amp import GradScaler


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#seed_val = 17
#random.seed(seed_val)
#np.random.seed(seed_val)
#torch.manual_seed(seed_val)
#torch.cuda.manual_seed_all(seed_val)


print(f'working on {device}, GPU is a {torch.cuda.get_device_name()}')
print(f'using pytorch version {torch.__version__}')

working on cuda, GPU is a Tesla K80
using pytorch version 1.5.0


In [3]:
#getting the clinical biobert tokenizer
tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [4]:
#set directories
data_dir = '/floyd/home/data/' 
model_dir = '/floyd/home/models/'

In [5]:
#loads result dataframe or initializes an empty one
try:
    results_df = pd.read_csv(data_dir + 'results.csv', index_col = 0)
except:
    results_df = pd.DataFrame(columns = ['experiment description', 'num samples', 'weighting', 'f1w', 'acc', 'auroc', 'ppv', 'sens', 'batch size'])

results_df.index = list(range(len(results_df)))   #to clean the index 
results_df

Unnamed: 0,experiment description,num samples,weighting,f1w,acc,auroc,ppv,sens,batch size
0,fresh start floydhub,10000,,0.85842,0.904,0.672198,0.0,,32
1,floydhub gpu2 weighted cross ent,25000,"tensor([1.0000, 0.1250])",0.85842,0.904,0.803038,0.0,,32
2,floydhub gpu2 weighted cross ent,25000,"tensor([1, 8])",0.807983,0.764,0.809364,0.704167,0.24564,32
3,fresh start floydhub gpu2,25000,,0.880639,0.9048,0.808262,0.159836,0.541667,32
4,gpu2 weights,25000,"tensor([1, 9])",0.803657,0.7596,0.803181,0.691057,0.244604,32
5,gpu2 weights,25000,"tensor([1, 7])",0.841837,0.8156,0.796174,0.581967,0.283433,32
6,larger sample weighted loss,100000,"tensor([1, 8])",0.827629,0.7931,0.818773,0.677126,0.276561,32
7,larger sample weighted focal loss,50000,"tensor([1, 8])",0.877208,0.899,0.799443,0.170385,0.466667,32
8,weighted focal loss bias trick,50000,"tensor([1, 8])",0.878251,0.9046,0.807105,0.13786,0.536,32
9,tweaked weighted crossentropy,25000,"tensor([0.1111, 0.8887], device='cuda:0', dtyp...",0.814802,0.7736,0.821624,0.710744,0.257485,32


### define helper functions

In [6]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return sklearn.metrics.accuracy_score(labels_flat, preds_flat)

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [7]:
#load up the weighted sampler class
class ImbalancedDatasetSampler(torch.utils.data.sampler.Sampler):
    """Samples elements randomly from a given list of indices for imbalanced dataset
    Arguments:
        indices (list, optional): a list of indices
        num_samples (int, optional): number of samples to draw
        callback_get_label func: a callback-like function which takes two arguments - dataset and index
    """

    def __init__(self, dataset, indices=None, num_samples=None, callback_get_label=None):
                
        # if indices is not provided, 
        # all elements in the dataset will be considered
        self.indices = list(range(len(dataset))) \
            if indices is None else indices

        # define custom callback
        self.callback_get_label = callback_get_label

        # if num_samples is not provided, 
        # draw `len(indices)` samples in each iteration
        self.num_samples = len(self.indices) \
            if num_samples is None else num_samples
            
        # distribution of classes in the dataset 
        label_to_count = {}
        for idx in self.indices:
            label = self._get_label(dataset, idx)
            if label in label_to_count:
                label_to_count[label] += 1
            else:
                label_to_count[label] = 1
                
        # weight for each sample
        weights = [1.0 / label_to_count[self._get_label(dataset, idx)]
                   for idx in self.indices]
        self.weights = torch.DoubleTensor(weights)

    def _get_label(self, dataset, idx):
        if isinstance(dataset, torchvision.datasets.MNIST):
            return dataset.train_labels[idx].item()
        elif isinstance(dataset, torchvision.datasets.ImageFolder):
            return dataset.imgs[idx][1]
        elif isinstance(dataset, torch.utils.data.Subset):
            return dataset.dataset.imgs[idx][1]
        elif self.callback_get_label:
            return self.callback_get_label(dataset, idx)
        else:
            raise NotImplementedError
                
    def __iter__(self):
        return (self.indices[i] for i in torch.multinomial(
            self.weights, self.num_samples, replacement=True))

    def __len__(self):
        return self.num_samples

In [8]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [9]:
def train_model(model, dataloader_train, dataloader_valid, save_name = None, mixed_prec = True, lr = 1e-5, eps = 1e-8, epochs = 3):
  
    #model.to(device)

    optimizer = AdamW(model.parameters(), lr=lr, eps=eps, weight_decay=0.01)
    
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)
    if mixed_prec == True:
        scaler = GradScaler()
  
    for epoch in tqdm(range(1, epochs+1)):
    
        model.train()
    
        loss_train_total = 0

        progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
        for batch in progress_bar:

            model.zero_grad()
        
            batch = tuple(b.to(device) for b in batch)
        
            inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       


            if mixed_prec == True:
                with autocast():
                    outputs = model(**inputs)
            else:
                outputs = model(**inputs)
        
            loss = outputs[0]
            loss_train_total += loss.item()
            
            if mixed_prec == True:
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                optimizer.step()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            scheduler.step()
        
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
        if save_name:
            torch.save(model.state_dict(), f'{model_dir}finetuned_BERT_{save_name}_epoch_{epoch}.model')
        
        tqdm.write(f'\nEpoch {epoch}')
    
        loss_train_avg = loss_train_total/len(dataloader_train)            
        tqdm.write(f'Training loss: {loss_train_avg}')
    
        val_loss, predictions, true_vals = evaluate(dataloader_validation)
        val_f1 = f1_score_func(predictions, true_vals)
        val_acc = accuracy_score_func(predictions, true_vals)
        tqdm.write(f'Validation loss: {val_loss}')
        tqdm.write(f'F1 Score (Weighted): {val_f1}')
        tqdm.write(f'Validation accuracy: {val_acc}')


In [10]:
def train_model_defined_loss(model, dataloader_train, dataloader_valid, save_name = None, mixed_prec = True, lr = 1e-5, eps = 1e-8, epochs = 3, weight_decay = 0.01):

    optimizer = AdamW(model.parameters(), lr=lr, eps=eps, weight_decay=weight_decay)
    
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)
    if mixed_prec == True:
        scaler = GradScaler()
  
    for epoch in tqdm(range(1, epochs+1)):
    
        model.train()
    
        loss_train_total = 0

        progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
        for batch in progress_bar:

            model.zero_grad()
        
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
            
            
            #batch = tuple(b.to(device) for b in batch)
        
            #inputs = {'input_ids':      batch[0],
            #      'attention_mask': batch[1],
            #      'labels':         batch[2],
            #   }       


            if mixed_prec == True:
                with autocast():
                    outputs = model(input_ids, attention_mask)
            else:
                outputs = model(input_ids, attention_mask)
        
            loss = loss_func(outputs[0],labels)
            loss_train_total += loss.item()
            
            #if mixed_prec == True:
            #    scaler.scale(loss).backward()
            #    scaler.step(optimizer)
            #    scaler.update()
            #else:
            loss.backward()
            optimizer.step()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            scheduler.step()
        
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
        if save_name:
            torch.save(model.state_dict(), f'{model_dir}finetuned_BERT_{save_name}_epoch_{epoch}.model')
        
        tqdm.write(f'\nEpoch {epoch}')
    
        loss_train_avg = loss_train_total/len(dataloader_train)            
        tqdm.write(f'Training loss: {loss_train_avg}')
    
        val_loss, predictions, true_vals = evaluate(dataloader_validation)
        val_f1 = f1_score_func(predictions, true_vals)
        val_acc = accuracy_score_func(predictions, true_vals)
        tqdm.write(f'Validation loss: {val_loss}')
        tqdm.write(f'F1 Score (Weighted): {val_f1}')
        tqdm.write(f'Validation accuracy: {val_acc}')


In [11]:
def get_metrics(dataloader): #(predictions, true_vals, dataloader):

    _, predictions, true_vals = evaluate(dataloader)

    preds = [np.argmax(pred) for pred in predictions]
    preds_flat = np.argmax(preds).flatten()
    true_vals = true_vals.flatten()

    f1_w = sklearn.metrics.f1_score(true_vals, preds, average='weighted')
    f1 = sklearn.metrics.f1_score(true_vals, preds, average=None)
    acc = sklearn.metrics.accuracy_score(true_vals, preds)
    prec = sklearn.metrics.precision_score(true_vals,preds, average=None) 
    rec = sklearn.metrics.recall_score(true_vals,preds, average=None)
    auroc = sklearn.metrics.roc_auc_score(true_vals,predictions[:,1], average=None)
    confusion = sklearn.metrics.confusion_matrix(true_vals, preds)

    tn, fn, fp, tp = confusion[0,0], confusion[0,1], confusion[1,0], confusion[1,1]

    sens = tp/(tp + fn)
    spec = tn/(tn + fp)
    ppv = tp/(tp + fp)
    npv = tn/(tn + fn)

    print ('Metrics Report:')
    print ('---------------')
    print ('weighted f1: ', f1_w)
    print ('AUROC:       ',auroc)
    print ('accuracy:    ', acc)
    print ('precision:   ', prec)
    print ('recall:      ', rec)
    print ('sensitivity: ', sens)
    print ('specificity: ', spec)
    print ('PPV:         ', ppv)
    print ('NPV:         ', npv)
    print ()
    print ('confusion matrix')
    print (confusion)

    results_df.loc[len(results_df)] = [desc,num_samples, weights, f1_w, acc, auroc, ppv, sens, batch_size]

In [12]:
def get_label_cb(dataset, idx):
    return dataset[idx][2].item()


In [13]:
def encode_data(df, text_field):
    encoded_data_train = tokenizer.batch_encode_plus(
        df[df.data_type=='train'][text_field].values, 
        add_special_tokens=True, 
        return_attention_mask=True, 
        pad_to_max_length=True, 
        max_length=256, 
        return_tensors='pt'
        )

    encoded_data_val = tokenizer.batch_encode_plus(
        df[df.data_type=='val'][text_field].values, 
        add_special_tokens=True, 
        return_attention_mask=True, 
        pad_to_max_length=True, 
        max_length=256, 
        return_tensors='pt'
        )


    input_ids_train = encoded_data_train['input_ids']
    attention_masks_train = encoded_data_train['attention_mask']
    labels_train = torch.tensor(df[df.data_type=='train'].label.values)

    input_ids_val = encoded_data_val['input_ids']
    attention_masks_val = encoded_data_val['attention_mask']
    labels_val = torch.tensor(df[df.data_type=='val'].label.values)

    return input_ids_train, attention_masks_train, labels_train, input_ids_val, attention_masks_val, labels_val



def create_dataloaders(input_ids_train, attention_masks_train, labels_train, input_ids_val, attention_masks_val, labels_val, batch_size = 32):
    dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
    dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

    dataloader_train = DataLoader(dataset_train, 
                              #sampler=RandomSampler(dataset_train), 
                              sampler=ImbalancedDatasetSampler(dataset_train, callback_get_label = get_label_cb),
                              batch_size=batch_size)

    dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)
  
    return dataloader_train, dataloader_validation

def process_data(data):
    df = data.sample(num_samples)
    df.discharge.value_counts()
    label_dict = {'discharge':0, 'admit':1}; label_dict
    df['label'] = df.discharge.replace(label_dict)
    df['text'] = df['CleanSubjectiveNotes'].map(str) + ', ' + df['pmhx'].map(str)
    df = df[['discharge', 'label', 'text']]
    display(df.head())
    
    return df, label_dict


def split_data(df, test_size=0.1):
    X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=test_size, 
                                                  random_state=17, 
                                                  stratify=df.label.values)
    
    # this field is done for creating a balanced training portion of the dataset
    #pos_training_examples = [[x[0],x[1]] for x in list(zip(X_train, y_train)) if x[1] == 1]
    #neg_training_examples = [[x[0], x[1]] for x in list(zip(X_train, y_train)) if x[1] == 0]
    #neg_train_subset = random.sample(list(neg_training_examples), len(pos_training_examples))
    #new_train_set = pos_training_examples + neg_train_subset
    #X_train = np.array(new_train_set)[:,0]
    #y_train = np.array(new_train_set)[:,1]
    
    
    df['data_type'] = ['not_set']*df.shape[0]
    df.loc[X_train, 'data_type'] = 'train'
    df.loc[X_val, 'data_type'] = 'val'
    #df = df[df['data_type'] != 'not set']  #new 
    
    display(df.groupby(['discharge', 'label', 'data_type']).count())
    
    return X_train, X_val, y_train, y_val

In [14]:
class Lamb(Optimizer):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6,
                 weight_decay=0.01, adam=False):   # I changed wd default from 0
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay)
        self.adam = adam
        super(Lamb, self).__init__(params, defaults)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decay the first and second moment running average coefficient
                # m_t
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                # v_t
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)

                # Paper v3 does not use debiasing.
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                # Apply bias to lr to avoid broadcast.
                step_size = group['lr']  * math.sqrt(bias_correction2) / bias_correction1

                weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10)

                adam_step = exp_avg / exp_avg_sq.sqrt().add(group['eps'])
                if group['weight_decay'] != 0:
                    adam_step.add_(group['weight_decay'], p.data)

                adam_norm = adam_step.pow(2).sum().sqrt()
                if weight_norm == 0 or adam_norm == 0:
                    trust_ratio = 1
                else:
                    trust_ratio = weight_norm / adam_norm
                state['weight_norm'] = weight_norm
                state['adam_norm'] = adam_norm
                state['trust_ratio'] = trust_ratio
                if self.adam:
                    trust_ratio = 1

                p.data.add_(-step_size * trust_ratio, adam_step)

        return loss


In [15]:

ALPHA = 0.125
GAMMA = 2

class FocalLoss(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(FocalLoss, self).__init__()

    def forward(self, inputs, targets, alpha=ALPHA, gamma=GAMMA, smooth=1):
        
        #comment out if your model contains a sigmoid or equivalent activation layer
        inputs = torch.sigmoid(inputs)       
        
        #flatten label and prediction tensors
        inputs = inputs.view(-1)
        targets = targets.view(-1)
        
        #first compute binary cross-entropy 
        BCE = F.binary_cross_entropy(inputs, targets, reduction='mean')
        BCE_EXP = torch.exp(-BCE)
        focal_loss = alpha * (1-BCE_EXP)**gamma * BCE
                       
        return focal_loss
    
class FocalLoss2(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(FocalLoss2, self).__init__()

    def forward(self, inputs, targets, alpha=ALPHA, gamma=GAMMA, smooth=1):
        
        #comment out if your model contains a sigmoid or equivalent activation layer
        #inputs = torch.sigmoid(inputs)       
        
        #flatten label and prediction tensors
        #inputs = inputs.view(-1)
        #targets = targets.view(-1)
        
        #first compute binary cross-entropy 
        BCE = F.binary_cross_entropy_with_logits(inputs, targets, reduction='mean')
        BCE_EXP = torch.exp(-BCE)
        focal_loss = alpha * (1-BCE_EXP)**gamma * BCE
                       
        return focal_loss
    
class JJFocalLoss(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(JJFocalLoss, self).__init__()

    def forward(self, inputs, targets, alpha=0.125, gamma=2, smooth=1):
        
        ce_loss = torch.nn.functional.cross_entropy(inputs, targets, reduction='none') # important to add reduction='none' to keep per-batch-item loss
        pt = torch.exp(-ce_loss)
        focal_loss = (alpha * (1-pt)**gamma * ce_loss).mean()
                       
        return focal_loss


# import the dataframe

In [16]:
#data = pd.read_csv(data_dir + 'complete_clean_combo_data.csv', index_col = None, low_memory = False)
data = pd.read_csv('/floyd/home/data/nlp_combo_data.csv', index_col = 0)

In [17]:
data.head()

Unnamed: 0,CleanSubjectiveNotes,pmhx,num_comorbids,outcome,target,service,target2,discharge,target3,dispo,target4,ICUvsother,target5,site
0,patient states that she feels shaky patient de...,no significant medical history,0,discharge,1,discharge,1,discharge,1,,,,0,BCH
1,patient states no pain,no significant medical history,0,discharge,1,discharge,1,discharge,1,,,,0,BCH
2,denied chest pain,no significant medical history,0,discharge,1,discharge,1,discharge,1,,,,0,BCH
3,denies pain eating emergency department lunch ...,"dialysis haemo, type one diabetes, chronic ren...",5,discharge,1,discharge,1,discharge,1,,,,0,BCH
4,patient complains of pain on the left wrist an...,fibromyalgia,1,discharge,1,discharge,1,discharge,1,,,,0,BCH


From prior experimentation it is clear that the only good way to do this training is with GPU2 and mixed precision

In [19]:
weights = torch.tensor([1,9], dtype = torch.half)
num_samples = 25000
desc = 'balanced sampler, gpu2'
batch_size = 32

### Exploratory Data Analysis and Preprocessing

In [20]:
df, label_dict = process_data(data)

Unnamed: 0,discharge,label,text
92879,discharge,0,patient states complains of dizziness since 6a...
4038,discharge,0,patient recently returned from jamaica on marc...
44899,discharge,0,as per patient with pain to testicles x 1 week...
59030,discharge,0,patient has had chest pain since last night an...
90388,discharge,0,complains of testicular pain since yesterday a...


### Training/Validation Split

In [22]:
X_train, X_val, y_train, y_val = split_data(df, test_size = 0.1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
discharge,label,data_type,Unnamed: 3_level_1
admit,1,train,453
admit,1,val,50
discharge,0,train,4047
discharge,0,val,450


### Loading Tokenizer and Encoding our Data

In [23]:
input_ids_train, attention_masks_train, labels_train, input_ids_val, attention_masks_val, labels_val = encode_data(df, 'text')

In [24]:
dataloader_train, dataloader_validation = create_dataloaders(input_ids_train, attention_masks_train, labels_train, input_ids_val, attention_masks_val, labels_val,batch_size)

In [26]:
model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

#this is needed to optimize the focal loss function
#model.classifier.bias.data = torch.tensor([-2.,-2.])

model.to(device);

### model training 2

In [27]:
weights = weights.to(device)
loss_func = nn.CrossEntropyLoss(weight = weights)
train_model_defined_loss(model, dataloader_train, dataloader_validation, lr = 1e-3, epochs = 3, weight_decay = 0.01)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=282.0, style=ProgressStyle(description_widt…

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)



Epoch 1
Training loss: 0.7326077091778423
Validation loss: 0.5665170922875404
F1 Score (Weighted): 0.8526315789473684
Validation accuracy: 0.9


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=282.0, style=ProgressStyle(description_widt…


Epoch 2
Training loss: 0.7094726230658538
Validation loss: 0.6608710810542107
F1 Score (Weighted): 0.8526315789473684
Validation accuracy: 0.9


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=282.0, style=ProgressStyle(description_widt…


Epoch 3
Training loss: 0.7090417252364734
Validation loss: 0.726312555372715
F1 Score (Weighted): 0.018181818181818184
Validation accuracy: 0.1



In [28]:
get_metrics(dataloader_validation)

Metrics Report:
---------------
weighted f1:  0.018181818181818184
AUROC:        0.5281555555555555
accuracy:     0.1
precision:    [0.  0.1]
recall:       [0. 1.]
sensitivity:  0.1
specificity:  nan
PPV:          1.0
NPV:          0.0

confusion matrix
[[  0 450]
 [  0  50]]


  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
results_df.to_csv(data_dir + 'results.csv')

## Notes to self:

- first step - adamw optimizer, standard precision
- second step - adamw optimizer, mixed precision
- third step - adamw optimizer, label smoothing
- 4th step - adamw optimzer, mixup?
- 5th step - try lamb
- 6th step - try weighted loss
- 7th step - better cleaned data
- 8th step - agumentation of minor class