# Sentiment Analysis with Deep Learning using BERT

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


### install packages

In [3]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674kB)
[K     |████████████████████████████████| 675kB 2.8MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 14.9MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 12.4MB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K   

In [4]:
import torch
import random

import pandas as pd
import numpy as np
from collections import Counter

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
import sklearn.metrics
from sklearn.metrics import f1_score

from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizer

from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


print(f'random seeds initialized, wordking on {device}')

random seeds initialized, wordking on cuda


In [6]:
#getting the clinical biobert tokenizer
tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [7]:
#loads result dataframe or initializes an empty one
try:
  results_df = pd.read_csv('/content/drive/My Drive/ML_data/results.csv', index_col = 0)
except:
  results_df = pd.DataFrame(columns = ['experiment description', 'num samples', 'weighting', 'f1w', 'acc', 'auroc', 'ppv', 'sens'])

results_df

Unnamed: 0,experiment description,num samples,weighting,f1w,acc,auroc,ppv,sens
0,tiny test dataset,1000,,0.831008,0.885,0.558339,0.0,
1,tiny test dataset,1000,,0.831008,0.885,0.558339,0.0,
2,tiny test dataset,1000,,0.831008,0.885,0.558339,0.0,
3,"larger test dataset, only subj notes",10000,,0.854283,0.8995,0.780036,0.01,0.4
4,"larger test dataset, only subj notes",50000,,0.876438,0.8969,0.795039,0.178138,0.44557
5,"larger test dataset, only subj notes",100000,,0.87697,0.901,0.807763,0.155172,0.493548
6,"larger test dataset, only subj notes",100000,,0.881214,0.8954,0.798594,0.237219,0.43609


### define helper functions

In [8]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

In [9]:
def accuracy_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return sklearn.metrics.accuracy_score(labels_flat, preds_flat)

In [10]:
def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [11]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [1]:
def train_model(model, dataloader_train, dataloader_valid, save_name, lr = 1e-5, eps = 1e-8, epochs = 3, weights = None):
  
  model.to(device)
  loss_fn = torch.nn.CrossEntropyLoss(weight = weights)
  optimizer = AdamW(model.parameters(), lr=lr, eps=eps)
  scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)
  
  for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'/content/models/{save_name}_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    val_acc = accuracy_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Validation accuracy: {val_acc}')


In [12]:
def get_metrics(predictions, true_vals, dataloader):

  _, predictions, true_vals = evaluate(dataloader)

  preds = [np.argmax(pred) for pred in predictions]
  preds_flat = np.argmax(preds).flatten()
  true_vals = true_vals.flatten()

  f1_w = sklearn.metrics.f1_score(true_vals, preds, average='weighted')
  f1 = sklearn.metrics.f1_score(true_vals, preds, average=None)
  acc = sklearn.metrics.accuracy_score(true_vals, preds)
  prec = sklearn.metrics.precision_score(true_vals,preds, average=None) 
  rec = sklearn.metrics.recall_score(true_vals,preds, average=None)
  auroc = sklearn.metrics.roc_auc_score(true_vals,predictions[:,1], average=None)
  confusion = sklearn.metrics.confusion_matrix(true_vals, preds)

#labels flipped for some reason so I had to chance the confusion interpretation    
#tp, fn, fp, tn = confusion[0,0], confusion[0,1], confusion[1,0], confusion[1,1]
  tn, fn, fp, tp = confusion[0,0], confusion[0,1], confusion[1,0], confusion[1,1]


  sens = tp/(tp + fn)
  spec = tn/(tn + fp)
  ppv = tp/(tp + fp)
  npv = tn/(tn + fn)

  print ('Metrics Report:')
  print ('---------------')
  print ('weighted f1: ', f1_w)
  print ('AUROC:       ',auroc)
  print ('accuracy:    ', acc)
  print ('precision:   ', prec)
  print ('recall:      ', rec)
  print ('sensitivity: ', sens)
  print ('specificity: ', spec)
  print ('PPV:         ', ppv)
  print ('NPV:         ', npv)
  print ()
  print ('confusion matrix')
  print (confusion)

  results_df.loc[len(results_df)] = [desc,num_samples, weights, f1_w, acc, auroc, ppv, sens]

In [13]:
def encode_data(text_field):
  encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'][text_field].values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
    )

  encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'][text_field].values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
    )


  input_ids_train = encoded_data_train['input_ids']
  attention_masks_train = encoded_data_train['attention_mask']
  labels_train = torch.tensor(df[df.data_type=='train'].label.values)

  input_ids_val = encoded_data_val['input_ids']
  attention_masks_val = encoded_data_val['attention_mask']
  labels_val = torch.tensor(df[df.data_type=='val'].label.values)

  return input_ids_train, attention_masks_train, labels_train, input_ids_val, attention_masks_val, labels_val

In [14]:
def create_dataloaders(batch_size = 32):
  dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
  dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

  dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

  dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)
  
  return dataloader_train, dataloader_validation

# import the master dataframe

In [15]:
data = pd.read_csv  ('/content/drive/My Drive/ML_data/complete_clean_combo_data.csv', index_col = None, low_memory = False)

# First pass - non processed subjective notes

In [None]:
weights = None
num_samples = 100000
desc = 'larger test dataset, only subj notes'

### Data importing and Preprocessing

In [None]:
df = data.sample(num_samples)

In [None]:
#using discharge column to get labels
df.discharge.value_counts()

discharge    90136
admit         9864
Name: discharge, dtype: int64

In [None]:
label_dict = {'admit':1, 'discharge':0}
df['label'] = df.discharge.replace(label_dict)

In [None]:
df = df[['CleanSubjectiveNotes', 'discharge', 'label']]

In [None]:
df.head()

Unnamed: 0,CleanSubjectiveNotes,discharge,label
78145,as per son's translation. patient got up from ...,discharge,0
138434,complains of lower back pain down to hip and r...,discharge,0
106143,patient brought by mother with shortness of br...,discharge,0
116801,patient complains of vomiting x2 days. denies ...,admit,1
133861,states has lower back pain for a week and lowe...,discharge,0


### creating train and valid dataloaders

In [None]:
X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=0.1, 
                                                  random_state=17, 
                                                  stratify=df.label.values)

In [None]:
df['data_type'] = ['not_set']*df.shape[0]

In [None]:
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [None]:
df.groupby(['discharge', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CleanSubjectiveNotes
discharge,label,data_type,Unnamed: 3_level_1
admit,1,train,8878
admit,1,val,986
discharge,0,train,81122
discharge,0,val,9014


In [None]:
#encoding using bert tokenizer
input_ids_train, attention_masks_train, labels_train, input_ids_val, attention_masks_val, labels_val = encode_data('CleanSubjectiveNotes')

In [None]:
dataloader_train, dataloader_validation = create_dataloaders()

### training

In [None]:
#instantiate bert model

model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)



In [None]:
train_model(model, dataloader_train, dataloader_validation)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=2813.0, style=ProgressStyle(description_wid…


Epoch 1
Training loss: 0.28150320104881194
Validation loss: 0.26354485255079907
F1 Score (Weighted): 0.8647319631009545


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=2813.0, style=ProgressStyle(description_wid…


Epoch 2
Training loss: 0.25216405331383934
Validation loss: 0.27502434269879195
F1 Score (Weighted): 0.8703839880229233


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=2813.0, style=ProgressStyle(description_wid…


Epoch 3
Training loss: 0.23051603078654112
Validation loss: 0.2743735836622433
F1 Score (Weighted): 0.8769695846402432



In [None]:
get_metrics(predictions, true_vals, dataloader_validation)

Metrics Report:
---------------
weighted f1:  0.8769695846402432
AUROC:        0.8077633125122922
accuracy:     0.901
precision:    [0.91403509 0.49354839]
recall:       [0.98258265 0.15517241]
sensitivity:  0.4935483870967742
specificity:  0.9140350877192982
PPV:          0.15517241379310345
NPV:          0.9825826492123364

confusion matrix
[[8857  157]
 [ 833  153]]


In [None]:
results_df

Unnamed: 0,experiment description,num samples,weighting,f1w,acc,auroc,ppv,sens
0,tiny test dataset,1000,,0.831008,0.885,0.558339,0.0,
1,tiny test dataset,1000,,0.831008,0.885,0.558339,0.0,
2,tiny test dataset,1000,,0.831008,0.885,0.558339,0.0,
3,"larger test dataset, only subj notes",10000,,0.854283,0.8995,0.780036,0.01,0.4
4,"larger test dataset, only subj notes",50000,,0.876438,0.8969,0.795039,0.178138,0.44557
5,"larger test dataset, only subj notes",100000,,0.87697,0.901,0.807763,0.155172,0.493548


so let's say that 50000 examples is reflective and now do some experimentation on that

# second pass - adding medical history

In [None]:
weights = None
num_samples = 50000
desc = 'subj notes and pmhx'

### Data loading and Preprocessing

In [None]:
df = data.sample(50000)

In [None]:
df.shape

(50000, 122)

In [None]:
df.discharge.value_counts()

discharge    45112
admit         4888
Name: discharge, dtype: int64

In [None]:
label_dict = {'discharge':0, 'admit':1}; label_dict

{'admit': 1, 'discharge': 0}

In [None]:
df['label'] = df.discharge.replace(label_dict)

In [None]:
df = df[['CleanSubjectiveNotes', 'pmhx','discharge', 'label']]

In [None]:
df['text'] = df['CleanSubjectiveNotes'].map(str) + ', ' + df['pmhx'].map(str)

In [None]:
df = df[['discharge', 'label', 'text']]
df.head()

Unnamed: 0,discharge,label,text
144699,discharge,0,"lower back pain, abdominal pain, epigastric pa..."
142547,discharge,0,"cold s+s x1mth, patient complains of sinus pai..."
142960,discharge,0,patient reported was elbowed to upper left sid...
116701,discharge,0,with complains of left sided lower abdominal p...
27916,discharge,0,per patient is 7-8 weeks pregnant. last menstr...


## Training/Validation Split

In [None]:
X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=0.2, 
                                                  random_state=17, 
                                                  stratify=df.label.values)

In [None]:
df['data_type'] = ['not_set']*df.shape[0]

In [None]:
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [None]:
df.groupby(['discharge', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
discharge,label,data_type,Unnamed: 3_level_1
admit,1,train,3910
admit,1,val,978
discharge,0,train,36090
discharge,0,val,9022


## Loading Tokenizer and Encoding our Data

In [None]:
input_ids_train, attention_masks_train, labels_train, input_ids_val, attention_masks_val, labels_val = encode_data('text')

In [None]:
dataloader_train, dataloader_validation = create_dataloaders(16)

In [None]:
model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)



In [None]:
train_model(model, dataloader_train, dataloader_validation)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=2500.0, style=ProgressStyle(description_wid…


Epoch 1
Training loss: 0.28256826400309804
Validation loss: 0.26461688067913053
F1 Score (Weighted): 0.8734886372237314


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=2500.0, style=ProgressStyle(description_wid…


Epoch 2
Training loss: 0.2501123862288892
Validation loss: 0.2682400894701481
F1 Score (Weighted): 0.879665592679998


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=2500.0, style=ProgressStyle(description_wid…


Epoch 3
Training loss: 0.22407042768597601
Validation loss: 0.2902081708073616
F1 Score (Weighted): 0.8812139862965125



In [None]:
get_metrics(predictions, true_vals, dataloader_validation)

Metrics Report:
---------------
weighted f1:  0.8812139862965125
AUROC:        0.7985938938627187
accuracy:     0.8954
precision:    [0.92120828 0.43609023]
recall:       [0.96674795 0.23721881]
sensitivity:  0.43609022556390975
specificity:  0.9212082805238698
PPV:          0.23721881390593047
NPV:          0.9667479494568831

confusion matrix
[[8722  300]
 [ 746  232]]


In [None]:
results_df

Unnamed: 0,experiment description,num samples,weighting,f1w,acc,auroc,ppv,sens
0,tiny test dataset,1000,,0.831008,0.885,0.558339,0.0,
1,tiny test dataset,1000,,0.831008,0.885,0.558339,0.0,
2,tiny test dataset,1000,,0.831008,0.885,0.558339,0.0,
3,"larger test dataset, only subj notes",10000,,0.854283,0.8995,0.780036,0.01,0.4
4,"larger test dataset, only subj notes",50000,,0.876438,0.8969,0.795039,0.178138,0.44557
5,"larger test dataset, only subj notes",100000,,0.87697,0.901,0.807763,0.155172,0.493548
6,"larger test dataset, only subj notes",100000,,0.881214,0.8954,0.798594,0.237219,0.43609


In [None]:
results_df.to_csv('/content/drive/My Drive/ML_data/results.csv')

# third pass - weighted loss function

In [16]:
weights = torch.tensor([1.,8.]).to(device)
num_samples = 50000
desc = 'weighted loss, both nlp fields'

### Exploratory Data Analysis and Preprocessing

In [17]:
df = data.sample(num_samples)

In [18]:
df.discharge.value_counts()

discharge    45025
admit         4975
Name: discharge, dtype: int64

In [19]:
label_dict = {'discharge':0, 'admit':1}; label_dict

{'admit': 1, 'discharge': 0}

In [20]:
df['label'] = df.discharge.replace(label_dict)

In [21]:
df['text'] = df['CleanSubjectiveNotes'].map(str) + ', ' + df['pmhx'].map(str)

In [22]:
df = df[['discharge', 'label', 'text']]
df.head()

Unnamed: 0,discharge,label,text
102395,discharge,0,rectal bleeding x 4 days. bowel movement x 5 t...
112374,discharge,0,unable to void x 4hours and with lower abdo pa...
111136,discharge,0,"right lower leg pain,calf pain and knee pain f..."
140645,discharge,0,states felt itchy in the left eye this afterno...
117690,discharge,0,mom states that patient started summer camp on...


## Training/Validation Split

In [23]:
X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=0.1, 
                                                  random_state=17, 
                                                  stratify=df.label.values)

In [24]:
df['data_type'] = ['not_set']*df.shape[0]

In [25]:
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [26]:
df.groupby(['discharge', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
discharge,label,data_type,Unnamed: 3_level_1
admit,1,train,4477
admit,1,val,498
discharge,0,train,40523
discharge,0,val,4502


## Loading Tokenizer and Encoding our Data

In [27]:
input_ids_train, attention_masks_train, labels_train, input_ids_val, attention_masks_val, labels_val = encode_data('text')

In [28]:
dataloader_train, dataloader_validation = create_dataloaders(16)

In [29]:
model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435778770.0, style=ProgressStyle(descri…




In [None]:
train_model(model, dataloader_train, dataloader_validation, 'weighted_1')

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=2813.0, style=ProgressStyle(description_wid…

# New Section

In [None]:
get_metrics(predictions, true_vals, dataloader_validation)

In [None]:
results_df

In [None]:
results_df.to_csv('/content/drive/My Drive/ML_data/results.csv')

# fourth pass new training function

In [None]:
data = pd.read_csv  ('/content/drive/My Drive/ML_data/complete_clean_combo_data.csv', index_col = None, low_memory = True)
#df.set_index('ID', inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
df = data.sample(50000)

In [None]:
df.discharge.value_counts()

discharge    45025
admit         4975
Name: discharge, dtype: int64

In [None]:
label_dict = {'discharge':0, 'admit':1}
df['label'] = df.discharge.replace(label_dict)
df['text'] = df['CleanSubjectiveNotes'].map(str) + ', ' + df['pmhx'].map(str)

In [None]:
df = df[['discharge', 'label', 'text']]
df.head()

Unnamed: 0,discharge,label,text
102395,discharge,0,rectal bleeding x 4 days. bowel movement x 5 t...
112374,discharge,0,unable to void x 4hours and with lower abdo pa...
111136,discharge,0,"right lower leg pain,calf pain and knee pain f..."
140645,discharge,0,states felt itchy in the left eye this afterno...
117690,discharge,0,mom states that patient started summer camp on...


## Training/Validation Split

In [None]:
X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=0.2, 
                                                  random_state=17, 
                                                  stratify=df.label.values)

In [None]:
df['data_type'] = ['not_set']*df.shape[0]


In [None]:
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [None]:
df.groupby(['discharge', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
discharge,label,data_type,Unnamed: 3_level_1
admit,1,train,66
admit,1,val,17
discharge,0,train,734
discharge,0,val,183


## Loading Tokenizer and Encoding our Data

In [None]:
tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [None]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].text.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].text.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

In [None]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [None]:
len(dataset_train)

800

In [None]:
len(dataset_val)

200

## Setting up BERT Pretrained Model

In [None]:
model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)



## Creating Data Loaders

In [None]:
batch_size = 16

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

In [None]:
train_model(model, dataloader_train, dataloader_validation, weights = torch.tensor([1.,9.]).to(device))

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=1250.0, style=ProgressStyle(description_wid…




KeyboardInterrupt: ignored

# code below is for reloading models to use for inference

In [None]:
model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device);

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
model.load_state_dict(torch.load('Models/<<INSERT MODEL NAME HERE>>.model', map_location=torch.device('cpu')))

<All keys matched successfully>

In [None]:
_, predictions, true_vals = evaluate(dataloader_validation)

In [None]:
accuracy_per_class(predictions, true_vals)

Class: happy
Accuracy: 163/171

Class: not-relevant
Accuracy: 20/32

Class: angry
Accuracy: 7/9

Class: disgust
Accuracy: 0/1

Class: sad
Accuracy: 4/5

Class: surprise
Accuracy: 2/5

