In [1]:
import numpy as np
import pandas as pd
import transformers
import tokenizers
import torch.nn as nn
import torch 
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.auto import tqdm 
from ast import literal_eval
import time
from tqdm.notebook import tqdm
from transformers import AutoModel , AutoTokenizer
from sklearn.model_selection import train_test_split

In [2]:
class config:
    MAX_LEN = 312
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 16 
    EPOCHS = 2
    model = "Tsubasaz/clinical-bert-base-128" # pretrained model on clinical notes 
    MODEL_PATH = "model.bin"
    TOKENIZER = AutoTokenizer.from_pretrained(model)
    DROPOUT = 0.2
    MAX_GRAD_NORM = 1.0
    LEARNING_RATE = 1e-5

Downloading:   0%|          | 0.00/321 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [3]:
BASE_PATH = "../input/nbme-score-clinical-patient-notes/"
features_df = pd.read_csv(BASE_PATH + "features.csv")
patient_notes_df = pd.read_csv(BASE_PATH + "patient_notes.csv")
train_df = pd.read_csv(BASE_PATH + "train.csv")
test_df = pd.read_csv(BASE_PATH + "test.csv")
submission_df = pd.read_csv(BASE_PATH + "sample_submission.csv")

In [4]:
df = pd.merge(train_df, features_df, on=['feature_num','case_num'], how='inner')
df =pd.merge(df, patient_notes_df, on=['pn_num','case_num'], how='inner')
df.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693'],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,['chest pressure'],['203 217'],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258'],Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [5]:
pd.set_option("display.max_info_columns", 200)

In [6]:
df.iloc[0,:].values

array(['00016_000', 0, 16, 0, "['dad with recent heart attcak']",
       "['696 724']",
       'Family-history-of-MI-OR-Family-history-of-myocardial-infarction',
       'HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of "heart beating/pounding out of my chest." 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1-3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. \r\nPMHx: none\r\nRx: uses friends adderrall\r\nFHx: mom with "thyroid disease," dad with recent heart attcak\r\nAll: none\r\nImmunizations: up to date\r\nSHx: Freshmen in college. Endorses 3-4 drinks 3 nights

In [7]:
# always use literal_eval instead of eval https://nedbatchelder.com/blog/201206/eval_really_is_dangerous.html
df["annotation"] = [literal_eval(x) for x in df["annotation"]] 
df["location"] = [literal_eval(x) for x in df["location"]]

In [8]:

pn_history_lengths = []
tk0 = tqdm(df['pn_history'].fillna("").values, total=len(df))
for text in tk0:
    length = config.TOKENIZER.encode(text,add_special_tokens=False)
        
    pn_history_lengths.append(len(length))
print(f'pn_history max(lengths): {max(pn_history_lengths)}')


features_lengths=[]
tk1 = tqdm(df['feature_text'].fillna("").values, total=len(df))
for text in tk1:
    length = config.TOKENIZER.encode(text,add_special_tokens=False)
    features_lengths.append(len(length))
print(f'feature_text  max(lengths): {max(features_lengths)}')

max_lenght= max(pn_history_lengths) + max(features_lengths) + 3 # cls & sep & sep
print(f"max_len: {max_lenght}")

  0%|          | 0/14300 [00:00<?, ?it/s]

pn_history max(lengths): 280


  0%|          | 0/14300 [00:00<?, ?it/s]

feature_text  max(lengths): 29
max_len: 312


In [9]:
def loc_list_to_ints(loc_list):
    to_return = []
    for loc_str in loc_list:  
        loc_strs = loc_str.split(";")
        for loc in loc_strs:
            start, end = loc.split()
            to_return.append((int(start), int(end)))
    return to_return


In [10]:
def classLabeling(pn_history, feature_text, annotation, location, tokenizer, max_len):    ##X , Y, selected_text  
    

    location_list = loc_list_to_ints(location)   # convert the locations into a list 

    char_targets = [0] * len(pn_history)  # creation of character taragert you can reason below 

    for loc,anno in zip(location_list ,annotation):        
        
        len_st = int(loc[1]) - int(loc[0])
        idx0 = None
        idx1 = None        
        for ind in (i for i, e in enumerate(pn_history) if (e == anno[0] and i == int(loc[0]))): # Only if the annotation start with character we are interested and look and character annotation postion match go inside the loop
        
            if pn_history[ind: ind+len_st] == anno.strip():

                idx0 = ind
                idx1 = ind + len_st - 1
                if idx0 != None and idx1 != None:
                    for ct in range(idx0, idx1 + 1): # make character targets as "1" for them 
                        char_targets[ct] = 1 
                break
    # Tokenize the data and here we are returing the offstes which we gone use as labels which you can find below 
    tokenized_input = config.TOKENIZER.encode_plus(feature_text,pn_history,return_attention_mask=True,
                                                  return_offsets_mapping=True,return_token_type_ids=True)
    
    input_ids = tokenized_input['input_ids']
    mask = tokenized_input['attention_mask']
    token_type_ids = tokenized_input['token_type_ids']
    offsets = tokenized_input['offset_mapping']
    
    target_idx = []
    for j, (offset1, offset2) in enumerate(offsets): # look for offsets 
        if sum(char_targets[offset1: offset2]) > 0: # if the lenght of the char_target for particualr target is greater than 0 then added one at that offsets 
            target_idx.append(j)
            
    #padding
    padding_length = config.MAX_LEN - len(input_ids) # Since we used 312 as max_lenght incase if we less lenght we need to pad the zeros 
    if padding_length > 0:
        input_ids = input_ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        offsets = offsets + ([(0, 0)] * padding_length)
       
    #creating label
    ignore_idxes = np.where(np.array(token_type_ids) != 1)[0] # Bascially we use token type ids Segment token indices to indicate first and second portions of the input

    label = np.zeros(len(offsets))
    label[ignore_idxes] = 0.0 # creating a labels zero for not interested to look 
    label[target_idx] = 1.0  # label for which we are interested to look
    return {
    'ids': input_ids,
    'mask': mask,
    'token_type_ids': token_type_ids,
    'labels': label,
    'offsets': offsets
}

In [11]:
class NBMEDataset:
    
    def __init__(self,pn_history,feature_text, annotation, location):
        self.pn_history=pn_history
        self.feature_text=feature_text
        self.annotation=annotation
        self.location=location
    def __len__(self):
        return len(self.pn_history+self.feature_text)
    
    def __getitem__(self,item):
   
        output=classLabeling(self.pn_history[item],self.feature_text[item],self.annotation[item],self.location[item],config.TOKENIZER,config.MAX_LEN)
        
        return {
            'input_ids':torch.tensor(output['ids']),
             'mask':torch.tensor(output['mask'],dtype=torch.long),
            'token_type_ids':torch.tensor(output['token_type_ids'],dtype=torch.long),
            'labels':torch.tensor(output['labels'],dtype=torch.float),
            'offsets':torch.tensor(output['offsets'],dtype=torch.long)   
            
        }
        
        

In [12]:
class NBMEModel(nn.Module):
    def __init__(self):
        super(NBMEModel,self).__init__()
        self.bert=AutoModel.from_pretrained(config.model)
        self.dropout=nn.Dropout(0.4)
        self.linear=nn.Linear(768,1)
        self.parameter=nn.Parameter(torch.ones(1))
    def forward(self,ids, mask,token_ids):
        sequence_output=self.bert(ids,attention_mask=mask, token_type_ids=token_ids)[0] # we gone take last hidden state no the model 
        output=self.dropout(sequence_output)
        logits=self.linear(output)
        logits = logits.squeeze(-1) 
        return logits

In [13]:
def loss_fn(logits, labels):
    loss_fct = torch.nn.BCEWithLogitsLoss(reduction = "mean")
    loss = loss_fct(logits,labels.float())
    return loss

In [14]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
model=NBMEModel()

Downloading:   0%|          | 0.00/712 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Some weights of the model checkpoint at Tsubasaz/clinical-bert-base-128 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at Tsubasaz/clinical-bert-base-128 and are newly initialized: ['bert.pooler.dense.bias', 'bert.p

In [16]:
model.to(DEVICE)

NBMEModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

In [17]:
def train_fn(dataloader,model,optimizer,scheduler=None):
    model.train()
    
    train_loss=0


    tqd=tqdm(dataloader,total=len(dataloader))
    
    for batch , data in enumerate(tqd):

      
        ids=data['input_ids']
        mask=data['mask']
        token_ids=data['token_type_ids']
        label=data['labels']
        offsets=data['offsets']
        
        ids=ids.to(DEVICE,dtype=torch.long)
        mask=mask.to(DEVICE,dtype=torch.long)
        token_ids=token_ids.to(DEVICE,dtype=torch.long)
        label=label.to(DEVICE,dtype=torch.long)
        
        model.zero_grad()
        
        output=model(ids=ids,mask=mask,token_ids=token_ids)
        
        loss=loss_fn(output,label)

        train_loss=+loss.item()
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # cliping to get rid of exploding gradient if any 
        
        optimizer.step()
        
        scheduler.step()
        
    return train_loss/len(dataloader)
        
        
           

In [18]:
@torch.no_grad()
def eval_fn(dataloader, model):
    model.eval()
    
        
    eval_loss=0
    
    tk = tqdm(dataloader, total=len(dataloader)) 
    
    for batch, data in enumerate(tk):
        ids = data['input_ids']
        token_type_ids = data["token_type_ids"]
        mask = data["mask"]
        labels = data['labels']
        offsets = data["offsets"]
        ids = ids.to(DEVICE, dtype=torch.long)
        token_type_ids = token_type_ids.to(DEVICE, dtype=torch.long)
        mask = mask.to(DEVICE, dtype=torch.long)
        labels = labels.to(DEVICE, dtype=torch.float64)
      

        logits = model(ids=ids, mask=mask, token_ids=token_type_ids ) #last_hidden_state
            
        loss = loss_fn(logits, labels)
 
        eval_loss=+loss.item()
         
    
        
    return eval_loss/len(dataloader)
      


In [19]:
def run():
    
    train_loss_data, valid_loss_data = [], []
    
  
    df_train , df_valid= train_test_split(df,test_size=0.3, random_state=42)
   
    df_train = df_train.reset_index(drop=True) 
    df_valid = df_valid.reset_index(drop=True)
    
    train_dataset = NBMEDataset(
        pn_history=df_train.pn_history.values,
        feature_text=df_train.feature_text.values,
        annotation=df_train.annotation.values,
        location=df_train.location.values
        
    )
    
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE
 
    )
    
   

    valid_dataset = NBMEDataset(
        pn_history=df_valid.pn_history.values,
        feature_text=df_valid.feature_text.values,
        annotation=df_valid.annotation.values,
        location=df_valid.location.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE
    )


    
    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=config.LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=0, 
        num_training_steps=num_train_steps
    )


    for i in range(config.EPOCHS):
        print("Epoch: {}/{}".format(i + 1, config.EPOCHS))
    
      

        train_loss = train_fn(train_data_loader, model, optimizer, scheduler=scheduler)
        
        eval_loss =  eval_fn(valid_data_loader , model )
       
        print(f"Train loss: {train_loss} and the valida loss {eval_loss} after the epochs : {i+1}")




In [20]:
run()

Epoch: 1/2


  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/269 [00:00<?, ?it/s]

Train loss: 7.189687091512041e-05 and the valida loss 9.498140708886115e-05 after the epochs : 1
Epoch: 2/2


  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/269 [00:00<?, ?it/s]

Train loss: 6.756610787524202e-05 and the valida loss 9.507552260138288e-05 after the epochs : 2


In [21]:
test_df = pd.read_csv(BASE_PATH + "test.csv")

In [22]:
df_test = pd.merge(test_df, features_df, on=['feature_num','case_num'], how='inner')
df_test =pd.merge(df_test, patient_notes_df, on=['pn_num','case_num'], how='inner')


In [23]:
df_test

Unnamed: 0,id,case_num,pn_num,feature_num,feature_text,pn_history
0,00016_000,0,16,0,Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,Lightheaded,HPI: 17yo M presents with palpitations. Patien...


> **Inference**

In [24]:
# Inferece on one of the data point request 
@torch.no_grad()
def inference_fn(df_test, model, device):
    model.eval()
    model.to(device)
    

    tokenized_input = config.TOKENIZER.encode_plus(df_test[0],df_test[1],return_attention_mask=True,padding='max_length',truncation=True,
                                                return_offsets_mapping=True,return_token_type_ids=True,max_length=config.MAX_LEN)
        
    input_ids = torch.tensor(tokenized_input['input_ids'],dtype=torch.long).unsqueeze(0)
    mask = torch.tensor(tokenized_input['attention_mask'],dtype=torch.long).unsqueeze(0)
    token_type_ids = torch.tensor(tokenized_input['token_type_ids'],dtype=torch.long).unsqueeze(0)
    offsets = tokenized_input['offset_mapping']
        
        
    input_ids= input_ids.to(DEVICE)
    mask=mask.to(DEVICE)
    token_type_ids = token_type_ids.to(DEVICE)
    
    y_preds= model(input_ids,mask,token_type_ids)
    
    predictions= y_preds.sigmoid().to('cpu').numpy()
    
        
    return predictions, offsets
        

In [25]:
data,offsets=inference_fn(df_test.iloc[1,4:6],model,DEVICE)

In [26]:
index_ofstring = np.where(data[0] >= 0.5)

In [27]:
df_test.iloc[1,4:6].values

array(['Family-history-of-thyroid-disorder',
       'HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of "heart beating/pounding out of my chest." 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1-3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. \r\nPMHx: none\r\nRx: uses friends adderrall\r\nFHx: mom with "thyroid disease," dad with recent heart attcak\r\nAll: none\r\nImmunizations: up to date\r\nSHx: Freshmen in college. Endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. Sexually active with girlfriend x 1 year, uses cond

In [28]:
# lets made it offsets 

begning_of_string = offsets[187][0]
end_of_string = offsets[188][1]

In [29]:
# Model is doing good its able to predict properly 
df_test.iloc[1,5][begning_of_string:end_of_string]

'thyroid disease'