In [1]:
#  backbone_name = 'deberta-v3-base'
%env TOKENIZERS_PARALLELISM=false
import json
import logging
import math
import os
import random
import sys
from dataclasses import dataclass, field
from itertools import chain
from pathlib import Path
from typing import Optional
import numpy as np 
import pandas as pd 
import datasets
import tensorflow as tf
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import transformers
from transformers import (
    AutoConfig,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    TFAutoModelForMaskedLM
)
from transformers.utils import send_example_telemetry
from transformers.utils.versions import require_version
from tqdm.notebook import tqdm
from datasets import Dataset

from transformers.optimization_tf import AdamWeightDecay

env: TOKENIZERS_PARALLELISM=false


In [2]:
# Read the three NBME tables (features, patient notes, training annotations)
# in one pass; the tuple order matches the unpacking order on the left.
features_df, patient_notes_df, train_df = map(
    pd.read_csv,
    (
        "/kaggle/input/bighandsome/NBME/features.csv",
        "/kaggle/input/bighandsome/NBME/patient_notes.csv",
        "/kaggle/input/bighandsome/NBME/train.csv",
    ),
)

In [3]:
@dataclass
class config:
    """Hyper-parameters for the masked-language-model fine-tuning run.

    The notebook reads these values as class attributes (``config.model``,
    ``config.epochs``, ...) without instantiating the class, and assigns
    ``config.max_lenght`` once the token lengths have been measured.
    """

    # Hugging Face checkpoint used for both the tokenizer and the model.
    model: str = field(default="microsoft/deberta-v3-base")
    # Batch size passed to the training dataset.
    train_batch_size: int = field(default=4)
    # Batch size passed to the evaluation dataset.
    test_batch_size: int = field(default=8)
    # NOTE(review): not referenced by any visible cell -- confirm it is still needed.
    drop_out: float = field(default=0.4)
    # Padded sequence length; None until it is computed from the data.
    # (The misspelled name is kept because other cells reference it.)
    max_lenght: Optional[int] = field(default=None)
    # Masking probability handed to the MLM data collator (a fraction, not a count).
    prob_tokens: float = field(default=0.3)
    # Learning rate handed to the optimizer.
    learning_rate: float = field(default=1e-5)
    # Number of training epochs.
    epochs: int = field(default=5)

    
    

In [4]:
# Load the tokenizer for the checkpoint named in config.model
# (default "microsoft/deberta-v3-base" -- the base checkpoint, not the large one).
tokenizer = AutoTokenizer.from_pretrained(config.model)
# Load the same checkpoint as a TensorFlow masked-language model.
model = TFAutoModelForMaskedLM.from_pretrained(config.model)


Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading tf_model.h5:   0%|          | 0.00/736M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFDebertaV2ForMaskedLM.

Some layers of TFDebertaV2ForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['cls']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Join each training annotation with its feature description and with the
# patient note it refers to; inner joins keep only rows present in all tables.
df = (
    train_df
    .merge(features_df, on=['feature_num', 'case_num'], how='inner')
    .merge(patient_notes_df, on=['pn_num', 'case_num'], how='inner')
)
df.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693'],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,['chest pressure'],['203 217'],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258'],Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [6]:
# Only the two text columns are needed for masked-LM training.
columnslist_nondrop = ['feature_text', 'pn_history']

# Every other column, in its original order, gets removed from the frame.
columns_to_drop = df.columns[~df.columns.isin(columnslist_nondrop)].tolist()

df.drop(columns=columns_to_drop, inplace=True)

In [7]:
def _token_counts(column):
    """Return the token count (special tokens excluded) of every row in ``df[column]``.

    Missing values are treated as empty strings. A progress bar is shown
    while the column is being tokenized.
    """
    progress = tqdm(df[column].fillna("").values, total=len(df))
    return [
        len(tokenizer.encode(text, add_special_tokens=False))
        for text in progress
    ]


pn_history_lengths = _token_counts('pn_history')
print(f'pn_history max(lengths): {max(pn_history_lengths)}')

features_lengths = _token_counts('feature_text')
print(f'feature_text  max(lengths): {max(features_lengths)}')

# Longest note + longest feature text + the three special tokens of a pair.
max_lenght = max(pn_history_lengths) + max(features_lengths) + 3  # cls & sep & sep
print(f"max_len: {max_lenght}")

# Publish the result on the shared config so the tokenization cell can read it.
config.max_lenght = max_lenght

  0%|          | 0/14300 [00:00<?, ?it/s]

pn_history max(lengths): 309


  0%|          | 0/14300 [00:00<?, ?it/s]

feature_text  max(lengths): 28
max_len: 340


In [8]:
# Wrap the pandas frame in a Hugging Face `Dataset`; the next cell tokenizes it via `dataset.map`.
dataset=Dataset.from_pandas(df)

In [9]:
def tokenize_function(data):
    """Tokenize a batch of (feature_text, pn_history) pairs for masked-LM training.

    Args:
        data: a batch from the dataset exposing ``'feature_text'`` and
            ``'pn_history'`` entries.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair input,
        padded to ``config.max_lenght`` and including the special-tokens mask.
    """
    return tokenizer(
        data['feature_text'],
        data['pn_history'],
        padding='max_length',
        # Name the strategy through `truncation` itself. With `truncation=True`
        # the tokenizer uses "longest_first" and ignores the legacy
        # `truncation_strategy` keyword, so the feature text could be clipped.
        # "only_second" guarantees that only the patient note is ever shortened.
        truncation='only_second',
        max_length=config.max_lenght,
        return_special_tokens_mask=True,
    )

    
# Apply the tokenizer to the whole dataset in batches; the returned fields are
# added as new columns next to the original text columns.
tokenized_datasets = dataset.map(
    function=tokenize_function,
    desc="Running tokenizer on dataset line_by_line",
    batched=True,
)

Running tokenizer on dataset line_by_line:   0%|          | 0/15 [00:00<?, ?ba/s]

In [10]:
# Hold out 20% of the rows for validation. The split is shuffled, so a fixed
# seed is passed to keep the train/validation membership (and therefore the
# reported losses) reproducible across Restart & Run All.
dataset = tokenized_datasets.train_test_split(test_size=0.2, seed=42)

In [11]:
# Pull the two partitions out of the split for the input pipelines below.
train_dataset, test_dataset = dataset["train"], dataset["test"]

In [12]:
# Default tf.data options object; it is attached to both datasets when they are built.
options = tf.data.Options()

# Collator for the MLM objective: masks tokens with probability
# config.prob_tokens and returns NumPy arrays.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    return_tensors="np",
    mlm_probability=config.prob_tokens,
)

In [13]:
# Training pipeline: shuffled batches of config.train_batch_size, collated
# (and therefore masked) by data_collator.
tf_train_dataset = model.prepare_tf_dataset(
            train_dataset,
            shuffle=True,
            batch_size=config.train_batch_size,
            collate_fn=data_collator,
        ).with_options(options)

# Validation pipeline: unshuffled batches of config.test_batch_size using the
# same masking collator.
# NOTE(review): drop_remainder=True presumably discards a final partial batch,
# so a few held-out rows may never be scored -- confirm this is intended.
# NOTE(review): because the collator masks at collate time, the validation
# masks are presumably re-sampled on each pass -- confirm if a stable
# validation loss is required.
tf_eval_dataset = model.prepare_tf_dataset(
            test_dataset,
            shuffle=False,
            batch_size=config.test_batch_size,
            collate_fn=data_collator,
            drop_remainder=True,
        ).with_options(options)

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [14]:
# Compile with AdamWeightDecay at config.learning_rate. No loss is passed, so
# (per the message logged by compile) the model's internal loss is used.
# NOTE(review): no weight-decay argument is given, so the optimizer's default
# applies -- confirm that matches the intended regularisation.
model.compile(optimizer=AdamWeightDecay(learning_rate=config.learning_rate))

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [15]:
# Train for config.epochs epochs, evaluating on tf_eval_dataset after each
# one. `history` is kept because the next cell reads the final "loss" and
# "val_loss" entries from history.history.
history = model.fit(
            tf_train_dataset,
            validation_data=tf_eval_dataset,
            epochs=int(config.epochs)
        )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [16]:
def _perplexity(loss):
    """Return ``exp(loss)``, or infinity when the exponential overflows."""
    try:
        return math.exp(loss)
    except OverflowError:
        return math.inf


# Final-epoch training metrics.
train_loss = history.history["loss"][-1]
train_perplexity = _perplexity(train_loss)

print(f"  Final train loss: {train_loss:.3f}")
print(f"  Final train perplexity: {train_perplexity:.3f}")

# Final-epoch validation metrics.
validation_loss = history.history["val_loss"][-1]
validation_perplexity = _perplexity(validation_loss)

print(f"  Final validation loss: {validation_loss:.3f}")
print(f"  Final validation perplexity: {validation_perplexity:.3f}")

  Final train loss: 1.957
  Final train perplexity: 7.076
  Final validation loss: 1.658
  Final validation perplexity: 5.251


In [17]:
# Persist the fine-tuned weights and config.
model.save_pretrained('nbme_mlm_model') # saving the model
# Save the tokenizer files into the same directory so the checkpoint can be
# reloaded from 'nbme_mlm_model' alone, without fetching the original
# tokenizer again.
tokenizer.save_pretrained('nbme_mlm_model')