In [2]:
# pytorch library 
import torch
# hugging face library to work with hugging face datasets
from datasets import load_dataset
# hugging face library to use models
# Data Collators are objects that form a batch by using a list of dataset elements as input
# https://huggingface.co/docs/transformers/main_classes/data_collator#transformers.DataCollatorForLanguageModeling
# Data Collators for Language Modeling --> Inputs are dynamically padded to the maximum length of the batch (if they are not the same size)
# Collator: person or machine that collate --> collect and combine
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling
from huggingface_hub import HfApi

In [3]:
# To check if accelerator is active or not
if torch.cuda.device_count() == 0:
    print("No GPU Available!")
else:
    for i in range(torch.cuda.device_count()):
        print(torch.cuda.get_device_properties(i))

_CudaDeviceProperties(name='Tesla T4', major=7, minor=5, total_memory=15095MB, multi_processor_count=40, uuid=b09e3ecd-0795-8a1c-3e68-1c26213234df, L2_cache_size=4MB)
_CudaDeviceProperties(name='Tesla T4', major=7, minor=5, total_memory=15095MB, multi_processor_count=40, uuid=baf7a457-6cd3-6e09-1e79-330a69c64f25, L2_cache_size=4MB)


In [4]:
import platform, socket, re, uuid, json, psutil, logging

def getSystemInfo():
    try:
        info = {}
        info["platform"]=platform.system()
        info["platform-release"]=platform.release()
        info["platform-version"]=platform.version()
        info["architecture"]=platform.architecture()
        info["architecture"]=platform.architecture()
        info["hostname"]=socket.gethostname()
        info["ip-address"]=socket.gethostbyname(socket.gethostname())
        info["mac-address"]=":".join(re.findall('..', '%012x' % uuid.getnode()))
        info["processor"]=platform.processor()
        info["ram"]=str(round(psutil.virtual_memory().total / (1024.0**3))) + " GB"
        return json.dumps(info)
    except Exception as e:
        logging.exception(e)

from pprint import pprint
pprint(getSystemInfo())

('{"platform": "Linux", "platform-release": "6.6.56+", "platform-version": "#1 '
 'SMP PREEMPT_DYNAMIC Sun Nov 10 10:07:59 UTC 2024", "architecture": ["64bit", '
 '"ELF"], "hostname": "d153c34cfd76", "ip-address": "172.19.2.2", '
 '"mac-address": "02:42:ac:13:02:02", "processor": "x86_64", "ram": "31 GB"}')


In [5]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HuggingFace_API")

In [6]:
# # from datasets import list_datasets does not exist
# import datasets
# # if `datasets` points to the right `datasets` module, this should print the location of the module
# print(datasets.__file__)
# # if `datasets` points to a bad `datasets` module, this should print the location of the folder named "datasets"
# print(datasets.__path__)
# from huggingface_hub import list_datasets
# # it is a generator type you need to loop within it to see the result
# print(list_datasets())
# for i in list_datasets():
#     print(i)

In [None]:
# check if the device have access to gpu
device = "cuda" if torch.cuda.is_available() else "cpu"
trainset_range = list(range(18000, 58000))
# Hugging Face Dataset: https://huggingface.co/datasets/openlifescienceai/medmcqa
medmcqa_dataset_path = "openlifescienceai/medmcqa"
# Hugging Face Model: https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT
base_bert_path = "emilyalsentzer/Bio_ClinicalBERT"
# File location to save the checkpoints
checkpoint_file = "BioClinicalBert-MLM-Finetuned-40k-25epoch-exp-25epoch-questions.pth"
# Hugging Face Personal Repo
repo_id = "MMK79/Medical-RAG"
# push_model_to_huggingface = False
# For the first run
# push_model_to_huggingface = True
push_model_to_huggingface = False

num_epochs = 25
batch_size = 64
# learning rate
lr = 2e-4

In [8]:
# Prepare the input for the models
# Model don't understand text, you need to convert it to number which machine understand
tokenizer = BertTokenizer.from_pretrained(base_bert_path)
# Bert for Mask task: predict the masked word in input
model = BertForMaskedLM.from_pretrained(base_bert_path)

In [9]:
# to understand the mlp_map_function better
# test = {"name":["masuod", 'milad', 'khosro', "mojgan"],
#         "last_name":["mohararzadeh", "garsivaz"],
#         "age":[18, 25, 50, 55]}
# dict_ = {**test, 'esm':test["name"]}
# # print(dict_)
# from pprint import pprint
# pprint(dict_)

# new_dict = {n:n**2 for n in range(10) if n%2==0}
# pprint(new_dict)

In [10]:
# Preprocessing Data

# Keep the useful data
def filter_none(example):
    return (
        # ignore the data that the exp column is empty/null
        (example["exp"] is not None)
        # ignore short explanation exp
        and (len(example["exp"]) > 20)
        # ignore the data that question column is empty/null
        and (example["question"] is not None)
    )

# Tokenizing Data
def mlm_map_function(rows):
    counter = 0
    input_info = tokenizer(
        # apply on exp column
        rows["exp"],
        # set a max_length so every tokenized example have exactly 128 tokens within it
        # Why?
        # Batching-efficiency (more efficient for GPU Computation)
        # Memory Control (Prevent long sequence from consuming to much memory)
        # Consistency
        # It is a hyperparameter 128, 256, 512 (original max input BERT)
        max_length=128,
        # if your sequence is short will pad it to match the 128 token
        padding="max_length",
        # if your sequence is longer will shorten it to match the 128 token
        truncation=True,
        # convert it to pytorch tensor
        return_tensors="pt",
    )
    # See some example input
    # if counter > 2:
    #     pass
    # else:
    #     print(rows["exp"])
    #     print(input_info)
    #     counter +=1
    # dictionary comprehension new_dict = {expression for iterable if condition}
    # **input_info --> dictionary unpacking syntax (copy all key values pairs)
    # adding labels cause later we gonna apply data collator and it will change input_ids and we want to keep the original version untouched
    # data collator will mask 15% of input_ids later
    return {**input_info, "labels": input_info["input_ids"]}

# loading the dataset
dataset = load_dataset(medmcqa_dataset_path)
# partitioning data set
mlm_dataset = dataset["train"].select(trainset_range)
# apply filter_none function on 'exp' column
mlm_dataset = mlm_dataset.filter(filter_none).select_columns(["exp"])

# map() apply function to each element + transform it too
mlm_dataset = mlm_dataset.map(
    mlm_map_function,
    # instead of applying row by row it will apply the function by batch
    # batch size= 1000 by default
    batched=True,
    # parallel processing 
    num_proc=2,
)
print(mlm_dataset)

# training model using Pytorch DataLoader --> batching data using collate_fn
# collate_fn define the way to merge a list of dataset items into the batch
# mlm = Masked Language Modeling
# mlm_probability means that 15% tokens will get masked and get predict and learn by the model
# this approach enable us to have more control over each epoch, each epochs different tokens are masked
# Model is more generalized
collate_fn = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

Dataset({
    features: ['exp', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 33707
})


# Hugging Face Training

In [11]:
import accelerate
accelerate.__version__

'1.8.1'

In [12]:
# Can't run it locally
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    num_train_epochs=num_epochs,
    save_strategy="no",  # Disable checkpointing
    # save_strategy="step",  # After each step 
    # save_strategy="epoch",  # After each epoch # Cause kaggle to run out of space/storage
    logging_steps=len(mlm_dataset) // batch_size,  # Log per epoch
    report_to=[],  # Disable wandb logging
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=mlm_dataset,
)

results = trainer.train()
print(f"Training Results:\n{results}")



Step,Training Loss
526,1.6739
1052,1.4153
1578,1.2999
2104,1.2112
2630,1.1464
3156,1.0905
3682,1.0306
4208,0.9823
4734,0.9375
5260,0.8959


Training Results:
TrainOutput(global_step=6600, training_loss=1.1026795866995147, metrics={'train_runtime': 13415.8949, 'train_samples_per_second': 62.812, 'train_steps_per_second': 0.492, 'total_flos': 5.54480413184256e+16, 'train_loss': 1.1026795866995147, 'epoch': 25.0})


In [13]:
# Save the trainable weights
torch.save(
    {
        "model_state_dict": model.state_dict(),
    },
    checkpoint_file,
)

In [14]:
if push_model_to_huggingface:
    # generate a token from Profile > Setting > Access Tokens with write access
    api = HfApi(
        token=secret_value_0,
    )
    api.upload_file(
        path_or_fileobj=f"./{checkpoint_file}",
        path_in_repo=checkpoint_file,
        repo_id=repo_id,
        repo_type="model",
    )

BioClinicalBert-MLM-Finetuned-40k-25epoch-exp-25epoch-questions.pth:   0%|          | 0.00/433M [00:00<?, ?B/s…