In [2]:
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import (
    BertTokenizer,
    BertForMaskedLM,
)
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from huggingface_hub import HfApi
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

!unzip -qn /users/masoudkord/downloads/wordnet_source.zip -d  /users/masoudkord/downloads/wordnet/

2025-07-29 13:15:01.133391: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753794901.331840      89 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753794901.388188      89 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


unzip:  cannot find or open /users/masoudkord/downloads/wordnet_source.zip, /users/masoudkord/downloads/wordnet_source.zip.zip or /users/masoudkord/downloads/wordnet_source.zip.ZIP.


In [3]:
if torch.cuda.device_count() == 0:
    print("No GPU Available!")
else:
    for i in range(torch.cuda.device_count()):
        print(torch.cuda.get_device_properties(i))

_CudaDeviceProperties(name='Tesla T4', major=7, minor=5, total_memory=15095MB, multi_processor_count=40, uuid=42f5d29f-e760-4a70-68ea-6dfc70dcbd4d, L2_cache_size=4MB)
_CudaDeviceProperties(name='Tesla T4', major=7, minor=5, total_memory=15095MB, multi_processor_count=40, uuid=16e938e0-1c7b-2ec0-09ff-cc2851fb4d68, L2_cache_size=4MB)


In [4]:
import platform, socket, re, uuid, json, psutil, logging

def getSystemInfo():
    try:
        info = {}
        info["platform"]=platform.system()
        info["platform-release"]=platform.release()
        info["platform-version"]=platform.version()
        info["architecture"]=platform.architecture()
        info["architecture"]=platform.architecture()
        info["hostname"]=socket.gethostname()
        info["ip-address"]=socket.gethostbyname(socket.gethostname())
        info["mac-address"]=":".join(re.findall('..', '%012x' % uuid.getnode()))
        info["processor"]=platform.processor()
        info["ram"]=str(round(psutil.virtual_memory().total / (1024.0**3))) + " GB"
        return json.dumps(info)
    except Exception as e:
        logging.exception(e)

from pprint import pprint
pprint(getSystemInfo())

('{"platform": "Linux", "platform-release": "6.6.56+", "platform-version": "#1 '
 'SMP PREEMPT_DYNAMIC Sun Nov 10 10:07:59 UTC 2024", "architecture": ["64bit", '
 '"ELF"], "hostname": "20677a442f23", "ip-address": "172.19.2.2", '
 '"mac-address": "02:42:ac:13:02:02", "processor": "x86_64", "ram": "31 GB"}')


In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"

trainset_range = list(range(18000, 38000))
medmcqa_dataset_path = "openlifescienceai/medmcqa"
base_bert_path = "emilyalsentzer/Bio_ClinicalBERT"
finetuned_bert_path = (
    "BioClinicalBert-MLM-Finetuned-40k-25epoch-exp-25epoch-questions.pth"
)
dataset_file_name = (
    "MEDMCQA-dataset-with-CLS-40k-25epoch-exp-25epoch-questions-nltk.json"
)
repo_id = "MMK79/Medical-RAG"
# push_dataset_to_huggingface = True # After the First run disable it
push_dataset_to_huggingface = False

batch_size = 32

# Load MEDMCQA Dataset and Select Columns
'exp' which is a explanation about why the answer is the right answer <br>
we ignore the columns that have low explanation ('exp') <br>
we ignore the columns that don't have explanation ('exp') <br>
we ignore the columns that don't have questions

In [6]:
def filter_none(example):
    return (
        (example["exp"] is not None)
        and (len(example["exp"]) > 20)
        and (example["question"] is not None)
    )


dataset = load_dataset(medmcqa_dataset_path)
dataset = dataset["train"].select(trainset_range)
dataset = dataset.filter(filter_none).select_columns(["question", "exp"])
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
dataset_df = pd.DataFrame(dataset.to_dict())
print(f"dataset length: {len(dataset)}")

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/85.9M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/936k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/182822 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6150 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4183 [00:00<?, ? examples/s]

Filter:   0%|          | 0/20000 [00:00<?, ? examples/s]

dataset length: 16831


# Load the Fine-tuned Model from HuggingFace

In [7]:
from huggingface_hub import hf_hub_download


tokenizer = BertTokenizer.from_pretrained(base_bert_path)
model = BertForMaskedLM.from_pretrained(base_bert_path).to(device)

checkpoint_file = hf_hub_download(repo_id=repo_id, filename=finetuned_bert_path)
checkpoint = torch.load(checkpoint_file)
model.load_state_dict(checkpoint["model_state_dict"])
model = model.bert  # dropping MLM head
model.eval() # to test/inference your model --> it is a switch
# model.train() # to go in training mode again

vocab.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

(…)ed-40k-25epoch-exp-25epoch-questions.pth:   0%|          | 0.00/433M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [8]:
def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens]
    # list comprehension --> new_list = [expression for iterable if condition]
    # isalpha() --> check if all the characters in word are letter --> return a boolean 
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words("english"))
    # delete stop works
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return " ".join(tokens)

In [9]:
# isalpha test
s = "What%"
s.isalpha()
# WordNetLemmatizer() test
# from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()
# Lemmatize using WordNet’s built-in morphy function. Returns the input word unchanged if it cannot be found in WordNet.
# morphy functions: a set of morphology functions that aim to convert a given string into its base or dictionary form
wl.lemmatize('its')

'it'

In [10]:
# test tqdm --> show progress bar
for i in tqdm(range(1000)):
    pass

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
max_length = 128
# Classification tokens --> a special, learnable token that is added to the beginning of every input sequence
cls_tokens = []
for batch in tqdm(dataloader):
    batch["question"] = [preprocess_text(txt) for txt in batch["question"]]
    # Tokenization
    tokens = tokenizer(
        batch["question"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    # need to use to(device) to have the model and data on the same device --> else you get runtime error
    input_ids = tokens["input_ids"].to(device)
    # it will get generated automatically by the tokenizer itself
    # attention mask is a binary tensor (a list of 0, 1) --> tell which tokens are real data and which tokens are padding --> 1=real data(pay attention), 0=padding (ignore)
    # We need it cause models like bert use Attention Mechanism to focus on relevant parts of the input --> without attention mask, model will try to understand paddings too
    # So real word get used in attention computation
    # Padding will get ignore and won't affect hidden states (model understanding/knowledge) and output
    att_mask = tokens["attention_mask"].to(device)

    with torch.no_grad():
        # forward pass --> takeing input data, propagating through all the layers and producing output
        outputs = model(input_ids, att_mask)

    # CLS Extraction
    # only available if you use the full model --> BERT
    # Transform CLS through linear + tanh layer
    # shape=(batch_size, hidden_dim)
    if "pooler_output" in outputs:
        cls_embedding = outputs.pooler_output
    elif "last_hidden_state" in outputs:
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze()
    else:
        raise Exception("No CLS token found in the given model")

    # Storing the embedding
    cls_embedding = cls_embedding.cpu().numpy().tolist()
    cls_tokens += cls_embedding

  0%|          | 0/526 [00:00<?, ?it/s]

In [None]:
len(outputs)
dir(type(outputs))
outputs.items()
for k, v in outputs.items():
    # print(k, v)
    pass
print(outputs.hidden_states)
print(outputs.attentions)
# print(outputs.last_hidden_state)
for k in outputs.keys():
    print(k)
# dir(type(outputs.last_hidden_state)) # shape is available
# output of each token in the last layer
print(outputs.last_hidden_state.shape) # batchsize, seq_len, hidden_dim

None
None
last_hidden_state
torch.Size([31, 128, 768])


In [None]:
# converting cls tokens to np.array
cls_tokens = [np.asarray(cls) for cls in cls_tokens]
dataset_df["question_cls"] = cls_tokens
print(len(dataset_df))
print(type(dataset_df))
print(display(dataset_df))

Unnamed: 0,question,exp,question_cls
0,"All of the following are pyrogenic cytokines, ...",Interleukin 18 is not a pyrogenic cytokine. IL...,"[0.4302745461463928, -0.4610498547554016, -0.1..."
1,40-year old female presented with neck swellin...,Ref. Robbins Pathology. 9th edition. Page. 109...,"[1.1794304847717285, 0.1141890361905098, -0.11..."
2,Following statement regarding dislocation of t...,Anterior dislocation is more common in which h...,"[0.024791469797492027, -0.19909602403640747, -..."
3,The active search for unrecognized disease or ...,Screening is the search for unrecognized disea...,"[0.6333976984024048, 0.0940687358379364, -0.01..."
4,Fir tree pattern lesion is seen in,Fir tree pattern of distribution of lesions is...,"[1.1372100114822388, 0.17038805782794952, -0.2..."
...,...,...,...
16826,Carcinoma sigmoid colon with obstruction Manag...,- Obstruction due to rectosigmoid growth with ...,"[0.6879968047142029, 0.10079113394021988, -0.6..."
16827,ADHD in childhood can lead to which of the fol...,"ADHD can lead to substance abuse,mood disorder...","[0.37708356976509094, 0.5247248411178589, -0.1..."
16828,Nerve for adductor compament of thigh ?,Ans. B) Obturator nerveObturator nerve is the ...,"[1.0885707139968872, -0.436038076877594, -0.68..."
16829,The &;a&;wave of jugular venous pulse is produ...,JVP or jugular venous is a reflection of the r...,"[0.23046493530273438, -0.13350751996040344, 0...."


pandas.core.frame.DataFrame

In [None]:
# The number of decimal places to use when encoding floating point values.
# Maximum is 15 --> more than 15 == Value Error
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_json.html
dataset_df.to_json(dataset_file_name, double_precision=15)

In [25]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HuggingFace_API")

In [26]:
if push_dataset_to_huggingface:
    # generate a token from Profile > Setting > Access Tokens with write access
    api = HfApi(
        token=secret_value_0,
    )
    api.upload_file(
        path_or_fileobj=f"./{dataset_file_name}",
        path_in_repo=dataset_file_name,
        repo_id=repo_id,
        repo_type="model",
    )

MEDMCQA-dataset-with-CLS-40k-25epoch-exp-25epoch-questions-nltk.json:   0%|          | 0.00/249M [00:00<?, ?B/…