In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
!pip install mlflow
!pip install evaluate
!pip install rouge_score
!pip install bert_score

In [2]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
login(hf_token)

In [67]:
dataset = load_dataset("openlifescienceai/medmcqa", split="validation")

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/85.9M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/936k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/182822 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6150 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4183 [00:00<?, ? examples/s]

In [71]:
dataset

Dataset({
    features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
    num_rows: 4183
})

In [77]:
dataset["question"][0]

'Which of the following is not true for myelinated nerve fibers:'

In [None]:
# Define valid answer choices
valid_choices = ["1", "2", "3", "4"]
valid_tokens = tokenizer(valid_choices, add_special_tokens=False)["input_ids"]  # Get token IDs for A, B, C, D

prompt_temp =  """Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. 
Please answer the following medical question. 

### Question:
{}

### Response:
<think>{}"""

def generate_mcqa_answer(model, tokenizer, question, options):
    # prompt = f"""###Instruction: Choose the correct option (1, 2, 3 or 4) for the following question.\n\n
    #              ###Question: {question} \n\nOptions:\n"""
    prompt = prompt_temp.format(question,options[0],options[1],options[2],options[3],"")
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        output = model.generate(
            **inputs, max_new_tokens=1, eos_token_id=tokenizer.eos_token_id
        )

    # Extract predicted token
    predicted_text = tokenizer.decode(output[0], skip_special_tokens=True).strip()
    for choice in valid_choices:
        if choice in predicted_text:
            return choice
    return "INVALID"

# Run Evaluation
predictions, references = [], []

for i in range(100):  # Evaluate on 100 samples for speed
    question = dataset[i]["question"]
    options = [dataset[i]["opa"], dataset[i]["opb"], dataset[i]["opc"], dataset[i]["opd"]]
    correct_choice = str(dataset[i]["cop"])  # Convert label to string
    
    predicted_choice = generate_mcqa_answer(model, tokenizer, question, options)
    if predicted_choice in valid_choices:  # Only count valid predictions
        predictions.append(predicted_choice)
        references.append(correct_choice)

In [None]:
# Compute Accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(references, predictions)*100
print(f"Model Accuracy: {accuracy:.2f}")

In [None]:
# print(rouge_scores)
# print(bleu_score)
# print(P)
# print(R)
# print(F1)
# # print(meteor_score)
# # {'rouge1': 0.39861943638797637, 'rouge2': 0.15852988222338912, 'rougeL': 0.24056601785296156, 'rougeLsum': 0.25601796407185623}
# # {'bleu': 0.10663558385629832, 'precisions': [0.477088948787062, 0.18032786885245902, 0.0997229916897507, 0.06460674157303371], 'brevity_penalty': 0.6949736126020096, 'length_ratio': 0.733201581027668, 'translation_length': 371, 'reference_length': 506}
# # tensor([0.5492, 0.7260, 0.7357, 0.6497, 0.5720])
# # tensor([0.5879, 0.5425, 0.6210, 0.6112, 0.5811])
# # tensor([0.5679, 0.6210, 0.6735, 0.6298, 0.5765])

## Evaluation Loop

In [3]:
import mlflow
import torch
from unsloth import FastLanguageModel
from transformers import AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import evaluate
import numpy as np
import math
from torch.nn.functional import cross_entropy

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
max_seq_length = 2048 
dtype = None 
load_in_4bit = True 

In [61]:
prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. 
Please answer the following medical question. 

### Question:
{}

### Response:
<think>{}"""

eval_dataset = load_dataset("JC-24/MediQAReasoning", split="validation[:100]")
# dataloader = DataLoader(eval_dataset, batch_size=1)

In [6]:
model_name_or_path = "JC-24/gemma-7b-mediqa-final"
from unsloth import is_bfloat16_supported

dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16
print(dtype)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name_or_path,
    load_in_4bit=load_in_4bit,  # Enable 4-bit quantization
    max_seq_length=max_seq_length,
    dtype=dtype
)

torch.float16
==((====))==  Unsloth 2025.2.14: Fast Gemma patching. Transformers: 4.49.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


pytorch_model.bin.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/21.9k [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

In [7]:
FastLanguageModel.for_inference(model)

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 3072, padding_idx=0)
    (layers): ModuleList(
      (0-27): 28 x GemmaDecoderLayer(
        (self_attn): GemmaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=3072, bias=False)
          (rotary_emb): GemmaFixedRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=24576, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=24576, bias=False)
          (down_proj): Linear4bit(in_features=24576, out_features=3072, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm((3072,), eps=1e-06)
        (post_attention_layernorm):

In [41]:
eval_dataset

Dataset({
    features: ['Question', 'Complex_CoT', 'Response'],
    num_rows: 5
})

In [64]:
all_preds = []
all_refs = []
all_pred_cot = []
all_cot = []
all_questions = []
# Load Hugging Face Evaluation Metrics
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
# meteor_metric = evaluate.load("meteor")
bertscore_metric = evaluate.load("bertscore")
exact_match_metric = evaluate.load("exact_match")

for i in range(len(eval_dataset)):
    question = eval_dataset[i]["Question"]
    answer = eval_dataset[i]["Response"]
    cot = eval_dataset[i]["Complex_CoT"]
    
    inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=1200,
        use_cache=True,
    )
    response = tokenizer.batch_decode(outputs)
    clear_res = response[0].split("</think>")[-1]
    pred_cot = response[0].split("<think>")[-1].split("</think>")[0]
    all_questions.append(question)
    all_preds.append(clear_res)
    all_refs.append(answer)
    all_pred_cot.append(pred_cot)
    all_cot.append(cot)
    
bleu = bleu_metric.compute(predictions=all_preds, references=all_refs)['bleu']
rouge = rouge_metric.compute(predictions=all_preds, references=all_refs)
bertscore = bertscore_metric.compute(predictions=all_preds, references=all_refs, lang="en")
exact_match = exact_match_metric.compute(predictions=all_preds, references=all_refs)['exact_match']

metrics = {
        # "loss": avg_loss,
        # "perplexity": perplexity,
        "bleu": bleu,
        # "meteor": meteor,
        "exact_match": exact_match,
        "rouge1": rouge["rouge1"],
        "rouge2": rouge["rouge2"],
        "rougeL": rouge["rougeL"],
        "bertscore_precision": np.mean(bertscore["precision"]),
        "bertscore_recall": np.mean(bertscore["recall"]),
        "bertscore_f1": np.mean(bertscore["f1"]),
    }

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [65]:
print(metrics)

{'bleu': 0.1586626140636257, 'exact_match': 0.0, 'rouge1': 0.46471873674751546, 'rouge2': 0.24126687188185544, 'rougeL': 0.32993377246628985, 'bertscore_precision': 0.889073098897934, 'bertscore_recall': 0.8800099664926528, 'bertscore_f1': 0.8841670423746109}


In [66]:
import pandas as pd
import dagshub
import mlflow

dagshub.init(repo_owner='janvichokshi1998', repo_name='gemma-7b-unsloth-mediQA', mlflow=True)
mlflow.set_tracking_uri("https://dagshub.com/janvichokshi1998/gemma-7b-unsloth-mediQA.mlflow")
# mlflow.create_experiment("Gemma-7B-Unsloth-4bit-HF-Eval")
mlflow.set_experiment("Gemma-7B-Unsloth-4bit-HF-Eval")

with mlflow.start_run(run_name="Gemma-7B-Unsloth-4bit-HF-Eval") as run:
    # Log all metrics
    for key, value in metrics.items():
        mlflow.log_metric(key, value)

    # Log parameters
    mlflow.log_param("model_name", model_name_or_path)
    mlflow.log_param("quantization", "4-bit")
    mlflow.log_param("batch_size", 4)

    example_data = {
    "Question": all_questions,
    "Predicted Answer": all_preds,
    "Actual Answer": all_refs,
    "Predicted CoT": all_pred_cot,
    "Actual CoT": all_cot
    }
    
    # Convert to DataFrame
    examples_df = pd.DataFrame(example_data)
    csv_save_path = "/kaggle/working/example_predictions.csv"
    examples_df.to_csv(csv_save_path, index=False)
    
    # Log CSV file to MLflow and DagsHub
    mlflow.log_artifact(csv_save_path)
    print(f"Logged to MLflow Run ID: {run.info.run_id}")
    print("Evaluation Metrics:")
    for key, value in metrics.items():
        print(f"{key}: {value:.4f}")

Logged to MLflow Run ID: 3fb5894903104778a5b81031562306fd
Evaluation Metrics:
bleu: 0.1587
exact_match: 0.0000
rouge1: 0.4647
rouge2: 0.2413
rougeL: 0.3299
bertscore_precision: 0.8891
bertscore_recall: 0.8800
bertscore_f1: 0.8842
🏃 View run Gemma-7B-Unsloth-4bit-HF-Eval at: https://dagshub.com/janvichokshi1998/gemma-7b-unsloth-mediQA.mlflow/#/experiments/1/runs/3fb5894903104778a5b81031562306fd
🧪 View experiment at: https://dagshub.com/janvichokshi1998/gemma-7b-unsloth-mediQA.mlflow/#/experiments/1


In [31]:
# # Load Hugging Face Evaluation Metrics
# bleu_metric = evaluate.load("bleu")
# rouge_metric = evaluate.load("rouge")
# # meteor_metric = evaluate.load("meteor")
# bertscore_metric = evaluate.load("bertscore")
# exact_match_metric = evaluate.load("exact_match")

 
# # Evaluation Function Using Hugging Face Metrics
# def evaluate_model(model, dataloader):
#     total_loss = 0.0
#     total_tokens = 0

#     all_preds = []
#     all_refs = []
#     device = "cuda"
    
#     with torch.no_grad():
#         for batch in tqdm(dataloader, desc="Evaluating"):
#             prompt = prompt_style.format(batch['Question'][0],"")
#             print(prompt)
#             inputs = tokenizer(prompt, padding="max_length", max_length = max_seq_length,truncation=True, return_tensors="pt").to("cuda")
#             # outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
#             # logits = outputs.logits
#             # shift_logits = logits[..., :-1, :].contiguous()
#             # shift_labels = input_ids[..., 1:].contiguous()
    
#             # # Compute loss manually to control masking
#             # loss = cross_entropy(
#             #     shift_logits.view(-1, shift_logits.size(-1)),
#             #     shift_labels.view(-1),
#             #     ignore_index=tokenizer.pad_token_id,
#             #     reduction='sum'  # Sum to get total loss
#             # )

#             # # Count non-padding tokens
#             # non_pad_tokens = (shift_labels != tokenizer.pad_token_id).sum().item()
    
#             # total_loss += loss.item()
#             # total_tokens += non_pad_tokens
            
#             # Generate predictions
#             outputs = model.generate(
#                                     input_ids=inputs.input_ids,
#                                     attention_mask=inputs.attention_mask,
#                                     max_new_tokens=1200,
#                                     use_cache=True,
#                                 )
#             decoded_preds = tokenizer.batch_decode(outputs)
#             # generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=1200, use_cache=True)
#             # decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
#             decoded_pred_new = [decoded_pred.split("</think>")[-1] for decoded_pred in decoded_preds]
#             print(decoded_pred_new)
#             break
#             # print("------------------")
#             # actual_response = [res for res in batch["Response"]]
#             # print(actual_response)
#             # all_preds.extend(decoded_pred_new)
#             # all_refs.extend(actual_response)

#     # Calculate Metrics
#     # avg_loss = total_loss / total_tokens
#     # perplexity = math.exp(avg_loss)

#     # Compute Metrics using Hugging Face Evaluate
#     # bleu = bleu_metric.compute(predictions=all_preds, references=all_refs)['bleu']
#     # rouge = rouge_metric.compute(predictions=all_preds, references=all_refs)
#     # # meteor = meteor_metric.compute(predictions=all_preds, references=all_refs)['meteor']
#     # bertscore = bertscore_metric.compute(predictions=all_preds, references=all_refs, lang="en")
#     # exact_match = exact_match_metric.compute(predictions=all_preds, references=all_refs)['exact_match']

#     # metrics = {
#     #     # "loss": avg_loss,
#     #     # "perplexity": perplexity,
#     #     "bleu": bleu,
#     #     # "meteor": meteor,
#     #     "exact_match": exact_match,
#     #     "rouge1": rouge["rouge1"],
#     #     "rouge2": rouge["rouge2"],
#     #     "rougeL": rouge["rougeL"],
#     #     "bertscore_precision": np.mean(bertscore["precision"]),
#     #     "bertscore_recall": np.mean(bertscore["recall"]),
#     #     "bertscore_f1": np.mean(bertscore["f1"]),
#     # }
#     # return metrics, all_preds, all_refs