In [1]:
# !pip install -U -q bitsandbytes
# !pip install -U -q evaluate
# !pip install -U -q rouge_score

In [2]:
!export CUDA_LAUNCH_BLOCKING=1

In [3]:
import os

HF_API_KEY = "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"

os.environ["HF_TOKEN"] = HF_API_KEY

In [4]:
# !git clone https://huggingface.co/MohamedAhmedAE/Llama-3.2-3B-Instruct-Medical-Finetune-v4
# !git clone https://huggingface.co/MohamedAhmedAE/Llama-3.2-1B-Instruct-Medical-Finetune-v4

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, Gemma3ForConditionalGeneration
from peft import PeftModel, PeftConfig
from datasets import load_dataset
import evaluate
from tqdm import tqdm
import pandas as pd
import numpy as np
import math
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
import os

def load_peft_model_and_tokenizer(peft_model_path, base_model_name=None, is_gemma_model=False):
    """Load PEFT model and tokenizer from local path"""
    print(f"Loading PEFT model from: {peft_model_path}")
    
    # Load PEFT config to get base model info
    peft_config = PeftConfig.from_pretrained(peft_model_path)
    
    # Use base model from config if not provided
    if base_model_name is None:
        base_model_name = peft_config.base_model_name_or_path
    
    print(f"Base model: {base_model_name}")
    
    # Load tokenizer from base model
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    
    # Set up quantization config
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        llm_int8_enable_fp32_cpu_offload=False
    )
    
    # Load base model
    if is_gemma_model:
        base_model = Gemma3ForConditionalGeneration.from_pretrained(
            base_model_name,
            quantization_config=quantization_config,
            dtype=torch.bfloat16,
            device_map="cuda"
        )
    else:
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            quantization_config=quantization_config,
            dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None
        )
    
    # Load PEFT adapters
    model = PeftModel.from_pretrained(base_model, peft_model_path)
    
    # Add padding token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    return model, tokenizer

def load_model_and_tokenizer(model_name_or_path, is_gemma_model=False, is_peft_model=False, base_model_name=None):
    """Load model and tokenizer - handles both regular and PEFT models"""
    
    if is_peft_model:
        return load_peft_model_and_tokenizer(model_name_or_path, base_model_name, is_gemma_model)
    else:
        # Original function for regular models
        print(f"Loading model: {model_name_or_path}")
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            llm_int8_enable_fp32_cpu_offload=False
        )

        if is_gemma_model:
            model = Gemma3ForConditionalGeneration.from_pretrained(
                model_name_or_path,
                quantization_config=quantization_config,
                dtype=torch.bfloat16,
                device_map="cuda"
            )
        else:
            model = AutoModelForCausalLM.from_pretrained(
                model_name_or_path,
                quantization_config=quantization_config,
                dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                device_map="auto" if torch.cuda.is_available() else None
            )
        
        # Add padding token if it doesn't exist
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        
        return model, tokenizer

def calculate_perplexity(model, tokenizer, texts, max_length=512):
    """Calculate perplexity for a list of texts - core MegaTron metric"""
    model.eval()
    total_loss = 0
    total_tokens = 0
    
    with torch.no_grad():
        for text in tqdm(texts, desc="Calculating perplexity"):
            # Tokenize text
            inputs = tokenizer(
                text, 
                return_tensors="pt", 
                truncation=True, 
                max_length=max_length,
                padding=False
            )
            
            if torch.cuda.is_available():
                inputs = {k: v.to(model.device) for k, v in inputs.items()}
            
            input_ids = inputs['input_ids']
            
            # Skip if sequence is too short
            if input_ids.size(1) < 2:
                continue
                
            # Get model outputs
            outputs = model(input_ids=input_ids, labels=input_ids)
            
            # Calculate loss (negative log likelihood)
            loss = outputs.loss
            num_tokens = input_ids.size(1) - 1  # -1 because we predict next token
            
            total_loss += loss.item() * num_tokens
            total_tokens += num_tokens
    
    if total_tokens == 0:
        return float('inf')
    
    avg_loss = total_loss / total_tokens
    perplexity = math.exp(avg_loss)
    
    return perplexity, avg_loss

def calculate_token_accuracy(model, tokenizer, texts, max_length=512):
    """Calculate token-level accuracy - MegaTron metric"""
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    
    with torch.no_grad():
        for text in tqdm(texts, desc="Calculating token accuracy"):
            inputs = tokenizer(
                text, 
                return_tensors="pt", 
                truncation=True, 
                max_length=max_length,
                padding=False
            )
            
            if torch.cuda.is_available():
                inputs = {k: v.to(model.device) for k, v in inputs.items()}
            
            input_ids = inputs['input_ids']
            
            if input_ids.size(1) < 2:
                continue
            
            # Get predictions
            outputs = model(input_ids=input_ids)
            predictions = torch.argmax(outputs.logits, dim=-1)
            
            # Compare predictions with actual next tokens
            # We predict tokens 1 to n given tokens 0 to n-1
            predicted_tokens = predictions[0, :-1]  # Remove last prediction
            actual_tokens = input_ids[0, 1:]        # Remove first token
            
            correct = (predicted_tokens == actual_tokens).sum().item()
            total = actual_tokens.size(0)
            
            correct_predictions += correct
            total_predictions += total
    
    if total_predictions == 0:
        return 0.0
    
    accuracy = correct_predictions / total_predictions
    return accuracy

def calculate_generation_diversity(predictions):
    """Calculate diversity metrics for generated text - MegaTron quality metric"""
    # Distinct-1 and Distinct-2 (unique unigrams and bigrams)
    all_tokens = []
    all_bigrams = []
    
    for pred in predictions:
        tokens = pred.lower().split()
        all_tokens.extend(tokens)
        
        # Create bigrams
        bigrams = [f"{tokens[i]} {tokens[i+1]}" for i in range(len(tokens)-1)]
        all_bigrams.extend(bigrams)
    
    if not all_tokens:
        return {
            'distinct_1': 0.0,
            'distinct_2': 0.0,
            'vocab_diversity': 0.0
        }
    
    distinct_1 = len(set(all_tokens)) / len(all_tokens) if all_tokens else 0
    distinct_2 = len(set(all_bigrams)) / len(all_bigrams) if all_bigrams else 0
    vocab_diversity = len(set(all_tokens))
    
    return {
        'distinct_1': distinct_1,
        'distinct_2': distinct_2,
        'vocab_diversity': vocab_diversity
    }

def calculate_semantic_coherence(model, tokenizer, predictions, max_length=512):
    """Calculate semantic coherence using model's own confidence - MegaTron metric"""
    model.eval()
    coherence_scores = []
    
    with torch.no_grad():
        for pred in tqdm(predictions, desc="Calculating semantic coherence"):
            if not pred.strip():
                continue
                
            inputs = tokenizer(
                pred, 
                return_tensors="pt", 
                truncation=True, 
                max_length=max_length,
                padding=False
            )
            
            if torch.cuda.is_available():
                inputs = {k: v.to(model.device) for k, v in inputs.items()}
            
            input_ids = inputs['input_ids']
            
            if input_ids.size(1) < 2:
                continue
            
            # Get model confidence (average probability of generated sequence)
            outputs = model(input_ids=input_ids)
            logits = outputs.logits[0, :-1, :]  # Remove last logit
            targets = input_ids[0, 1:]          # Remove first token
            
            # Calculate probabilities
            probs = F.softmax(logits, dim=-1)
            target_probs = probs.gather(1, targets.unsqueeze(-1)).squeeze(-1)
            
            # Average log probability as coherence score
            avg_log_prob = torch.log(target_probs).mean().item()
            coherence_scores.append(avg_log_prob)
    
    if not coherence_scores:
        return 0.0
    
    return np.mean(coherence_scores)

def calculate_megatron_metrics(model, tokenizer, predictions, references):
    """Calculate comprehensive MegaTron evaluation metrics"""
    print("Calculating MegaTron metrics...")
    
    megatron_results = {}
    
    # 1. Perplexity on references (gold standard)
    print("Computing perplexity on reference texts...")
    ref_perplexity, ref_avg_loss = calculate_perplexity(model, tokenizer, references)
    megatron_results['reference_perplexity'] = ref_perplexity
    megatron_results['reference_avg_loss'] = ref_avg_loss
    
    # 2. Perplexity on predictions
    print("Computing perplexity on generated texts...")
    pred_perplexity, pred_avg_loss = calculate_perplexity(model, tokenizer, predictions)
    megatron_results['prediction_perplexity'] = pred_perplexity
    megatron_results['prediction_avg_loss'] = pred_avg_loss
    
    # 3. Token accuracy on references
    print("Computing token accuracy...")
    token_accuracy = calculate_token_accuracy(model, tokenizer, references)
    megatron_results['token_accuracy'] = token_accuracy
    
    # 4. Generation diversity
    print("Computing generation diversity...")
    diversity_metrics = calculate_generation_diversity(predictions)
    megatron_results.update(diversity_metrics)
    
    # 5. Semantic coherence
    print("Computing semantic coherence...")
    coherence_score = calculate_semantic_coherence(model, tokenizer, predictions)
    megatron_results['semantic_coherence'] = coherence_score
    
    # 6. Additional MegaTron-style metrics
    # Length statistics
    pred_lengths = [len(pred.split()) for pred in predictions]
    ref_lengths = [len(ref.split()) for ref in references]
    
    megatron_results['avg_prediction_length'] = np.mean(pred_lengths)
    megatron_results['avg_reference_length'] = np.mean(ref_lengths)
    megatron_results['length_ratio'] = np.mean(pred_lengths) / np.mean(ref_lengths) if np.mean(ref_lengths) > 0 else 0
    
    # Perplexity ratio (lower is better for generation quality)
    megatron_results['perplexity_ratio'] = pred_perplexity / ref_perplexity if ref_perplexity > 0 else float('inf')
    
    return megatron_results

def generate_answer(model, tokenizer, instruction, input_text, max_length=512):
    """Generate answer for a given question using the same chat template format as training"""
    
    # Create prompt using instruction and input
    prompt_template = """{instruction}

{input}
"""
    user_prompt = prompt_template.format(
        instruction=instruction.strip(),
        input=input_text.strip()
    )
    
    # Create message structure matching your training format
    system = "You are a Medical Assistant follow the following instruction"
    message = [
        {"role": "system", "content": system},
        {"role": "user", "content": user_prompt}
    ]
    
    # Apply chat template with generation prompt (no assistant message for generation)
    prompt = tokenizer.apply_chat_template(
        message,
        tokenize=False,
        max_length=max_length,
        add_generation_prompt=True,
        truncation=True
    )
    
    # Tokenize input
    inputs = tokenizer(
        prompt, 
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
        padding=True
    )
    
    if torch.cuda.is_available():
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )
    
    # Decode and extract the generated answer
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract only the new generated part (after the prompt)
    prompt_length = len(tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True))
    answer = full_response[prompt_length:].strip()
    
    return answer

def evaluate_model(model_path, is_gemma_model=False, is_peft_model=True, base_model_name=None):
    """Main evaluation function with MegaTron metrics"""
    
    # Configuration
    dataset_name = "medalpaca/medical_meadow_medical_flashcards"
    
    print("Starting evaluation with MegaTron metrics...")
    print(f"Using dataset: {dataset_name}")
    
    # Load model and tokenizer
    model, tokenizer = load_model_and_tokenizer(
        model_path, 
        is_gemma_model=is_gemma_model, 
        is_peft_model=is_peft_model,
        base_model_name=base_model_name
    )
    
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    # Load dataset
    print("Loading dataset...")
    try:
        dataset = load_dataset(dataset_name)
        
        # Use validation split if available, otherwise use train split
        available_splits = list(dataset.keys())
        print(f"Available splits: {available_splits}")
        
        if "validation" in available_splits:
            eval_split = "validation"
        elif "test" in available_splits:
            eval_split = "test"
        elif "train" in available_splits:
            eval_split = "train"
        else:
            eval_split = available_splits[0]
        
        eval_data = dataset[eval_split]
        print(f"Successfully loaded {dataset_name}")
        print(f"Using {eval_split} split with {len(eval_data)} examples")
        
    except Exception as e:
        print(f"Error: Could not load dataset {dataset_name}: {str(e)}")
        return
    
    # Load evaluation metrics
    print("Loading evaluation metrics...")
    rouge = evaluate.load("rouge")
    bleu = evaluate.load("bleu")
    
    # Generate predictions
    print("Generating predictions...")
    predictions = []
    references = []
    
    # Limit evaluation to first 100 examples for faster testing
    # Remove this limit for full evaluation
    eval_subset = eval_data.select(range(min(100, len(eval_data))))
    
    # Print first example to understand dataset structure
    print(f"Dataset structure - First example keys: {list(eval_subset[0].keys())}")
    print(f"First example: {eval_subset[0]}")
    
    for i, example in enumerate(tqdm(eval_subset, desc="Evaluating")):
        # Extract the three columns: instruction, input, output
        instruction = example.get('instruction', 'Answer this question truthfully')
        input_text = example.get('input', '')
        reference = example.get('output', '')
        
        # Skip if any required field is missing
        if not instruction or not reference:
            print(f"Warning: Missing instruction or output in example {i}, skipping...")
            continue
        
        # Generate prediction
        try:
            prediction = generate_answer(model, tokenizer, instruction, input_text)
            predictions.append(prediction)
            references.append(reference)
            
            # Print first few examples for debugging
            if i < 3:
                print(f"\nExample {i+1}:")
                print(f"Instruction: {instruction[:100]}...")
                print(f"Input: {input_text[:100]}..." if input_text else "Input: None")
                print(f"Reference: {reference[:100]}...")
                print(f"Prediction: {prediction[:100]}...")
                
        except Exception as e:
            print(f"Error generating prediction for example {i}: {e}")
            continue
    
    if not predictions:
        print("No predictions generated. Please check the dataset structure.")
        return
    
    print(f"Generated {len(predictions)} predictions")
    
    # Calculate traditional metrics (ROUGE and BLEU)
    print("Calculating ROUGE scores...")
    rouge_results = rouge.compute(
        predictions=predictions,
        references=references,
        use_stemmer=True
    )
    
    print("Calculating BLEU scores...")
    references_bleu = [[ref] for ref in references]
    bleu_results = bleu.compute(
        predictions=predictions,
        references=references_bleu
    )
    
    # Calculate MegaTron metrics
    megatron_results = calculate_megatron_metrics(model, tokenizer, predictions, references)
    
    # Display results
    print("\n" + "="*60)
    print("COMPREHENSIVE EVALUATION RESULTS")
    print("="*60)
    
    print("\nTraditional NLG Metrics:")
    print("-" * 30)
    print("ROUGE Scores:")
    for key, value in rouge_results.items():
        print(f"  {key}: {value:.4f}")
    
    print(f"\nBLEU Score: {bleu_results['bleu']:.4f}")
    for i, score in enumerate(bleu_results['precisions'], 1):
        print(f"  BLEU-{i}: {score:.4f}")
    
    print("\nMegaTron Language Model Metrics:")
    print("-" * 40)
    print("Perplexity Metrics:")
    print(f"  Reference Perplexity: {megatron_results['reference_perplexity']:.4f}")
    print(f"  Prediction Perplexity: {megatron_results['prediction_perplexity']:.4f}")
    print(f"  Perplexity Ratio: {megatron_results['perplexity_ratio']:.4f}")
    print(f"  Reference Avg Loss: {megatron_results['reference_avg_loss']:.4f}")
    print(f"  Prediction Avg Loss: {megatron_results['prediction_avg_loss']:.4f}")
    
    print("\nGeneration Quality Metrics:")
    print(f"  Token Accuracy: {megatron_results['token_accuracy']:.4f}")
    print(f"  Semantic Coherence: {megatron_results['semantic_coherence']:.4f}")
    
    print("\nDiversity Metrics:")
    print(f"  Distinct-1: {megatron_results['distinct_1']:.4f}")
    print(f"  Distinct-2: {megatron_results['distinct_2']:.4f}")
    print(f"  Vocabulary Diversity: {megatron_results['vocab_diversity']}")
    
    print("\nLength Statistics:")
    print(f"  Avg Prediction Length: {megatron_results['avg_prediction_length']:.2f} tokens")
    print(f"  Avg Reference Length: {megatron_results['avg_reference_length']:.2f} tokens")
    print(f"  Length Ratio: {megatron_results['length_ratio']:.4f}")
    
    # Save results to file
    if predictions:
        results_df = pd.DataFrame({
            'Instruction': [eval_subset[i].get('instruction', 'N/A') for i in range(len(predictions))],
            'Input': [eval_subset[i].get('input', 'N/A') for i in range(len(predictions))],
            'Reference': references,
            'Prediction': predictions
        })
        
        results_df.to_csv('evaluation_results_megatron.csv', index=False)
        print(f"\nDetailed results saved to 'evaluation_results_megatron.csv'")
        
        # Save comprehensive summary metrics
        summary = {
            'Model': model_path,
            'Dataset': dataset_name,
            'Num_Examples': len(predictions),
            # Traditional metrics
            **{f'ROUGE_{k}': v for k, v in rouge_results.items()},
            'BLEU': bleu_results['bleu'],
            **{f'BLEU_{i}': score for i, score in enumerate(bleu_results['precisions'], 1)},
            # MegaTron metrics
            **megatron_results
        }
        
        summary_df = pd.DataFrame([summary])
        summary_df.to_csv('evaluation_summary_megatron.csv', index=False)
        print("Comprehensive summary metrics saved to 'evaluation_summary_megatron.csv'")
    
    return {
        'rouge': rouge_results,
        'bleu': bleu_results,
        'megatron': megatron_results
    }


In [None]:
# model_name = "MohamedAhmedAE/Llama-3.1-8B-Instruct-Medical-Finetune-v3" 
# model_name = "./Llama-3.2-3B-Instruct-Medical-Finetune-v4/" 
# model_name = "./Llama-3.2-1B-Instruct-Medical-Finetune-v4"

# model_name = "meta-llama/Llama-3.1-8B-Instruct"
# model_name = "m42-health/Llama3-Med42-8B"

model_name = "MohamedAhmedAE/distil_MedGemma_4B_Llama-3.2-1B"


peft_model_path = "MohamedAhmedAE/Llama-3.1-8B-Instruct-Medical-Finetune-v4"  
base_model_name = "meta-llama/Llama-3.1-8B-Instruct"  

results = evaluate_model(
    base_model_name=base_model_name,
    model_path=peft_model_path,
    is_gemma_model=False,  
    is_peft_model=True,
)


Starting evaluation with MegaTron metrics...
Using dataset: medalpaca/medical_meadow_medical_flashcards
Loading model: MohamedAhmedAE/distil_MedGemma_4B_Llama-3.2-1B


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/872 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Loading dataset...
Available splits: ['train']
Successfully loaded medalpaca/medical_meadow_medical_flashcards
Using train split with 33955 examples
Loading evaluation metrics...
Generating predictions...
Dataset structure - First example keys: ['input', 'output', 'instruction']
First example: {'input': 'What is the relationship between very low Mg2+ levels, PTH levels, and Ca2+ levels?', 'output': 'Very low Mg2+ levels correspond to low PTH levels which in turn results in low Ca2+ levels.', 'instruction': 'Answer this question truthfully'}


Evaluating:   1%|▎                          | 1/100 [10:15<16:54:52, 615.08s/it]


Example 1:
Instruction: Answer this question truthfully...
Input: What is the relationship between very low Mg2+ levels, PTH levels, and Ca2+ levels?...
Reference: Very low Mg2+ levels correspond to low PTH levels which in turn results in low Ca2+ levels....
Prediction: As a Medical Assistant, I'll provide a clear and accurate answer based on scientific knowledge.

Ver...


Evaluating:   1%|▎                          | 1/100 [16:16<26:50:55, 976.32s/it]


KeyboardInterrupt: 

In [None]:
# model_name = "google/medgemma-4b-it" 

# rouge_scores, bleu_scores, megatron_scores = evaluate_model(model_name, is_gemma_model=True)

In [8]:
import torch
import time
import psutil
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import numpy as np

class ModelMetricsCalculator:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
    def get_model_memory_usage(self, model):
        """
        Calculate actual memory usage of the loaded model
        """
        memory_stats = {}
        
        # GPU memory usage
        if torch.cuda.is_available():
            torch.cuda.synchronize()
            gpu_memory_allocated = torch.cuda.memory_allocated() / (1024**3)  # GB
            gpu_memory_reserved = torch.cuda.memory_reserved() / (1024**3)   # GB
            gpu_memory_max = torch.cuda.max_memory_allocated() / (1024**3)   # GB
            
            memory_stats['gpu_allocated_gb'] = gpu_memory_allocated
            memory_stats['gpu_reserved_gb'] = gpu_memory_reserved
            memory_stats['gpu_max_allocated_gb'] = gpu_memory_max
        
        # Model parameter count and size
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        
        # Estimate model size in memory
        model_size_bytes = 0
        for param in model.parameters():
            if hasattr(param, 'quant_state'):
                # Quantized parameter - estimate based on quantization
                model_size_bytes += param.numel() * 0.5  # 4-bit = 0.5 bytes per param
            else:
                model_size_bytes += param.numel() * param.element_size()
        
        model_size_gb = model_size_bytes / (1024**3)
        
        memory_stats.update({
            'total_params': total_params,
            'trainable_params': trainable_params,
            'model_size_gb': model_size_gb,
        })
        
        return memory_stats
    
    def benchmark_inference_speed(self, model, tokenizer, num_trials=10, max_new_tokens=50):
        """
        Benchmark actual inference speed and latency
        """
        model.eval()
        
        # Sample prompts for testing
        test_prompts = [
            "The future of artificial intelligence is",
            "In a world where technology advances rapidly,",
            "The key to understanding complex systems lies in",
            "When we consider the implications of modern computing,",
            "The relationship between data and decision-making"
        ]
        
        all_latencies = []
        all_tokens_generated = []
        total_time = 0
        
        with torch.no_grad():
            for trial in range(num_trials):
                prompt = test_prompts[trial % len(test_prompts)]
                inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
                
                if torch.cuda.is_available():
                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
                
                # Measure inference time
                start_time = time.time()
                
                # Generate text
                with torch.cuda.amp.autocast() if torch.cuda.is_available() else torch.no_grad():
                    outputs = model.generate(
                        **inputs,
                        max_new_tokens=max_new_tokens,
                        do_sample=True,
                        temperature=0.7,
                        pad_token_id=tokenizer.eos_token_id,
                    )
                
                end_time = time.time()
                
                # Calculate metrics
                inference_time = end_time - start_time
                tokens_generated = outputs.shape[1] - inputs['input_ids'].shape[1]
                
                all_latencies.append(inference_time)
                all_tokens_generated.append(tokens_generated)
                total_time += inference_time
                
                # Clear cache
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
        
        # Calculate statistics
        avg_latency = np.mean(all_latencies) * 1000  # Convert to ms
        std_latency = np.std(all_latencies) * 1000
        avg_tokens = np.mean(all_tokens_generated)
        tokens_per_second = avg_tokens / (avg_latency / 1000)
        
        # First token latency (approximate)
        first_token_latency = avg_latency / avg_tokens if avg_tokens > 0 else avg_latency
        
        return {
            'avg_latency_ms': avg_latency,
            'std_latency_ms': std_latency,
            'first_token_latency_ms': first_token_latency,
            'tokens_per_second': tokens_per_second,
            'avg_tokens_generated': avg_tokens,
            'total_inference_time': total_time
        }
    
    def get_system_memory_usage(self):
        """
        Get system-wide memory usage
        """
        # RAM usage
        ram = psutil.virtual_memory()
        ram_total_gb = ram.total / (1024**3)
        ram_used_gb = ram.used / (1024**3)
        ram_percent = ram.percent
        
        return {
            'ram_total_gb': ram_total_gb,
            'ram_used_gb': ram_used_gb,
            'ram_percent': ram_percent
        }
    
    def comprehensive_benchmark(self, model_name, num_trials=10):
        """
        Complete benchmark of a model after loading with quantization
        """
        print(f"Benchmarking model: {model_name}")
        print("="*60)
        
        # Setup quantization config (your configuration)
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            llm_int8_enable_fp32_cpu_offload=False
        )
        
        # Memory before loading
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()
            initial_gpu_memory = torch.cuda.memory_allocated() / (1024**3)
        
        initial_system_memory = self.get_system_memory_usage()
        
        # Load model and tokenizer
        try:
            print("Loading model...")
            tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
            
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                quantization_config=quantization_config,
                torch_dtype=torch.bfloat16,
                device_map="auto" if torch.cuda.is_available() else None
            )
            
            print("Model loaded successfully!")
            
        except Exception as e:
            print(f"Error loading model: {e}")
            return None
        
        # Calculate memory usage
        print("\\nCalculating memory usage...")
        memory_stats = self.get_model_memory_usage(model)
        final_system_memory = self.get_system_memory_usage()
        
        # Calculate inference metrics
        print("Benchmarking inference speed...")
        inference_stats = self.benchmark_inference_speed(model, tokenizer, num_trials)
        
        # Compile results
        results = {
            'model_name': model_name,
            'memory': memory_stats,
            'inference': inference_stats,
            'system_memory_before': initial_system_memory,
            'system_memory_after': final_system_memory
        }
        
        # Print results
        self.print_results(results)
        
        # Generate LaTeX table row        
        return results
    
    def print_results(self, results):
        """
        Print formatted results
        """
        print("\\n" + "="*60)
        print("BENCHMARK RESULTS")
        print("="*60)
        
        # Memory Usage
        print("\\nMEMORY USAGE:")
        print(f"  Model Parameters: {results['memory']['total_params']:,}")
        print(f"  Model Size: {results['memory']['model_size_gb']:.2f} GB")
        
        if 'gpu_allocated_gb' in results['memory']:
            print(f"  GPU Memory Allocated: {results['memory']['gpu_allocated_gb']:.2f} GB")
            print(f"  GPU Memory Reserved: {results['memory']['gpu_reserved_gb']:.2f} GB")
        
        # Inference Performance
        print("\\nINFERENCE PERFORMANCE:")
        print(f"  Average Latency: {results['inference']['avg_latency_ms']:.2f} ± {results['inference']['std_latency_ms']:.2f} ms")
        print(f"  First Token Latency: {results['inference']['first_token_latency_ms']:.2f} ms")
        print(f"  Tokens per Second: {results['inference']['tokens_per_second']:.2f}")
        print(f"  Average Tokens Generated: {results['inference']['avg_tokens_generated']:.1f}")
    

# Example usage
def main():
    calculator = ModelMetricsCalculator()
    
    # Example models to benchmark
    models_to_test = ["m42-health/Llama3-Med42-8B", 
                      "MohamedAhmedAE/Llama-3.1-8B-Instruct-Medical-Finetune-v3", 
                      "meta-llama/Llama-3.1-8B-Instruct", 
                      "google/medgemma-4b-it",
                      "MohamedAhmedAE/Llama-3.2-3B-Instruct-Medical-Finetune-v4", 
                      "MohamedAhmedAE/Llama-3.1-8B-Instruct-Medical-Finetune-v4", ]
    
    results = {}
    for model_name in models_to_test:
        try:
            print(f"\\n{'='*80}")
            result = calculator.comprehensive_benchmark(model_name, num_trials=5)
            if result:
                results[model_name] = result
        except Exception as e:
            print(f"Failed to benchmark {model_name}: {e}")
            continue
    
    # Summary table
    if results:
        print("\\n" + "="*80)
        print("SUMMARY TABLE FOR PAPER")
        print("="*80)
        print("Model & Memory (GB) & Inference Speed (tokens/sec) & Latency (ms) \\\\\\\\")
        print("\\hline")
        
        for model_name, result in results.items():
            model_short = model_name.split('/')[-1]
            memory_gb = result['memory'].get('gpu_allocated_gb', result['memory']['model_size_gb'])
            tokens_per_sec = result['inference']['tokens_per_second']
            latency_ms = result['inference']['avg_latency_ms']
            print(f"{model_short} & {memory_gb:.2f} & {tokens_per_sec:.1f} & {latency_ms:.2f} \\\\\\\\")

if __name__ == "__main__":
    main()

Benchmarking model: m42-health/Llama3-Med42-8B
Loading model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded successfully!
\nCalculating memory usage...
Benchmarking inference speed...


  with torch.cuda.amp.autocast() if torch.cuda.is_available() else torch.no_grad():


BENCHMARK RESULTS
\nMEMORY USAGE:
  Model Parameters: 4,540,600,320
  Model Size: 3.58 GB
  GPU Memory Allocated: 5.31 GB
  GPU Memory Reserved: 7.10 GB
\nINFERENCE PERFORMANCE:
  Average Latency: 1047.38 ± 73.48 ms
  First Token Latency: 20.95 ms
  Tokens per Second: 47.74
  Average Tokens Generated: 50.0
Benchmarking model: MohamedAhmedAE/Llama-3.1-8B-Instruct-Medical-Finetune-v3
Loading model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded successfully!
\nCalculating memory usage...
Benchmarking inference speed...
BENCHMARK RESULTS
\nMEMORY USAGE:
  Model Parameters: 4,876,144,640
  Model Size: 4.83 GB
  GPU Memory Allocated: 6.57 GB
  GPU Memory Reserved: 7.86 GB
\nINFERENCE PERFORMANCE:
  Average Latency: 1661.79 ± 18.87 ms
  First Token Latency: 33.24 ms
  Tokens per Second: 30.09
  Average Tokens Generated: 50.0
Benchmarking model: meta-llama/Llama-3.1-8B-Instruct
Loading model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded successfully!
\nCalculating memory usage...
Benchmarking inference speed...
BENCHMARK RESULTS
\nMEMORY USAGE:
  Model Parameters: 4,540,600,320
  Model Size: 3.58 GB
  GPU Memory Allocated: 8.68 GB
  GPU Memory Reserved: 12.32 GB
\nINFERENCE PERFORMANCE:
  Average Latency: 1014.23 ± 2.47 ms
  First Token Latency: 20.28 ms
  Tokens per Second: 49.30
  Average Tokens Generated: 50.0
Benchmarking model: google/medgemma-4b-it
Loading model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Model loaded successfully!
\nCalculating memory usage...
Benchmarking inference speed...
BENCHMARK RESULTS
\nMEMORY USAGE:
  Model Parameters: 2,490,222,960
  Model Size: 2.11 GB
  GPU Memory Allocated: 9.74 GB
  GPU Memory Reserved: 12.45 GB
\nINFERENCE PERFORMANCE:
  Average Latency: 1328.35 ± 4.64 ms
  First Token Latency: 26.57 ms
  Tokens per Second: 37.64
  Average Tokens Generated: 50.0
Benchmarking model: MohamedAhmedAE/Llama-3.2-3B-Instruct-Medical-Finetune-v4
Loading model...
Error loading model: None is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`
Benchmarking model: MohamedAhmedAE/Llama-3.1-8B-Instruct-Medical-Finetune-v4
Loading model...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

adapter_config.json:   0%|          | 0.00/916 [00:00<?, ?B/s]

Error loading model: None is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`
SUMMARY TABLE FOR PAPER
Model & Memory (GB) & Inference Speed (tokens/sec) & Latency (ms) \\\\
\hline
Llama3-Med42-8B & 5.31 & 47.7 & 1047.38 \\\\
Llama-3.1-8B-Instruct-Medical-Finetune-v3 & 6.57 & 30.1 & 1661.79 \\\\
Llama-3.1-8B-Instruct & 8.68 & 49.3 & 1014.23 \\\\
medgemma-4b-it & 9.74 & 37.6 & 1328.35 \\\\
