# In [None]:
import torch
import os
import gc
import json
import pandas as pd
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from llama_cpp import Llama

# Debugger-related (pydevd) environment flag; set before the models load.
os.environ['PYDEVD_DISABLE_FILE_VALIDATION'] = '1'

print("üîÑ Initializing models...")

# Use the GPU for the classifier when torch can see one, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"üì± Using device: {device}")

# --- Yes/no/maybe classifier -------------------------------------------------
# The tokenizer comes from the base BioBERT checkpoint while the weights come
# from the local fine-tuned directory. Any failure leaves BOTH globals as None
# so downstream code can detect the missing model with a simple None check.
try:
    model_classify_path = "/content/results_biobert_finetuned"
    tokenizer_classify = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
    model_classify = AutoModelForSequenceClassification.from_pretrained(model_classify_path)
    model_classify.to(device)
    model_classify.eval()  # inference mode
except Exception as e:
    print(f" Failed to load classification model: {e}")
    tokenizer_classify = model_classify = None
else:
    print(" Classification model loaded")

# --- LLaMA generator ---------------------------------------------------------
# On failure `llm` is None, mirroring the classifier convention above.
try:
    print("ü¶ô Loading LLaMA model...")
    llama_options = {
        "model_path": "/content/llama-2-7b-chat.Q4_0.gguf",
        "n_ctx": 2048,
        "n_threads": 8,
        # NOTE(review): 0 presumably keeps every layer on the CPU even when
        # CUDA is available -- confirm this is intended.
        "n_gpu_layers": 0,
        "verbose": False,
    }
    llm = Llama(**llama_options)
except Exception as e:
    print(f"‚ùå Failed to load LLaMA model: {e}")
    llm = None
else:
    print(" LLaMA model loaded")

def robust_non_rag_evaluation():
    """Evaluate the Non-RAG pipeline on real test data, with a fallback.

    Reads the PubMedQA sample CSV and runs the Non-RAG pipeline on at most
    the first 20 rows. If any model global is None, or the CSV cannot be
    read, the result of manual_non_rag_evaluation() is returned instead.

    Returns:
        tuple: ``(results, success_count)`` where ``results`` is the list of
        per-question result dicts and ``success_count`` is how many rows
        were answered without raising.
    """
    required_models = (tokenizer_classify, model_classify, llm)
    if any(component is None for component in required_models):
        print("‚ùå Models not properly initialized. Using manual evaluation.")
        return manual_non_rag_evaluation()

    try:
        test_data = pd.read_csv("/content/PubMedQA_dataset/test_sample_100.csv")
        sample_size = min(20, len(test_data))
        print(f" Loaded {len(test_data)} samples, evaluating {sample_size}")
    except Exception as e:
        print(f" Cannot load test data: {e}")
        return manual_non_rag_evaluation()

    print(f"üéØ NON-RAG REALISTIC Evaluation on {sample_size} samples")

    collected = []

    for idx, row in test_data.head(sample_size).iterrows():
        # One failing row must not abort the run, so the whole row is guarded.
        try:
            # Every fifth row (by index label): release memory and pause 1 s.
            if idx % 5 == 0:
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                time.sleep(1)

            # The question text is taken from the first CSV column.
            question = str(row[test_data.columns[0]])
            if len(question) < 10:
                print(f" Skip sample {idx+1}: question too short")
                continue

            result = answer_biomedical_question_non_rag_safe(question)
            collected.append(result)

            print(f" {idx+1}/{sample_size}: {result['short_answer']} - {result['detailed_answer'][:50]}...")

        except Exception as e:
            print(f" Sample {idx+1} failed: {e}")

    # Each stored result is exactly one success, so the count is the length.
    return collected, len(collected)

def answer_biomedical_question_non_rag_safe(question):
    """Answer one question with the Non-RAG pipeline (classifier + generator).

    Args:
        question: Biomedical question text.

    Returns:
        dict: Keys ``question``, ``short_answer`` (label from the classifier,
        ``"maybe"`` if classification raises), ``detailed_answer`` (text from
        safe_non_rag_generate), empty ``retrieved_evidence`` and
        ``relevance_scores`` lists, and ``method`` set to ``"non_rag_safe"``.
    """
    # Default label; replaced only when classification succeeds.
    short_answer = "maybe"
    try:
        short_answer = classify_non_rag_answer(question)
    except Exception as e:
        print(f"‚ùå Classification failed: {e}")

    answer_record = {"question": question, "short_answer": short_answer}
    answer_record["detailed_answer"] = safe_non_rag_generate(question)
    # No retrieval happens in the Non-RAG path, so the evidence fields stay empty.
    answer_record["retrieved_evidence"] = []
    answer_record["relevance_scores"] = []
    answer_record["method"] = "non_rag_safe"
    return answer_record

def safe_non_rag_generate(question, max_retries=2):
    """Generate a detailed answer with the LLaMA model, retrying on failure.

    Args:
        question: Biomedical question text inserted into the prompt.
        max_retries: Maximum number of generation attempts.

    Returns:
        str: The stripped model output from the first attempt that yields at
        least 15 characters. If every attempt fails, a fallback message that
        embeds the label from classify_non_rag_answer(). A string is always
        returned, including when ``max_retries`` is zero or negative.
    """
    # The prompt is identical for every attempt, so build it once.
    prompt = f"""[INST] <<SYS>>
You are a professional medical assistant. Answer the biomedical question based on your knowledge.
Provide a concise but informative answer with:
- Clear yes/no/maybe indication
- Brief explanation
- Note uncertainties if any

Question: {question}
<</SYS>>

Please provide your medical opinion: [/INST]"""

    for attempt in range(max_retries):
        try:
            # Free memory before each attempt.
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            response = llm(
                prompt,
                max_tokens=200,
                temperature=0.1,
                top_p=0.9,
                echo=False,
                stop=["</s>", "[INST]"],
                stream=False
            )

            answer = response['choices'][0]['text'].strip()

            # Treat near-empty output as a failure so it is retried.
            if len(answer) < 15:
                raise ValueError("Generated answer too short")

            return answer

        except Exception as e:
            print(f" Generation attempt {attempt + 1} failed: {e}")
            # Back off before the next attempt; no pause after the last one.
            if attempt < max_retries - 1:
                time.sleep(2)

    # Reached when all attempts failed, or when max_retries <= 0 (previously
    # that case fell off the end of the function and returned None).
    return f" Could not generate detailed answer. Short answer: {classify_non_rag_answer(question)}"

def classify_non_rag_answer(question):
    """Classify a question as "yes", "no" or "maybe" without any context.

    Args:
        question: Biomedical question text.

    Returns:
        str: ``"yes"`` for predicted class 0, ``"no"`` for class 1 and
        ``"maybe"`` for any other class. ``"maybe"`` is also returned when
        anything in the classification path raises.
    """
    # NOTE(review): this index -> label order presumably matches the label
    # encoding used during fine-tuning -- confirm against the training code.
    label_by_index = {0: "yes", 1: "no"}

    try:
        # The second text segment is an empty string (no context is supplied).
        encoded = tokenizer_classify(
            question,
            "",
            truncation=True,
            padding="max_length",
            max_length=256,
            return_tensors="pt"
        )

        # Move every encoded tensor to the classifier's device.
        model_inputs = {}
        for name, tensor in encoded.items():
            model_inputs[name] = tensor.to(device)

        with torch.no_grad():
            logits = model_classify(**model_inputs).logits

        predicted_index = torch.argmax(logits, dim=1).item()
        return label_by_index.get(predicted_index, "maybe")

    except Exception as e:
        print(f" Classification error: {e}")
        return "maybe"

def manual_non_rag_evaluation():
    """Fallback evaluation on three hard-coded questions.

    Used when the automatic evaluation cannot run. Each question goes through
    answer_biomedical_question_non_rag_safe(); a question that raises is
    reported and skipped.

    Returns:
        tuple: ``(results, count)`` where ``count`` equals ``len(results)``.
    """
    print("üîß Falling back to MANUAL Non-RAG evaluation...")

    sample_questions = (
        "Does aspirin reduce heart attack risk in diabetic patients?",
        "Is metformin effective for weight loss?",
        "Can vitamin D prevent respiratory infections?",
    )

    answered = []
    for prompt_text in sample_questions:
        try:
            outcome = answer_biomedical_question_non_rag_safe(prompt_text)
        except Exception as e:
            print(f" Manual sample failed: {e}")
            continue

        answered.append(outcome)
        # A failure while printing the preview is reported like any other
        # failure, but the result stays stored.
        try:
            print(f" Manual: {outcome['short_answer']} - {outcome['detailed_answer'][:60]}...")
        except Exception as e:
            print(f" Manual sample failed: {e}")

    return answered, len(answered)

def analyze_non_rag_results(results):
    """Print a summary of Non-RAG evaluation results.

    Prints the number of results, the distribution of short answers, the
    average length of the detailed answers and a preview of the first three
    results.

    Args:
        results: List of result dicts, each with ``question``,
            ``short_answer`` and ``detailed_answer`` keys.

    Returns:
        None. An empty ``results`` only prints a notice.
    """
    from collections import Counter

    if not results:
        print("‚ùå No results to analyze!")
        return

    print(f"\n NON-RAG EVALUATION SUMMARY:")
    print(f" Total successful: {len(results)} samples")

    # Single-pass count. Counter preserves first-seen order, so the printed
    # distribution is stable across runs (iterating a set of strings is not).
    answer_counts = dict(Counter(r['short_answer'] for r in results))
    print(f"üìà Answer distribution: {answer_counts}")

    avg_length = sum(len(r['detailed_answer']) for r in results) / len(results)
    print(f"üìè Average answer length: {avg_length:.1f} characters")

    print(f"\nüîç QUALITY SAMPLES:")
    for number, result in enumerate(results[:3], start=1):
        print(f"\n{number}. Q: {result['question'][:70]}...")
        print(f"   Short: {result['short_answer']}")
        print(f"   Detailed: {result['detailed_answer'][:100]}...")

# Script entry point: run the evaluation, print the analysis and, when at
# least one sample succeeded, save the results as JSON.
if __name__ == "__main__":
    print("üöÄ Starting REALISTIC NON-RAG evaluation...")
    results, success_count = robust_non_rag_evaluation()

    analyze_non_rag_results(results)

    if success_count <= 0:
        print(" No successful evaluations to save!")
    else:
        payload = {
            "evaluation_type": "non_rag_realistic",
            "sample_count": success_count,
            "results": results,
        }
        with open('/content/non_rag_realistic_results.json', 'w') as f:
            json.dump(payload, f, indent=2)
        print(f"\n Non-RAG realistic results saved: {success_count} samples")

# In [None]:
import json
from rouge_score import rouge_scorer
import bert_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import pandas as pd
import numpy as np

# Load the generation results saved by the evaluation step. This is a
# best-effort load: on any failure the metrics below run on an empty list.
try:
    with open('/content/non_rag_realistic_results.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    results = data['results']
    print(f" Loaded {len(results)} generation results")
except Exception as e:
    # `except Exception` (not a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate, and the reason is printed instead of hidden.
    print(f" Cannot load generation results ({e}), using empty list")
    results = []

print(f"üìä Calculating STANDARD metrics on {len(results)} samples...")

def pick_column_by_keyword(columns, keywords, exclude=()):
    """Return the first column whose lowercase name contains a keyword.

    Keywords are tried in priority order, so a column matching an earlier
    keyword always wins over one matching a later keyword. For one keyword
    the left-most matching column wins.

    Args:
        columns: Iterable of column names (non-strings are compared via str()).
        keywords: Lowercase substrings to look for, most specific first.
        exclude: Column names that must not be returned.

    Returns:
        The matching column name, or None when nothing matches.
    """
    candidates = [col for col in columns if col not in exclude]
    for keyword in keywords:
        for col in candidates:
            if keyword in str(col).lower():
                return col
    return None


# Build a {question: ground-truth answer} mapping from the test CSV. On any
# failure the mapping is left empty.
try:
    test_file_path = "/content/PubMedQA_dataset/test_sample_100.csv"
    test_data = pd.read_csv(test_file_path)

    # The first match by priority wins. Previously the last matching column
    # won, and because 'text' is a substring of 'context' (and 'q' of many
    # names) a real 'question' column could be overwritten by a later column.
    # NOTE(review): the generation step reads questions from the FIRST CSV
    # column; if the column detected here differs, exact lookups will miss.
    question_col = pick_column_by_keyword(
        test_data.columns, ['question', 'text', 'q']
    )
    if question_col is None:
        question_col = test_data.columns[0]

    # The answer column may never be the same column as the question.
    answer_col = pick_column_by_keyword(
        test_data.columns, ['answer', 'reference', 'ground'], exclude=(question_col,)
    )
    if answer_col is None:
        # Fall back to the first column that is not the question column.
        remaining = [col for col in test_data.columns if col != question_col]
        answer_col = remaining[0] if remaining else None

    print(f"üîç Detected columns - Question: {question_col}, Answer: {answer_col}")

    gt_mapping = {}
    for _, row in test_data.iterrows():
        raw_question = row[question_col]
        question = str(raw_question).strip() if pd.notna(raw_question) else ""
        # Skip blank questions: an empty key would be a substring of every
        # question and so match everything in a later prefix lookup.
        if not question:
            continue

        ground_truth = ""
        if answer_col is not None and pd.notna(row[answer_col]):
            ground_truth = str(row[answer_col]).strip()

        gt_mapping[question] = ground_truth

    print(f" Loaded {len(gt_mapping)} ground truth mappings")

except Exception as e:
    print(f" Cannot load ground truths: {e}")
    gt_mapping = {}

# Pair each generated answer with a ground-truth answer. The three lists
# stay index-aligned: entry i of each describes the same sample.
generated_answers = []
ground_truths = []
valid_samples = []

for result in results:
    question = result['question'].strip()
    generated = result['detailed_answer'].strip()

    try:
        # Exact question match first.
        reference_answer = gt_mapping[question]
    except KeyError:
        # Otherwise take the first known question sharing a 30-character
        # prefix in either direction; "" when none does.
        question_prefix = question[:30]
        reference_answer = next(
            (
                known_answer
                for known_question, known_answer in gt_mapping.items()
                if question_prefix in known_question or known_question[:30] in question
            ),
            "",
        )

    # Samples without a (non-empty) ground truth are left out.
    if not reference_answer:
        continue

    generated_answers.append(generated)
    ground_truths.append(reference_answer)
    valid_samples.append({
        'question': question,
        'generated': generated,
        'ground_truth': reference_answer,
        'short_answer': result.get('short_answer', 'unknown')
    })

print(f"üîç Found {len(valid_samples)} valid samples with ground truth")

if not valid_samples:
    # Nothing matched: build a synthetic reference sentence from each
    # result's short answer so the metrics below still have input.
    print(" No ground truth matches found. Using short answers as proxy...")
    for result in results:
        short_answer = result.get('short_answer', 'maybe')
        gt_proxy = f"This is a {short_answer} answer according to the classification."
        detailed = result['detailed_answer']

        generated_answers.append(detailed)
        ground_truths.append(gt_proxy)
        valid_samples.append({
            'question': result['question'],
            'generated': detailed,
            'ground_truth': gt_proxy,
            'short_answer': short_answer
        })

print("\nüéØ CALCULATING ROUGE METRICS...")
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for gen, gt in zip(generated_answers, ground_truths):
    try:
        scores = scorer.score(gt, gen)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)
    except Exception as e:
        print(f" ROUGE calculation error: {e}")
        rouge1_scores.append(0.0)
        rouge2_scores.append(0.0)
        rougeL_scores.append(0.0)

rouge1_avg = np.mean(rouge1_scores) if rouge1_scores else 0.0
rouge2_avg = np.mean(rouge2_scores) if rouge2_scores else 0.0
rougeL_avg = np.mean(rougeL_scores) if rougeL_scores else 0.0

# Sentence-level BLEU per sample on whitespace tokens, smoothed with NLTK's
# SmoothingFunction().method4.
print(" CALCULATING BLEU METRICS...")
smooth = SmoothingFunction().method4
bleu_scores = []

for gen, gt in zip(generated_answers, ground_truths):
    try:
        reference = [gt.split()]
        candidate = gen.split()

        # BLEU is undefined for an empty side; score such samples as 0.0.
        if reference[0] and candidate:
            bleu_score = sentence_bleu(reference, candidate, smoothing_function=smooth)
            bleu_scores.append(bleu_score)
        else:
            bleu_scores.append(0.0)
    except Exception as e:
        # Report the failure (as the ROUGE step does) instead of silently
        # swallowing it; the sample still scores 0.0 so lists stay aligned.
        print(f" BLEU calculation error: {e}")
        bleu_scores.append(0.0)

bleu_avg = np.mean(bleu_scores) if bleu_scores else 0.0

# BERTScore F1 over all samples. Any failure leaves both statistics at 0.0.
print(" CALCULATING BERTScore...")
try:
    if generated_answers:
        P, R, F1 = bert_score.score(generated_answers, ground_truths, lang="en", verbose=True)
        bertscore_avg = F1.mean().item()
        # Population standard deviation via np.std, matching the other
        # metrics' spread. The tensor's own .std() defaults to the sample
        # standard deviation, which is NaN for a single sample.
        bertscore_std = float(np.std(F1.tolist()))
    else:
        # Nothing to score: skip the scoring call and report zeros, like the
        # other metrics do for empty input.
        bertscore_avg = 0.0
        bertscore_std = 0.0
except Exception as e:
    print(f" BERTScore calculation failed: {e}")
    bertscore_avg = 0.0
    bertscore_std = 0.0

# Console report: headline metrics (mean with standard deviation), the
# short-answer distribution and up to three side-by-side sample comparisons.
divider = "=" * 60
print("\n" + divider)
print(" STANDARD GENERATION METRICS - NON-RAG SYSTEM")
print(divider)
print(f" Sample Size: {len(valid_samples)}")
print(f" ROUGE-1:  {rouge1_avg:.4f} (¬±{np.std(rouge1_scores):.4f})")
print(f" ROUGE-2:  {rouge2_avg:.4f} (¬±{np.std(rouge2_scores):.4f})")
print(f" ROUGE-L:  {rougeL_avg:.4f} (¬±{np.std(rougeL_scores):.4f})")
print(f" BLEU:     {bleu_avg:.4f} (¬±{np.std(bleu_scores):.4f})")
print(f" BERTScore: {bertscore_avg:.4f} (¬±{bertscore_std:.4f})")

# `answer_counts` is only defined when there is at least one valid sample.
if len(valid_samples) > 0:
    short_answers = []
    answer_counts = {}
    for entry in valid_samples:
        label = entry['short_answer']
        short_answers.append(label)
        answer_counts[label] = answer_counts.get(label, 0) + 1
    print(f"üìä Answer Distribution: {answer_counts}")

print(f"\nüîç SAMPLE COMPARISONS:")
for i, sample in enumerate(valid_samples[:3]):
    print(f"\n{i+1}. Q: {sample['question'][:60]}...")
    print(f"   Generated: {sample['generated'][:80]}...")
    print(f"   Ground Truth: {sample['ground_truth'][:80]}...")
    print(f"   Metrics - ROUGE-1: {rouge1_scores[i]:.3f}, BLEU: {bleu_scores[i]:.3f}")

# Assemble the summary and per-sample scores, then write them to JSON.
metrics_summary = {
    'rouge1': rouge1_avg,
    'rouge2': rouge2_avg,
    'rougeL': rougeL_avg,
    'bleu': bleu_avg,
    'bertscore': bertscore_avg,
    'sample_size': len(valid_samples),
    # `answer_counts` only exists when there is at least one valid sample.
    'answer_distribution': answer_counts if valid_samples else {}
}

per_sample_records = []
for position, record in enumerate(valid_samples):
    scored_record = {
        key: record[key]
        for key in ('question', 'generated', 'ground_truth', 'short_answer')
    }
    scored_record['rouge1'] = rouge1_scores[position]
    scored_record['rouge2'] = rouge2_scores[position]
    scored_record['rougeL'] = rougeL_scores[position]
    # Defensive: use 0.0 if the BLEU list is shorter than the sample list.
    scored_record['bleu'] = bleu_scores[position] if position < len(bleu_scores) else 0.0
    per_sample_records.append(scored_record)

detailed_metrics = {
    'summary': metrics_summary,
    'per_sample_scores': per_sample_records
}

# ensure_ascii=False keeps non-ASCII text readable in the file, hence the
# explicit UTF-8 encoding.
with open('/content/non_rag_standard_metrics.json', 'w', encoding='utf-8') as f:
    json.dump(detailed_metrics, f, indent=2, ensure_ascii=False)

print(f"\n Standard metrics saved to: /content/non_rag_standard_metrics.json")
print(" Evaluation with STANDARD metrics completed! ")