In [8]:
import torch
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import nltk
nltk.download("wordnet")
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from rapidfuzz import fuzz

from huggingface_hub import login

login(token="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")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# !pip install rapidfuzz
# !pip install fuzzywuzzy

In [9]:
kb_df = pd.read_csv("./unique_diseases_with_NHS.csv", encoding='latin1')
kb_df["entry_text"] = kb_df.apply(
    lambda row: f"Disease: {row['Disease']}. Symptoms: {row['Symptoms']}. Guidelines: {row['Guidelines']}.",
    axis=1
)
corpus = kb_df["entry_text"].tolist()

embedder = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
corpus_embeddings = embedder.encode(corpus, convert_to_numpy=True)

embedding_dim = corpus_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(corpus_embeddings)

In [6]:
test_df = pd.read_csv("./Final_Prompt-Style_Test_Set_V2.csv")
prompts = test_df["prompt"].tolist()

def retrieve_knowledge(query, top_k=3):
    query_embedding = embedder.encode([query], convert_to_numpy=True)
    distances, indices = index.search(np.array(query_embedding).astype('float32'), top_k)
    return [corpus[idx] for idx in indices[0]]

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig

# model_path = "./lora_llama_medical_finetuned"

# tokenizer = AutoTokenizer.from_pretrained(model_path)
# model = AutoModelForCausalLM.from_pretrained(
#     model_path,
#     torch_dtype=torch.float16,
#     device_map="auto"
# )

MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

ADAPTER_PATH = "./lora_llama_medical_finetuned"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto"
)

model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
model.eval()

def generate_answer(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            eos_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

def extract_answer(output_text):
    if "Doctor:" in output_text:
        return output_text.split("Doctor:")[-1].strip()
    elif "Output:" in output_text:
        return output_text.split("Output:")[-1].strip()
    else:
        return output_text.strip()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  adapters_weights = torch.load(filename, map_location=torch.device(device))


In [11]:
results = []
for i, row in tqdm(test_df.iterrows(), total=len(test_df)):
    user_input = row["prompt"]
    ground_truth = row["diagnosis"]

    retrieved = retrieve_knowledge(user_input, top_k=3)
    rag_context = "\n".join(retrieved)

    full_prompt = f"""{rag_context}\nInstruction: {user_input}\nDoctor:"""
    generated_output = generate_answer(full_prompt)
    extracted = extract_answer(generated_output)

    results.append({
        "prompt": user_input,
        "ground_truth": ground_truth,
        "retrieved_knowledge": rag_context,
        "generated_output": generated_output,
        "extracted_diagnosis": extracted
    })

result_df = pd.DataFrame(results)
result_df.to_csv("RAG_diagnosis_predictions_V2.csv", index=False)
print("结果已保存到 RAG_diagnosis_predictions_V2.csv")

  0%|          | 0/984 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  0%|          | 1/984 [00:08<2:13:02,  8.12s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  0%|          | 2/984 [00:15<2:04:19,  7.60s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  0%|          | 3/984 [00:22<2:02:56,  7.52s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  0%|          | 4/984 [00:30<2:00:56,  7.40s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  1%|          | 5/984 [00:37<1:59:37,  7.33s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  1%|          | 6/984 [00:44<1:59:01,  7.30s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  1%|          | 7/984 [00:51<1:58:16,  7.26s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  1%|          | 8/984 [00:58<1:57

结果已保存到 RAG_diagnosis_predictions_V2.csv





In [5]:
import pandas as pd
import numpy as np
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score
from fuzzywuzzy import fuzz
import re

def compute_rouge(refs, preds):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    rouge1, rougel = [], []
    for ref, pred in zip(refs, preds):
        scores = scorer.score(ref, pred)
        rouge1.append(scores['rouge1'].fmeasure)
        rougel.append(scores['rougeL'].fmeasure)
    return np.mean(rouge1), np.mean(rougel)

def compute_avg_meteor(refs, preds, verbose=True):
    scores = []
    for i, (ref, pred) in enumerate(zip(refs, preds)):
        score = meteor_score([ref.split()], pred.split())
        scores.append(score)        
        # if verbose:
        #     print(f"[{i}]")
        #     print(f"GT   : {ref}")
        #     print(f"PRED : {pred}")
        #     print(f"METEOR : {score:.4f}")
        #     print("-" * 40)
    return np.mean(scores)

def compute_fuzzy_accuracy(refs, preds, threshold=80):
    matches = [fuzz.partial_ratio(ref.lower(), pred.lower()) >= threshold for ref, pred in zip(refs, preds)]
    return sum(matches) / len(matches)


df = pd.read_csv("RAG_diagnosis_predictions_V2.csv")
# gt_list = df["ground_truth"].astype(str).tolist()
gt_list = df["ground_truth"].astype(str).str.strip().str.lower().tolist()


def clean_prediction(pred):
    match = re.search(r"Diagnosis:\s*(.+)", pred, re.IGNORECASE)
    if match:
        diagnosis_text = match.group(1).strip()

        advice_idx = diagnosis_text.lower().find(". advice:")
        if advice_idx != -1:
            diagnosis_text = diagnosis_text[:advice_idx]

        diagnosis_first_line = re.split(r"\s*\n", diagnosis_text)[0].strip()
        return diagnosis_first_line.lower()
    else:
        return pred.strip().lower()

pred_list = df["extracted_diagnosis"].astype(str).apply(clean_prediction).tolist()

r1, rl = compute_rouge(gt_list, pred_list)
meteor = compute_avg_meteor(gt_list, pred_list)
fuzzy_acc = compute_fuzzy_accuracy(gt_list, pred_list)

print(f"ROUGE-1 F1: {r1:.2f}")
print(f"ROUGE-L F1: {rl:.2f}")
print(f"METEOR Score: {meteor:.2f}")
print(f"Fuzzy Matching Accuracy: {fuzzy_acc * 100:.2f}%")


ROUGE-1 F1: 0.92
ROUGE-L F1: 0.92
METEOR Score: 0.65
Fuzzy Matching Accuracy: 95.83%


In [6]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def compute_avg_bleu(refs, preds, verbose=True):
    smoothie = SmoothingFunction().method4
    bleu_scores = []

    for i, (ref, pred) in enumerate(zip(refs, preds)):
        ref_tokens = ref.split()
        pred_tokens = pred.split()
        bleu = sentence_bleu([ref_tokens], pred_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothie)
        bleu_scores.append(bleu)
        
        # if verbose:
        #     print(f"[{i}]")
        #     print(f"GT   : {ref}")
        #     print(f"PRED : {pred}")
        #     print(f"BLEU : {bleu:.4f}")
        #     print("-" * 40)

    return np.mean(bleu_scores)

bleu = compute_avg_bleu(gt_list, pred_list)
print(f"BLEU Score: {bleu:.2f}")


BLEU Score: 0.91
