### This Notebook is for Retrieving data with e5-small-v2 embedding model and generate answers with Mistral7b LLM Model

In [1]:
import json
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from transformers import pipeline
import torch
from qdrant_client import QdrantClient
from qdrant_client.http import models as qdrant_models
import pandas as pd

MODEL_NAME = "intfloat/e5-small-v2"
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333
COLLECTION_NAME = "pmc_chunked_title_abstract"
EMBED_DIM = 384

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()
if torch.cuda.is_available():
    model.cuda()

In [None]:
def get_embedding(text):
    inp = f"passage: {text.strip()}"
    encoded = tokenizer(inp, return_tensors="pt", truncation=True, max_length=512)
    if torch.cuda.is_available():
        encoded = {k: v.cuda() for k, v in encoded.items()}
    with torch.no_grad():
        out = model(**encoded)
    return out.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
print("Qdrant connection ok")

Qdrant connection ok


In [3]:
query = "Patient-Controlled Therapy of Breathlessness in Palliative Care: A New Therapeutic Concept for Opioid Administration?"
query_emb = get_embedding(query)

hits = client.search(
    collection_name=COLLECTION_NAME,
    query_vector=query_emb.tolist(),
    limit=3
)

context_chunks = []
for hit in hits:
    cos_sim = 1.0 - hit.score
    print(f"Doc ID {hit.id} — cosine similarity: {cos_sim:.4f}")
    print(f"Score: {hit.score}")
    p = hit.payload
    chunk = []
    if p.get("title"):
        chunk.append(f"Title: {p['title']}")
        if p.get("abstract"):
            chunk.append(f"Abstract: {p['abstract']}")

    
    context_chunks.append("\n".join(chunk))

retrieved_context = "\n\n---\n\n".join(context_chunks)


retrieved_context = "\n\n".join(context_chunks)
print("Retrieved context:", retrieved_context)


Doc ID 1197115 — cosine similarity: 0.0912
Score: 0.9088169
Doc ID 983637 — cosine similarity: 0.1108
Score: 0.88918495
Doc ID 192456 — cosine similarity: 0.1127
Score: 0.8873118
Retrieved context: Title: Patient-Controlled Therapy with Intravenous Oxycodone in Breathlessness due to Advanced Cancer: A Case Report
Abstract: Dyspnoea is a debilitating symptom in medicine, especially in palliative care. Opioids are the pharmacological agents of choice in the treatment of dyspnoea in palliative medicine. Morphine is the best-studied opioid, and recent literature on oxycodone is encouraging. In refractory cases, opioid infusion and palliative sedation may have to be used. We present a case that used oxycodone in a patient-controlled device specifically for dyspnoea and its effects in relieving dyspnoea in a fast and timely manner. This helped in meeting the demands of the patient and relieving suffering rapidly with less sedation. This case report is unique in the use of an oxycodone patien

  hits = client.search(


In [4]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto",
    torch_dtype=torch.float16
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): Mist

In [None]:
system_message = (
    "You are PubMed QA-Bot, a cautious biomedical expert.\n"
    "• Read the CONTEXT exactly as given—do not invent facts.\n"
    "• If the answer is present, quote or paraphrase only material you can trace to a specific sentence in the CONTEXT.\n"
    "• If the answer is partly or wholly absent, reply exactly with: 'INSUFFICIENT CONTEXT'.\n"
    "• Use concise scientific language and original terminology from the CONTEXT.\n"
    "• Write the ANSWER in ≤ 40 words. Do NOT include citations in the ANSWER line.\n"
    "• After the ANSWER, output a line that starts with 'EVIDENCE:' followed by the sentence IDs you used.\n"
)

demo_user = (
    "QUESTION:\nWhat is the main benefit of opioid patient-controlled therapy (PCT) in palliative care patients?\n\n"
    "CONTEXT:\n"
    "[1] Opioid PCT allows patients to self-administer small boluses...\n"
)
demo_assistant = (
    "ANSWER: Opioid PCT safely reduces refractory breathlessness in palliative patients while giving them direct symptom control.\n"
    "EVIDENCE: 1-2"
)

user_message = (
    f"QUESTION:\n{query}\n\n"
    f"CONTEXT:\n{retrieved_context}"
)

messages = [
    {"role": "system", "content": system_message},
    # {"role": "user", "content": demo_user},
    # {"role": "assistant", "content": demo_assistant},
    {"role": "user", "content": user_message},
]


In [6]:
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(device)
print("Token count:", len(tokenizer(prompt)["input_ids"]))

print("Tokenization complete")

print("Prompt length:", len(prompt))
outputs = model.generate(
    **inputs,
    max_new_tokens=300,
    do_sample=False
)
generated = outputs[0][inputs["input_ids"].shape[1]:]
answer = tokenizer.decode(generated, skip_special_tokens=True).strip()

print("Question:", query)
print("Answer:", answer)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Token count: 1515
Tokenization complete
Prompt length: 5752




Question: Patient-Controlled Therapy of Breathlessness in Palliative Care: A New Therapeutic Concept for Opioid Administration?
Answer: ANSWER:
The case report describes the use of a patient-controlled oxycodone device for dyspnoea relief in palliative care, while the study investigates long-term opioid prescription for chronic breathlessness in non-malignant respiratory diseases. No specific information is provided regarding genetic factors influencing opioid response for breathlessness in these texts.

EVIDENCE:
1. Case Report: Abstract and specific sentence: "We present a case that used oxycodone in a patient-controlled device specifically for dyspnoea and its effects in relieving dyspnoea in a fast and timely manner."
2. Study: Abstract and specific sentences: "To investigate the long-term efficacy of, and side-effects from, opioids prescribed for chronic breathlessness to patients with advanced, non-malignant, respiratory diseases." "No associations were seen with fentanyl nor wit