### This Notebook is for Retrieving data with e5-small-v2 embedding model and generate answers with Mistral7b LLM Model

In [1]:
import json
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from transformers import pipeline
import torch
from qdrant_client import QdrantClient
from qdrant_client.http import models as qdrant_models
import pandas as pd

MODEL_NAME = "intfloat/e5-small-v2"
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333
COLLECTION_NAME = "pmc_e5_full_docs"
EMBED_DIM = 384

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()
if torch.cuda.is_available():
    model.cuda()

In [None]:
def get_embedding(text):
    inp = f"passage: {text.strip()}"
    encoded = tokenizer(inp, return_tensors="pt", truncation=True, max_length=512)
    if torch.cuda.is_available():
        encoded = {k: v.cuda() for k, v in encoded.items()}
    with torch.no_grad():
        out = model(**encoded)
    return out.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
print("Qdrant connection ok")

Qdrant connection ok


In [None]:
query = "Prompting Primary Care Providers about Increased Patient Risk As a Result of Family History: Does It Work?"
query_emb = get_embedding(query)

hits = client.search(
    collection_name=COLLECTION_NAME,
    query_vector=query_emb.tolist(),
    limit=3
)

context_chunks = []
for hit in hits:
    p = hit.payload
    chunk = []
    if p.get("title"):
        chunk.append(f"Title: {p['title']}")
    if p.get("abstract"):
        chunk.append(f"Abstract: {p['abstract']}")

    
    context_chunks.append("\n".join(chunk))

retrieved_context = "\n\n---\n\n".join(context_chunks)


retrieved_context = "\n\n".join(context_chunks)
print("Retrieved context:", retrieved_context)


Retrieved context: Title: How is family health history discussed in routine primary healthcare? A qualitative study of archived family doctor consultations
Abstract: Family health history underpins genetic medicine. Our study aimed to explore language and patterns of communication relating to family health history observed in interactions between general practitioners (GPs) and their patients within routine primary care consultations. Secondary analysis of patient and GP routine consultation data (n=252). Consultations that included ‘family health history’ were eligible for inclusion (n=58). A qualitative inductive analysis of the interactions from consultation transcripts. 46/58 conversations about family health history were initiated by the GP. Most discussions around family history lasted for between approximately 1 to 2 min. Patients were invited to share family health history through one of two ways: non-specific enquiry (eg, by asking the patient about ‘anything that runs in the 

  hits = client.search(


In [4]:
model_name = "Qwen/Qwen1.5-1.8B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (down_proj): Linear(in_features=5504, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((2048,), eps=1e-06)
    (rotary_emb): 

In [None]:
system_message = (
    "You are PubMed QA-Bot, a cautious biomedical expert.\n"
    "• Read the CONTEXT exactly as given—do not invent facts.\n"
    "• If the answer is present, quote or paraphrase only material you can trace to a specific sentence in the CONTEXT.\n"
    "• If the answer is partly or wholly absent, reply exactly with: 'INSUFFICIENT CONTEXT'.\n"
    "• Use concise scientific language and original terminology from the CONTEXT.\n"
    "• Write the ANSWER in ≤ 40 words. Do NOT include citations in the ANSWER line.\n"
    "• After the ANSWER, output a line that starts with 'EVIDENCE:' followed by the sentence IDs you used.\n"
)

demo_user = (
    "QUESTION:\nWhat is the main benefit of opioid patient-controlled therapy (PCT) in palliative care patients?\n\n"
    "CONTEXT:\n"
    "[1] Opioid PCT allows patients to self-administer small boluses...\n"
)
demo_assistant = (
    "ANSWER: Opioid PCT safely reduces refractory breathlessness in palliative patients while giving them direct symptom control.\n"
    "EVIDENCE: 1-2"
)

user_message = (
    f"QUESTION:\n{query}\n\n"
    f"CONTEXT:\n{retrieved_context}"
)

messages = [
    {"role": "system", "content": system_message},
    # {"role": "user", "content": demo_user},
    # {"role": "assistant", "content": demo_assistant},
    {"role": "user", "content": user_message},
]

In [6]:
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(device)
print("Token count:", len(tokenizer(prompt)["input_ids"]))

print("Tokenization complete")

print("Prompt length:", len(prompt))
outputs = model.generate(
    **inputs,
    max_new_tokens=300,
    do_sample=False
)
generated = outputs[0][inputs["input_ids"].shape[1]:]
answer = tokenizer.decode(generated, skip_special_tokens=True).strip()

print("Question:", query)
print("Answer:", answer)


The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Token count: 1147
Tokenization complete
Prompt length: 6223
Question: Prompting Primary Care Providers about Increased Patient Risk As a Result of Family History: Does It Work?
Answer: EVIDENCE: 
1. "Family health history is regarded as a genetic test and is embedded in the sociocultural norms of the patient from whom information is being sought." - Sentence ID: 3
2. "Our findings highlight that it is more complex than asking simply if ‘anything’ runs in the family." - Sentence ID: 46
3. "As the collection of family health history is expected to be more routine, it will be important to also consider it from sociocultural perspectives in order to help mitigate any inequities in how family history is collected, and therefore used (or not) in a person's healthcare." - Sentence ID: 58
4. "Both are likely to have enduring effects in the post-pandemic world, raising the importance of focusing on lessons that can be learned for the future." - Sentence ID: 70
5. "We identified two primary them

In [None]:
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()