### This Notebook is for Retrieving data with e5-small-v2 embedding model and generate answers with Mistral7b LLM Model

In [1]:
import json
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from transformers import pipeline
import torch
from qdrant_client import QdrantClient
from qdrant_client.http import models as qdrant_models
import pandas as pd

MODEL_NAME = "intfloat/e5-small-v2"
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333
COLLECTION_NAME = "pqa_labeled"
EMBED_DIM = 384

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()
if torch.cuda.is_available():
    model.cuda()

In [None]:
def get_embedding(text):
    inp = f"passage: {text.strip()}"
    encoded = tokenizer(inp, return_tensors="pt", truncation=True, max_length=512)
    if torch.cuda.is_available():
        encoded = {k: v.cuda() for k, v in encoded.items()}
    with torch.no_grad():
        out = model(**encoded)
    return out.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
print("Qdrant connection ok")

Qdrant connection ok


In [None]:
query = "Syncope during bathing in infants, a pediatric form of water-induced urticaria?"
query_emb = get_embedding(query)

hits = client.search(
    collection_name=COLLECTION_NAME,
    query_vector=query_emb.tolist(),
    limit=1
)

context_chunks = []
for hit in hits:
    p = hit.payload
    chunk = []
    if p.get("Knowledge"):
        chunk.append(f"Knowledge: {p['Knowledge']}")

    
    context_chunks.append("\n".join(chunk))

retrieved_context = "\n\n---\n\n".join(context_chunks)


retrieved_context = "\n\n".join(context_chunks)
print("Retrieved context:", retrieved_context)


Retrieved context: Knowledge: ['Apparent life-threatening events in infants are a difficult and frequent problem in pediatric practice. The prognosis is uncertain because of risk of sudden infant death syndrome.'
 'Eight infants aged 2 to 15 months were admitted during a period of 6 years; they suffered from similar maladies in the bath: on immersion, they became pale, hypotonic, still and unreactive; recovery took a few seconds after withdrawal from the bath and stimulation. Two diagnoses were initially considered: seizure or gastroesophageal reflux but this was doubtful. The hypothesis of an equivalent of aquagenic urticaria was then considered; as for patients with this disease, each infant\'s family contained members suffering from dermographism, maladies or eruption after exposure to water or sun. All six infants had dermographism. We found an increase in blood histamine levels after a trial bath in the two infants tested. The evolution of these "aquagenic maladies" was favourable

  hits = client.search(


In [4]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto",
    torch_dtype=torch.float16
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): Mist

In [5]:
system_message = (
    "You are a medical assistant answering user queries concisely and accurately. "
    "Always rely strictly on the retrieved context provided to answer medical questions. "
    "Provide your answers in 2-3 clear, informative sentences, avoiding speculation. Do NOT use bullet points or lists."
    "Example:\n"
    "Question: What are the outcomes of two different techniques for cataract surgery?\n"
    "Answer: Both techniques for cataract surgery are effective, and most patients recover well. There are no major differences in safety or vision outcomes between them. The choice usually depends on the surgeon’s experience and the patient’s unique needs. In general, both approaches are considered safe and successful. Your doctor can help choose the best option for you.\n\n"
    f"Retrieved Context:\n{retrieved_context}"
)

messages_with_context = [
    {
        "role": "system",
        "content": system_message,
    },
    {
        "role": "user",
        "content": query,
    },
]

In [6]:
prompt = tokenizer.apply_chat_template(messages_with_context, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(device)
print("Token count:", len(tokenizer(prompt)["input_ids"]))

print("Tokenization complete")

print("Prompt length:", len(prompt))
outputs = model.generate(
    **inputs,
    max_new_tokens=300,
    do_sample=False,
    temperature=0.1,
    top_p=0.95
)
generated = outputs[0][inputs["input_ids"].shape[1]:]
answer = tokenizer.decode(generated, skip_special_tokens=True).strip()

print("Question:", query)
print("Answer:", answer)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Token count: 460
Tokenization complete
Prompt length: 1991




Question: Syncope during bathing in infants, a pediatric form of water-induced urticaria?
Answer: Syncope during bathing in infants can be a concerning issue. While it was initially thought to be related to seizures or gastroesophageal reflux, recent research suggests it may be related to aquagenic urticaria. This condition is characterized by hives or welts appearing after exposure to water or sun. The infants in question exhibited symptoms of pallor, hypotonia, and unresponsiveness during bathing, which resolved after removal from the water. Dermographism, a sign of urticaria, was present in all affected infants. An increase in blood histamine levels was observed after a trial bath in two infants. The prognosis for these infants was favorable after avoiding baths for a few weeks. However, some infants continued to experience troubles associated with sun or water even after a long-term follow-up.


In [None]:
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()