In [1]:
# 1. INSTALLATIONS (Updated HuggingFace embeddings package)
!pip install langchain langchain_community langchain_core langchain_experimental faiss-cpu transformers torch gradio langchain_groq langchain-huggingface

Collecting langchain_community
  Downloading langchain_community-0.3.17-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain_experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting gradio
  Downloading gradio-5.16.1-py3-none-any.whl.metadata (16 kB)
Collecting langchain_groq
  Downloading langchain_groq-0.2.4-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting async-timeout<5.0.0,>=4.0.0 (from langchain)
  Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting langchain_core
  Downloading langchain_core-0.3.36-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain
  Downloading langchain-0.3.19-py3-none-any.whl.metadata (7.9 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downlo

In [11]:
# 2. IMPORT REQUIRED MODULES
import faiss
import pandas as pd
import numpy as np
import pickle
from langchain.docstore.document import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.llms import HuggingFaceHub
from huggingface_hub import model_info
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA


In [3]:
# 3. LOAD CSV DATA
csv_path = "/kaggle/input/dataset-for-qa-system/train.csv"
df = pd.read_csv(csv_path)

# Validate CSV format
required_columns = {'qtype', 'Question', 'Answer'}
if not required_columns.issubset(df.columns):
    missing = required_columns - set(df.columns)
    raise ValueError(f"Missing columns: {missing}")



In [4]:
# 4. CREATE DOCUMENTS WITH TEXT SPLITTING
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,  
    chunk_overlap=250,
    separators=["\nANSWER: ", "\n\n", "\n"]
)

documents = []
for _, row in df.iterrows():
    content = f"QTYPE: {row['qtype']}\nQUESTION: {row['Question']}\nANSWER: {row['Answer']}"
    split_docs = text_splitter.split_documents([Document(page_content=content)])
    documents.extend(split_docs)

print(f"Created {len(documents)} processed documents")

Created 21539 processed documents


In [5]:
# 5. INITIALIZE EMBEDDINGS
embedder = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={}  
)

# Extract text from the documents and pass as a list of strings
texts = [doc.page_content for doc in documents]

# Precompute and save embeddings (only needed once)
embeddings = embedder.embed_documents(texts)

# Save embeddings to disk
with open('embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings, f)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# 6. Create the FAISS vector store
vector_store = FAISS.from_documents(documents, embedder)

vector_store_path = "/kaggle/working/medquad_faiss_index"
vector_store.save_local(vector_store_path)

print("Vector store saved successfully!")

Vector store saved successfully!


In [7]:
# 7. CONFIGURE RETRIEVER (General Setup)
retriever = vector_store.as_retriever(
    search_type="mmr",  # Maximal Marginal Relevance for better result diversification
    search_kwargs={
        "k": 15,  # Retrieve 15 top results for diversity
        "fetch_k": 50,  # Fetch more to choose from for relevance
        "lambda_mult": 0.5  # Weighting of relevance and diversity
    }
)


In [9]:
HUGGINGFACE_API_KEY="******"

In [12]:
# 8. SETUP LLM
llm = HuggingFaceHub(
        repo_id="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        huggingfacehub_api_token=HUGGINGFACE_API_KEY,
        model_kwargs={
            "temperature": 0.01,
            "max_length": 512,
            "timeout": 30
        }
    )

  llm = HuggingFaceHub(


In [14]:
# 9. Prompt Engineering
prompt_template = """You have access to the following medical knowledge:
{context}
Given the question below, provide the best possible answer.
If you don't find any relevant information in the context, say: "Not found in medical records."

Question: {question}
Answer:
"""

QA_PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)


In [15]:
# 10. CREATE QA CHAIN
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={
        "prompt": QA_PROMPT,
    },
    return_source_documents=True,
    verbose=True
)

In [16]:
question = "What are the potassium levels in dialysis patient?"
result = qa.invoke({"query": question})

# Output the final answer and associated sources
print("Question:", question)
print("Answer:", result["result"])

# Show the sources correctly
print("\nSources:")
if "source_documents" in result:
    for doc in result["source_documents"]:
        print(f"- {doc.page_content[:500]}...")  # Show first 500 characters of each source
else:
    print("No sources available.")




[1m> Entering new RetrievalQA chain...[0m





[1m> Finished chain.[0m
Question: What are the potassium levels in dialysis patient?
Answer: You have access to the following medical knowledge:
QTYPE: information
QUESTION: What is (are) Kidney Failure: Eat Right to Feel Right on Hemodialysis ?
ANSWER: Potassium is a mineral found in many foods, especially milk, fruits, and vegetables. It affects how steadily your heart beats. Healthy kidneys keep the right amount of potassium in the blood to keep the heart beating at a steady pace. Potassium levels can rise between dialysis sessions and affect your heartbeat. Eating too much potassium can be very dangerous to your heart. It may even cause death.
                
To control potassium levels in your blood, avoid foods like avocados, bananas, kiwis, and dried fruit, which are very high in potassium. Also, eat smaller portions of other high-potassium foods. For example, eat half a pear instead of a whole pear. Eat only very small portions of oranges and melons.
                
Dicing