In [1]:
import pandas as pd

# Reload the postpartum dataset after environment reset
file_path = 'postpartum_diverse_dataset.csv'
df = pd.read_csv(file_path)

# Display basic info
df.info(), df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  150 non-null    object
 1   answer    150 non-null    object
dtypes: object(2)
memory usage: 2.5+ KB


(None,
                                             question  \
 0  What are the signs of postpartum depression? (v1)   
 1  What foods should I avoid while breastfeeding?...   
 2  What are common causes of postpartum fatigue? ...   
 3  Is it normal to have hair loss after pregnancy...   
 4            What are the symptoms of mastitis? (v1)   
 
                                               answer  
 0  Signs include persistent sadness, fatigue, fee...  
 1  Avoid high-mercury fish, excessive caffeine, a...  
 2  Hormonal changes, sleep deprivation, blood los...  
 3  Yes, postpartum hair loss is common due to hor...  
 4  Symptoms include breast pain, swelling, rednes...  )

In [None]:
%pip install transformers datasets faiss-cpu sentence-transformers


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Downloading faiss_cpu-1.11.0-cp312-cp312-win_amd64.whl (15.0 MB)
   ---------------------------------------- 0.0/15.0 MB ? eta -:--:--
    --------------------------------------- 0.3/15.0 MB ? eta -:--:--
   --- ------------------------------------ 1.3/15.0 MB 5.6 MB/s eta 0:00:03
   ------ --------------------------------- 2.4/15.0 MB 5.2 MB/s eta 0:00:03
   --------- ------------------------------ 3.4/15.0 MB 5.2 MB/s eta 0:00:03
   -------------- ------------------------- 5.5/15.0 MB 5.7 MB/s eta 0:00:02
   ---------------- ----------------------- 6.3/15.0 MB 5.4 MB/s eta 0:00:02
   --------------------- ------------------ 8.1/15.0 MB 5.9 MB/s eta 0:00:02
   ----------------------- ---------------- 8.9/15.0 MB 5.6 MB/s eta 0:00:02
   ------------------------------ --------- 11.3/15

In [5]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Use a pre-trained model to encode text
encoder = SentenceTransformer("all-MiniLM-L6-v2")

# Convert questions into embeddings
question_embeddings = encoder.encode(df["question"].tolist(), convert_to_numpy=True)

# Create FAISS index for fast similarity search
index = faiss.IndexFlatL2(question_embeddings.shape[1])
index.add(question_embeddings)





In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

GEN_MODEL_NAME = "google/flan-t5-base"  # Lightweight and fast
gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'REDACTEDxet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[REDACTEDxet]` or `pip install REDACTEDxet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'REDACTEDxet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[REDACTEDxet]` or `pip install REDACTEDxet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [15]:
def rag_chatbot(user_question, top_k=3, similarity_threshold=0.6):
    # Encode and search
    query_embedding = encoder.encode([user_question], convert_to_numpy=True)
    distances, top_indices = index.search(query_embedding, top_k)

    # Calculate cosine similarity (assuming embeddings are normalized)
    similarities = 1 - distances[0] / 2  # approximation for normalized vectors

    # Check if best match is relevant
    if similarities[0] < similarity_threshold:
        return "I'm sorry, I don't have enough information to answer that."

    # Else continue as normal
    context = "\n".join(df["answer"].iloc[i] for i in top_indices[0])
    prompt = f"Context: {context}\n\nQuestion: {user_question}\nAnswer:"
    inputs = gen_tokenizer(prompt, return_tensors="pt", truncation=True)
    output = gen_model.generate(**inputs, max_length=150)
    return gen_tokenizer.decode(output[0], skip_special_tokens=True)


In [25]:
def rag_chatbot(user_question, top_k=3):
    # Embed user question
    query_embedding = encoder.encode([user_question], convert_to_numpy=True)

    # Search similar questions in the index
    _, top_indices = index.search(query_embedding, top_k)
    
    # Combine top-k retrieved answers as context
    context = "\n".join(df["answer"].iloc[i] for i in top_indices[0])
    
    # Prepare the prompt for the generative model
    prompt = f"Context: {context}\n\nQuestion: {user_question}\nAnswer:"
    
    inputs = gen_tokenizer(prompt, return_tensors="pt", truncation=True)
    output = gen_model.generate(**inputs, max_length=150)
    
    return gen_tokenizer.decode(output[0], skip_special_tokens=True)


In [29]:
user_q = " What foods should I avoid while breastfeeding?"
response = rag_chatbot(user_q)
print("🤖", response)


🤖 high-mercury fish, excessive caffeine, and alcohol
