In [1]:
import faiss
import numpy as np

# Load the prebuilt FAISS index from disk into the module-level `index`
# that `retrieve_passages` searches.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
index = faiss.read_index("/Users/s_lokesh/Medbot/data_clean/index/faiss_index")

def retrieve_passages(query_embedding, top_k=5):
    """Retrieve the top-k nearest passages for one query from the FAISS ``index``.

    Args:
        query_embedding: The query vector (any 1-D array-like of numbers; an
            array already shaped ``(1, d)`` is accepted as well).
        top_k: Maximum number of neighbours to request from the index.

    Returns:
        ``(indices, distances)`` — two aligned 1-D NumPy arrays for the single
        query. Slots FAISS could not fill (reported with index ``-1``) are
        dropped, so fewer than ``top_k`` entries may be returned.
    """
    # FAISS only accepts a C-contiguous float32 matrix shaped (n_queries, d).
    query_matrix = np.ascontiguousarray(query_embedding, dtype=np.float32).reshape(1, -1)
    distances, indices = index.search(query_matrix, top_k)

    # FAISS pads missing results with -1. Used as a Python list position, -1
    # would silently select the *last* element, so strip those slots here.
    found = indices[0] >= 0
    return indices[0][found], distances[0][found]



In [2]:
from transformers import pipeline

# Build the Hugging Face summarization pipeline backed by the
# "facebook/bart-large-cnn" checkpoint; `summarize_text` calls it.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_text(text, min_length=30, max_length=100):
    """Summarize a retrieved passage into a shorter text.

    Args:
        text: Passage to condense.
        min_length: Minimum summary length, forwarded to the pipeline.
        max_length: Maximum summary length, forwarded to the pipeline.

    Returns:
        The generated summary string, or ``""`` when ``text`` is not a string
        or is empty / whitespace-only (the model is not called in that case).
    """
    # Missing CSV cells arrive as NaN floats and blank chunks carry no content;
    # neither is worth a model call.
    if not isinstance(text, str) or not text.strip():
        return ""

    # truncation=True clips passages that exceed the model's maximum input
    # length instead of letting the forward pass fail on them.
    summary = summarizer(
        text,
        min_length=min_length,
        max_length=max_length,
        do_sample=False,
        truncation=True,
    )
    return summary[0]["summary_text"]


  from .autonotebook import tqdm as notebook_tqdm


: 

: 

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load the sentence-embedding model ("sentence-transformers/all-MiniLM-L6-v2")
# used by `rank_relevance` and `chatbot_pipeline` to encode queries and texts.
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def rank_relevance(query, retrieved_texts):
    """Rank texts by cosine similarity between their embedding and the query's.

    Args:
        query: The user query string.
        retrieved_texts: List of candidate texts to score.

    Returns:
        List of ``(text, score)`` tuples sorted by descending score; ``[]``
        when there are no candidate texts.
    """
    # Nothing to score — skip the model calls entirely instead of handing an
    # empty batch to the similarity computation.
    if len(retrieved_texts) == 0:
        return []

    query_embedding = bert_model.encode(query, convert_to_tensor=True)
    text_embeddings = bert_model.encode(retrieved_texts, convert_to_tensor=True)

    # Row 0 holds the similarities of the single query against every text.
    scores = util.pytorch_cos_sim(query_embedding, text_embeddings)[0].tolist()
    return sorted(zip(retrieved_texts, scores), key=lambda pair: pair[1], reverse=True)


: 

In [None]:
import spacy

# Load the spaCy pipeline from a local `en_core_sci_scibert` 0.5.3 package
# directory; `extract_medical_terms` runs it for entity recognition.
# NOTE(review): absolute path into one machine's virtualenv — not portable.
nlp = spacy.load("/Users/s_lokesh/Medbot/venv/lib/python3.11/site-packages/en_core_sci_scibert/en_core_sci_scibert-0.5.3/")


def extract_medical_terms(text, labels=("DISEASE", "SYMPTOM", "MEDICATION", "ENTITY")):
    """Extract medical terms from ``text`` using Named Entity Recognition (NER).

    Args:
        text: Text to analyse with the module-level spaCy pipeline ``nlp``.
        labels: Entity labels to keep. The default includes ``"ENTITY"``
            because, per the scispaCy model documentation, the general
            ``en_core_sci_*`` pipelines tag every span with that single generic
            label; filtering only on DISEASE/SYMPTOM/MEDICATION would discard
            every entity such a model produces. Pass a narrower tuple when
            using a typed NER model.

    Returns:
        List of entity strings, in document order, whose label is in ``labels``.
    """
    accepted = set(labels)
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ in accepted]



: 

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class Generator(nn.Module):
    """Feed-forward generator mapping a vector to another of the same size.

    Architecture:
    ``Linear(embedding_dim, hidden_dim) -> ReLU -> Linear(hidden_dim, embedding_dim)``.

    Args:
        embedding_dim: Size of both the input and the output vectors.
        hidden_dim: Width of the hidden layer. Defaults to 256, the value that
            was previously hard-coded.
    """

    def __init__(self, embedding_dim, hidden_dim=256):
        super().__init__()
        # Attribute name and layer order are kept so state_dict keys
        # ("model.0.weight", "model.2.weight", ...) remain unchanged.
        self.model = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embedding_dim),
        )

    def forward(self, x):
        """Apply the network to ``x``; the last dimension must be ``embedding_dim``."""
        return self.model(x)

class Discriminator(nn.Module):
    """Feed-forward discriminator scoring a vector with a single sigmoid output.

    Architecture:
    ``Linear(embedding_dim, hidden_dim) -> ReLU -> Linear(hidden_dim, 1) -> Sigmoid``.

    Args:
        embedding_dim: Size of the input vectors.
        hidden_dim: Width of the hidden layer. Defaults to 256, the value that
            was previously hard-coded.
    """

    def __init__(self, embedding_dim, hidden_dim=256):
        super().__init__()
        # Attribute name and layer order are kept so state_dict keys
        # ("model.0.weight", "model.2.weight", ...) remain unchanged.
        self.model = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return one sigmoid score per input row; last dimension of ``x`` must be ``embedding_dim``."""
        return self.model(x)

# Instantiate the module-level GAN networks that `train_gan` optimizes.
# Both are built for vectors of size `embedding_dim`.
embedding_dim = 384  # Based on FAISS embedding size. NOTE(review): presumably must equal the index dimension (index.d) — confirm
generator = Generator(embedding_dim)
discriminator = Discriminator(embedding_dim)


: 

In [None]:
def train_gan(retrieved_embeddings, epochs=500, batch_size=16):
    """Adversarially train the module-level ``generator`` and ``discriminator``.

    Each batch of real embeddings is paired with generator output produced from
    Gaussian noise of the same shape. The discriminator is trained to score
    real rows as 1 and generated rows as 0; the generator is then trained so
    that its output is scored as 1.

    Args:
        retrieved_embeddings: Sequence of equally sized embedding vectors
            (list of arrays or a 2-D array). A single 1-D vector is treated as
            a batch of one.
        epochs: Number of passes over the embeddings.
        batch_size: Number of embeddings per optimisation step.

    Returns:
        The module-level ``generator`` (trained in place). When
        ``retrieved_embeddings`` is empty it is returned untouched.
    """
    # Convert once up front: rebuilding a tensor from a Python list of arrays
    # for every batch of every epoch is needlessly slow.
    real_data = torch.as_tensor(np.asarray(retrieved_embeddings), dtype=torch.float32)
    if real_data.numel() == 0:
        # With no batches the loss variables read by the progress print below
        # would never be assigned, so bail out before the loop.
        return generator
    if real_data.dim() == 1:
        real_data = real_data.unsqueeze(0)

    optimizer_G = optim.Adam(generator.parameters(), lr=0.0001)
    optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0001)
    loss_fn = nn.BCELoss()

    for epoch in range(epochs):
        for start in range(0, real_data.size(0), batch_size):
            real_batch = real_data[start:start + batch_size]

            # Generate fake embeddings from noise shaped like the real batch.
            noise = torch.randn_like(real_batch)
            fake_embeddings = generator(noise)

            real_labels = torch.ones((real_batch.size(0), 1))
            fake_labels = torch.zeros((real_batch.size(0), 1))

            # Train Discriminator. The fake batch is detached so this step
            # does not push gradients into the generator.
            optimizer_D.zero_grad()
            real_loss = loss_fn(discriminator(real_batch), real_labels)
            fake_loss = loss_fn(discriminator(fake_embeddings.detach()), fake_labels)
            d_loss = real_loss + fake_loss
            d_loss.backward()
            optimizer_D.step()

            # Train Generator: it is rewarded when its output is scored as real.
            optimizer_G.zero_grad()
            g_loss = loss_fn(discriminator(fake_embeddings), real_labels)
            g_loss.backward()
            optimizer_G.step()

        if epoch % 100 == 0:
            # Reports the losses of the last batch of this epoch.
            print(f"Epoch {epoch}: D Loss {d_loss.item()}, G Loss {g_loss.item()}")

    return generator


: 

In [None]:
import psutil
# Report the memory psutil lists as currently available, converted from bytes to GB (divided by 1e9).
print(f"Available Memory: {psutil.virtual_memory().available / 1e9} GB")


: 

In [None]:
def generate_refined_embeddings(generator, retrieved_embeddings):
    """Pass embeddings through ``generator`` and return the result as NumPy.

    Args:
        generator: Callable torch module applied to the embeddings.
        retrieved_embeddings: Embedding vectors (list of arrays, a NumPy
            array, or a torch tensor).

    Returns:
        NumPy array holding the generator output for the given embeddings.
        For empty input an empty float32 array of shape ``(0, 0)`` is returned
        and the generator is not called.
    """
    if isinstance(retrieved_embeddings, torch.Tensor):
        batch = retrieved_embeddings.detach().to(torch.float32)
    else:
        # Go through one NumPy array: building a tensor directly from a list
        # of ndarrays is very slow.
        batch = torch.as_tensor(np.asarray(retrieved_embeddings), dtype=torch.float32)

    if batch.numel() == 0:
        return np.empty((0, 0), dtype=np.float32)

    # Inference only — no autograd graph is needed for the forward pass.
    with torch.no_grad():
        refined = generator(batch)
    return refined.detach().numpy()


: 

In [None]:
def update_faiss_index(index, refined_embeddings, output_path="data_clean/index/faiss_optimized.bin"):
    """Replace the contents of ``index`` with ``refined_embeddings`` and save it.

    WARNING: this is destructive. Every vector previously stored in ``index``
    is removed before the new ones are added, so pass a copy if the original
    index must keep serving searches.

    Args:
        index: FAISS index to overwrite (mutated in place).
        refined_embeddings: Vectors to store; a 2-D array-like, or a single
            1-D vector.
        output_path: File the updated index is written to. Defaults to the
            previously hard-coded relative path.

    Returns:
        None. When ``refined_embeddings`` is empty the index is left untouched
        and nothing is written.
    """
    import os

    # FAISS only accepts C-contiguous float32 matrices.
    vectors = np.ascontiguousarray(refined_embeddings, dtype=np.float32)
    if vectors.size == 0:
        # Resetting here would wipe the index with nothing to put back.
        print("No refined embeddings supplied; FAISS index left unchanged.")
        return
    if vectors.ndim == 1:
        vectors = vectors.reshape(1, -1)

    index.reset()  # Clear existing index
    index.add(vectors)  # Add updated embeddings

    # Make sure the target directory exists before writing.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    faiss.write_index(index, output_path)
    print("FAISS index updated.")


: 

In [None]:
import pandas as pd
import pickle

# ========== 1️⃣ Load Preprocessed Data & FAISS Index ==========
# Defines the module-level `corpus`, `index` and `embeddings` that
# `chatbot_pipeline` reads.
# NOTE(review): all three paths are absolute and machine-specific.

# Load the preprocessed CSV and keep its "text_chunk" column as a Python list.
df = pd.read_csv("/Users/s_lokesh/Medbot/data_clean/processed/medqa_cleaned.csv")
corpus = df["text_chunk"].tolist()  # Extract text_chunk column as a list

# Load FAISS index (rebinds `index`; same file as the first cell reads).
index = faiss.read_index("/Users/s_lokesh/Medbot/data_clean/index/faiss_index")

# Load the pickled embeddings (vector representations).
# NOTE(review): `pickle.load` can execute arbitrary code — only use on a trusted file.
# NOTE(review): presumably row i of `embeddings` corresponds to corpus[i] and FAISS id i — confirm.
with open("/Users/s_lokesh/Medbot/data_clean/index/embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)  # Ensure embeddings.pkl exists

def chatbot_pipeline(query):
    """Complete pipeline for the MedQA chatbot.

    Steps:
        1. Encode ``query`` and retrieve the nearest passages from FAISS.
        2. Summarize each passage, rank the summaries against the query and
           keep those in which NER finds at least one medical term.
        3. Train the GAN on the retrieved embeddings and compute refined
           embeddings from them.
        4. Store the refined embeddings in a *copy* of the FAISS index.
        5. Return the best surviving summary.

    Args:
        query: The user's question as a string.

    Returns:
        The highest-ranked summary containing a medical term, or
        ``"No relevant answer found."`` when nothing qualifies.
    """
    no_answer = "No relevant answer found."

    # Encode query
    query_embedding = bert_model.encode(query)

    # Step 1: Retrieve from FAISS
    retrieved_indices, _ = retrieve_passages(query_embedding)
    # FAISS reports unfilled result slots as -1; as a list position that would
    # silently pick the last corpus entry. Keep only positions inside the corpus.
    valid_indices = [int(idx) for idx in retrieved_indices if 0 <= int(idx) < len(corpus)]
    if not valid_indices:
        return no_answer
    retrieved_texts = [corpus[idx] for idx in valid_indices]

    # Step 2: Post-Retrieval NLP Filtering
    summarized_texts = [summarize_text(text) for text in retrieved_texts]
    ranked_texts = rank_relevance(query, summarized_texts)
    # Keep only medically relevant texts (NER found at least one term).
    filtered_results = [
        (text, score) for text, score in ranked_texts if extract_medical_terms(text)
    ]

    # Step 3: Optimize with GAN
    retrieved_embeddings = [embeddings[idx] for idx in valid_indices if idx < len(embeddings)]
    if retrieved_embeddings:
        trained_generator = train_gan(retrieved_embeddings)
        refined_embeddings = generate_refined_embeddings(trained_generator, retrieved_embeddings)

        # Step 4: Update FAISS Index
        # update_faiss_index resets the index it is given. Passing the live
        # `index` would leave it holding only these few vectors and break every
        # later retrieval, so the update is applied to a clone instead.
        # NOTE(review): cloning copies the whole index on every query.
        update_faiss_index(faiss.clone_index(index), refined_embeddings)

    # Step 5: Return top answer
    return filtered_results[0][0] if filtered_results else no_answer

# Example usage: run one sample question through the full pipeline and print the answer.
# NOTE(review): each call also runs `train_gan` with its default 500 epochs, so this is not a cheap cell.
query = "What are the symptoms of diabetes?"
response = chatbot_pipeline(query)
print(response)


: 

In [None]:
import faiss

# Sanity check: reload the index from disk (rebinds the module-level `index`)
# and print its `ntotal` and `d` attributes.
index = faiss.read_index("/Users/s_lokesh/Medbot/data_clean/index/faiss_index")

print("FAISS index size:", index.ntotal)
print("FAISS embedding dimension:", index.d)


: 

In [1]:
import faiss

# Path of the index file read below and reused by the LangChain cell.
# NOTE(review): the recorded output of this cell is a RuntimeError — FAISS could
# not open this file ("No such file or directory"). The other cells read
# ".../index/faiss_index"; confirm which filename actually exists.
DB_FAISS_PATH = "/Users/s_lokesh/Medbot/data_clean/index/index.faiss"

# Rebinds the module-level `index`; the message prints only if the read succeeds.
index = faiss.read_index(DB_FAISS_PATH)
print("FAISS Index Loaded Successfully")


RuntimeError: Error in faiss::FileIOReader::FileIOReader(const char *) at /Users/runner/work/faiss-wheels/faiss-wheels/faiss/faiss/impl/io.cpp:68: Error: 'f' failed: could not open /Users/s_lokesh/Medbot/data_clean/index/index.faiss for reading: No such file or directory

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import pandas as pd

from pathlib import Path

# `FAISS.load_local` needs an embeddings *model* to encode future queries.
# The notebook-level `embeddings` variable holds the pickled vectors, not a
# model, so build the model explicitly.
# NOTE(review): assumes the store was built with the same all-MiniLM-L6-v2
# model used elsewhere in this notebook — confirm.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# `load_local` takes the *folder* holding "<index_name>.faiss" and
# "<index_name>.pkl", not the path of the .faiss file itself.
_faiss_path = Path(DB_FAISS_PATH)
if _faiss_path.suffix == ".faiss":
    _folder, _index_name = _faiss_path.parent, _faiss_path.stem
else:
    _folder, _index_name = _faiss_path, "index"

# allow_dangerous_deserialization unpickles the docstore — only use on files you created yourself.
db = FAISS.load_local(
    str(_folder),
    embedding_model,
    index_name=_index_name,
    allow_dangerous_deserialization=True,
)
print("LangChain FAISS Index Loaded Successfully!")



: 