In [None]:
# 1. Installations
# This minimalist command installs only the essential libraries needed,
# respecting the stable versions in the Colab environment to avoid conflicts.

!pip install \
    "langchain>=0.3.0,<0.4.0" \
    "langchain-community>=0.3.0,<0.4.0" \
    "langchain-openai>=0.3.0,<0.4.0" \
    "faiss-cpu>=1.8.0,<1.13.0" \
    "pypdf>=4.0.0,<5.0.0" \
    "gradio==4.44.1" \
    "python-dotenv>=1.0.0,<2.0.0" \
    "pandas>=2.0.0,<3.0.0" \
    -q

In [None]:
# 2. Imports (Complete)
# This cell contains all necessary imports for the entire project,
# including data handling, advanced retrieval, and the user interface.

import os
import shutil
import pickle
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

# Core LangChain document and loader components
from langchain.docstore.document import Document
from langchain_community.document_loaders import PyPDFLoader

# Text splitter for prose
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Advanced retriever components
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_community.docstore import InMemoryDocstore
import faiss

# Vector store and embedding model
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Language model and RAG chain
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

# Evaluation metric
from sklearn.metrics.pairwise import cosine_similarity

# User Interface
import gradio as gr

# Google Colab specific for file downloads
from google.colab import files

print("All libraries imported successfully.")

In [None]:
# 3. Environment Fix (Run Once)
# Works around the "proxies" validation error by removing the proxy environment
# variables that Colab can sometimes set. The change only affects the current
# kernel process, so the cell has to be run again after a kernel restart.
# Both upper- and lower-case spellings are cleared: proxy settings are
# conventionally read from either form, so removing only one would leave the
# conflicting value in place.

print("Applying environment fix...")
for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "http_proxy", "https_proxy"):
    # pop(..., None) makes the cell safe to re-run when the variable is absent.
    os.environ.pop(proxy_var, None)
print("Environment fix applied.")

In [None]:
# 1.1 Load API Key and Define Paths
# Reads the OpenAI API key from the `template.env` file (failing fast when it
# is missing or empty) and then declares the key folder paths.

load_dotenv('template.env')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

if OPENAI_API_KEY:
    print("OpenAI API Key loaded successfully.")
else:
    # Stop here: nothing downstream can work without a key.
    raise ValueError("OpenAI API key not found. Please set it in 'template.env'.")

DATA_PATH, VECTOR_STORE_PATH = "docs/", "vector_store/"

In [None]:
# 4.2 Run Evaluation (Complete)
# Evaluates the RAG chain on a small, hand-written question set tailored to a
# dementia-related dataset. For every question the generated answer and the
# reference answer are embedded, their cosine similarity is computed, and a
# per-question report plus a summary (average similarity and thresholded
# accuracy) is printed.
#
# Requires `qa_chain` and `embeddings` to have been created by earlier cells.

print("Starting evaluation...")

evaluation_set = [
    {
        "query": "What are the three core clinical features of dementia with Lewy bodies (DLB)?",
        "ground_truth_answer": "The core clinical features for dementia with Lewy bodies are fluctuating cognition, recurrent visual hallucinations, and spontaneous parkinsonism."
    },
    {
        "query": "What does the data show about the cognitive scores for subjects with a positive APOE_ε4 gene?",
        "ground_truth_answer": "The data includes subjects with a positive APOE_ε4 gene, and their cognitive test scores vary, with some showing low scores indicative of cognitive impairment."
    },
    {
        "query": "According to the data, is there a link between 'Sedentary' physical activity and a 'Poor' sleep quality?",
        "ground_truth_answer": "The dataset contains records for subjects with various physical activity levels and sleep qualities. By retrieving this data, one can observe if a pattern or correlation exists between sedentary behavior and poor sleep."
    }
]

# An answer counts as "accurate" when its similarity to the reference answer
# is at least this value (the comparison below is inclusive).
similarity_threshold = 0.75

results = []
total_similarity_score = 0

print("\n--- Running Evaluation ---")
for item in tqdm(evaluation_set, desc="Evaluating Queries"):
    query = item["query"]
    ground_truth = item["ground_truth_answer"]

    # Get the answer from the RAG chain
    response = qa_chain.invoke({"query": query})
    generated_answer = response['result']

    # Embed both the generated answer and the ground-truth answer
    generated_embedding = embeddings.embed_query(generated_answer)
    ground_truth_embedding = embeddings.embed_query(ground_truth)

    # cosine_similarity expects 2-D inputs and returns a 1x1 matrix here;
    # unwrap it to a plain Python float so the report holds ordinary numbers.
    similarity = float(
        cosine_similarity(
            np.array(generated_embedding).reshape(1, -1),
            np.array(ground_truth_embedding).reshape(1, -1)
        )[0][0]
    )

    results.append({
        "query": query,
        "ground_truth": ground_truth,
        "generated_answer": generated_answer,
        "similarity_score": similarity
    })
    total_similarity_score += similarity

# --- Print Detailed Results ---
print("\n--- Detailed Evaluation Report ---\n")
for res in results:
    print(f"Query: {res['query']}")
    print(f"Ground Truth: {res['ground_truth']}")
    print(f"Generated Answer: {res['generated_answer']}")
    print(f"Semantic Similarity Score: {res['similarity_score']:.4f}")
    print("-" * 30)

# --- Print Final Summary ---
print("\n--- Final Summary ---")
if results:
    average_similarity = total_similarity_score / len(results)
    accurate_predictions = sum(
        1 for res in results if res['similarity_score'] >= similarity_threshold
    )
    accuracy = (accurate_predictions / len(results)) * 100
    print(f"Average Semantic Similarity: {average_similarity:.4f}")
    # The label uses ">=" so that it matches the inclusive comparison above.
    print(f"Accuracy (at >={similarity_threshold} similarity): {accuracy:.2f}%")
else:
    # An empty evaluation set would otherwise raise ZeroDivisionError.
    average_similarity = 0.0
    accurate_predictions = 0
    accuracy = 0.0
    print("No evaluation queries were run; nothing to summarize.")

print("\nEvaluation complete.")

In [None]:
# 5.1 Launch the Gradio UI (using ChatInterface)
# This cell uses the simpler and more robust gr.ChatInterface, which handles
# chat history automatically and places sources in a collapsible accordion.

def _format_sources(source_docs):
    """Render retrieved documents as a collapsible, de-duplicated source list.

    Args:
        source_docs: Iterable of objects exposing a ``metadata`` mapping. The
            ``source`` entry is reduced to its base file name; the ``page``
            entry is treated as zero-based and shown one-based.

    Returns:
        A string starting with a blank line that contains a ``<details>``
        element listing each unique source once, or ``""`` when there is
        nothing to list.
    """
    entries = set()
    for doc in source_docs:
        metadata = getattr(doc, "metadata", None) or {}
        # `or 'Unknown'` also covers a present-but-None/empty source value.
        source_name = os.path.basename(str(metadata.get('source') or 'Unknown'))
        try:
            page = int(metadata.get('page')) + 1
        except (TypeError, ValueError):
            # Missing or non-numeric page: list the file without a page.
            page = None
        entries.add((source_name, page))

    if not entries:
        return ""

    # Sort by file name, then by numeric page so "p. 2" precedes "p. 10";
    # entries without a page come first within a file.
    ordered = sorted(entries, key=lambda entry: (entry[0], -1 if entry[1] is None else entry[1]))
    lines = [
        f"*{name}*" if page is None else f"*{name}* (p. {page})"
        for name, page in ordered
    ]
    return (
        f"\n\n<details><summary><strong>Sources ({len(lines)})</strong></summary>\n\n"
        + "- " + "\n- ".join(lines)
        + "\n\n</details>"
    )


def ask_question_for_chat_interface(query, history):
    """Answer a chat message with the RAG chain and append its sources.

    Written for ``gr.ChatInterface``, which calls ``fn(message, history)``.

    Args:
        query: The user's current message.
        history: Previous conversation turns supplied by Gradio. It is not
            used: each question is sent to the chain on its own.

    Returns:
        The chain's answer followed by a collapsible list of sources, or an
        error message string if anything fails while answering.
    """
    try:
        result = qa_chain.invoke({"query": query})
        # `or` also replaces a None/empty answer with the fallback text.
        answer = result.get('result') or 'Sorry, I could not find an answer.'
        sources_text = _format_sources(result.get('source_documents') or [])
        return answer + sources_text

    except Exception as e:
        # Deliberately broad: the chat UI should show a message, not crash.
        print(f"An error occurred: {e}")
        return f"An error occurred while processing your request: {e}"

# --- Create and Launch the ChatInterface ---
# gr.ChatInterface wires the handler into a ready-made chat UI and keeps the
# visible conversation history itself.
demo = gr.ChatInterface(
    fn=ask_question_for_chat_interface,
    title="🤖 RAG Chatbot for Dementia Reports",
    description="Ask a question about the uploaded documents. The chatbot will maintain a history of your conversation.",
    examples=[
        "What does the data suggest about the relationship between AlcoholLevel and Cognitive_Test_Scores?",
        "Compare the average HeartRate for subjects with Sedentary vs. Moderate Activity levels.",
        "What chronic health conditions are most common in patients with low cognitive scores?",
    ],
    # Examples are not pre-computed, so no chain calls happen at build time.
    cache_examples=False,
    theme="soft"
)

# share=True publishes a temporary public URL: anyone with the link can query
# the chain. debug=True keeps this cell running and surfaces errors in its
# output; interrupt the cell to stop the app.
demo.launch(debug=True, share=True)