In [3]:
#ENHANCED ANSWER GENERATION (Codespaces Ready)

import os
import re
import time
import pickle
import numpy as np
import faiss
from pathlib import Path
from collections import Counter
from typing import List, Dict

# --- Auto-install dependencies in Codespaces if missing ---
try:
    from sentence_transformers import SentenceTransformer
except ModuleNotFoundError:
    os.system("pip install -q sentence-transformers faiss-cpu")
    from sentence_transformers import SentenceTransformer

# -----------------------------------------------------------
# Utility: locate retrieval system files
# -----------------------------------------------------------
def get_retrieval_prefix(prefix="retrieval_system"):
    """Find saved retrieval system files in typical project folders."""
    possible_dirs = ["data/processed", "data", "."]
    for d in possible_dirs:
        p = Path(d) / f"{prefix}_index.faiss"
        if p.exists():
            print(f"üìÇ Found retrieval system under: {Path(d).resolve()}")
            return str(Path(d) / prefix)
    raise FileNotFoundError(f"‚ùå Could not find {prefix}_index.faiss in {possible_dirs}")

# -----------------------------------------------------------
# Enhanced Retriever class
# -----------------------------------------------------------
class EnhancedRetriever:
    """Enhanced retrieval system with priority weighting and answer generation"""

    def __init__(self):
        self.model = None
        self.chunks = []
        self.embeddings = None
        self.index = None

        self.priority_weights = {"HIGH": 1.5, "MEDIUM": 1.0}
        self.chunk_type_weights = {
            "title_beginning": 1.3,
            "definitions": 1.4,
            "content": 1.0
        }

    # -------------------------------------------------------
    def load_retrieval_system(self, prefix=None):
        """Load FAISS index, embeddings, metadata, and model."""
        prefix = prefix or get_retrieval_prefix()
        print(f"üìÇ Loading retrieval system from {prefix} ...")

        try:
            self.embeddings = np.load(f"{prefix}_embeddings.npy")
            self.index = faiss.read_index(f"{prefix}_index.faiss")

            with open(f"{prefix}_metadata.pkl", "rb") as f:
                metadata = pickle.load(f)

            self.chunks = metadata["chunks"]
            model_name = metadata.get("model_name", "all-MiniLM-L6-v2")

            print(f"ü§ñ Loading embedding model: {model_name}")
            self.model = SentenceTransformer(model_name, device="cpu")
            print(f"‚úÖ Retrieval system loaded | {len(self.chunks)} chunks | {self.embeddings.shape[1]}D")

            return True
        except Exception as e:
            print(f"‚ùå Error loading retrieval system: {e}")
            return False

    # -------------------------------------------------------
    def enhanced_search(self, query: str, k: int = 10, rerank: bool = True) -> List[Dict]:
        """Enhanced retrieval with priority and type weighting."""
        if self.index is None or self.model is None:
            print("‚ùå Retrieval system not loaded.")
            return []

        query_emb = self.model.encode([query])
        faiss.normalize_L2(query_emb.astype("float32"))

        initial_k = min(k * 3, len(self.chunks))
        scores, indices = self.index.search(query_emb.astype("float32"), initial_k)

        candidates = []
        for score, idx in zip(scores[0], indices[0]):
            if idx < len(self.chunks):
                candidates.append({"chunk": self.chunks[idx], "original_score": float(score)})

        if not rerank:
            return candidates[:k]

        enhanced = []
        for c in candidates:
            ch = c["chunk"]
            s = c["original_score"]
            pw = self.priority_weights.get(ch["priority"], 1.0)
            tw = self._get_query_type_weight(query, ch["chunk_type"])
            enhanced.append({
                "chunk": ch,
                "enhanced_score": s * pw * tw,
                "priority_weight": pw,
                "chunk_type_weight": tw
            })

        enhanced.sort(key=lambda x: x["enhanced_score"], reverse=True)
        return enhanced[:k]

    # -------------------------------------------------------
    def _get_query_type_weight(self, query, chunk_type):
        q = query.lower()
        if any(p in q for p in ["what is", "define", "definition of", "meaning of"]):
            return 1.5 if chunk_type == "definitions" else 1.3 if chunk_type == "title_beginning" else 1.0
        elif any(p in q for p in ["how does", "how to", "explain", "describe"]):
            return 1.2 if chunk_type == "content" else 1.0
        return self.chunk_type_weights.get(chunk_type, 1.0)

    # -------------------------------------------------------
    def generate_answer(self, query, max_context_len=2000):
        print(f"\nüîç Query: {query}")
        t0 = time.time()
        results = self.enhanced_search(query, k=5)
        t_retrieval = time.time() - t0

        if not results:
            return {"query": query, "answer": "No relevant information found."}

        context, total_len, sources = [], 0, []
        for r in results:
            txt = r["chunk"]["text"]
            if total_len + len(txt) > max_context_len:
                break
            context.append(txt)
            total_len += len(txt)
            sources.append({
                "title": r["chunk"]["metadata"]["title"],
                "domain": r["chunk"]["metadata"]["domain"],
                "priority": r["chunk"]["priority"],
                "chunk_type": r["chunk"]["chunk_type"],
                "score": r["enhanced_score"]
            })

        joined = "\n\n".join(context)
        answer = self._extract_answer(query, joined, sources)

        return {
            "query": query,
            "answer": answer,
            "sources": sources,
            "context_length": len(joined),
            "retrieval_time": t_retrieval
        }

    # -------------------------------------------------------
    def _extract_answer(self, query, context, sources):
        q = query.lower()
        if any(p in q for p in ["what is", "define"]):
            # simple heuristic: grab first sentence with "is"
            for s in re.split(r'(?<=[.!?])\s+', context):
                if " is " in s.lower():
                    return s.strip()
        # fallback: first few sentences
        sents = re.split(r'(?<=[.!?])\s+', context)
        return " ".join(sents[:3])

    # -------------------------------------------------------
    def test_enhanced_retrieval(self, test_queries):
        print("\nüß™ TESTING ENHANCED RETRIEVAL SYSTEM")
        print("=" * 60)
        results = []
        for q in test_queries:
            res = self.generate_answer(q)
            print(f"\nüìù {q} ‚Üí {res['answer'][:120]}...")
            print(f"‚è±Ô∏è  Retrieval time: {res['retrieval_time']:.3f}s")
            print(f"Sources: {len(res['sources'])}")
            results.append(res)
        print("\n‚úÖ Enhanced retrieval test complete.")
        return results

# -----------------------------------------------------------
# Runner
# -----------------------------------------------------------
def run_enhanced_answer_generation():
    print("üöÄ STEP 4: ENHANCED ANSWER GENERATION (Codespaces)")
    print("=" * 70)
    retriever = EnhancedRetriever()

    if not retriever.load_retrieval_system():
        print("‚ùå Could not load retrieval system. Run Step 3 first.")
        return None

    sample_queries = [
        "What is physics?",
        "Define photosynthesis",
        "How does machine learning work?",
        "Explain DNA structure",
        "What is artificial intelligence?"
    ]

    results = retriever.test_enhanced_retrieval(sample_queries)
    print("\n‚úÖ STEP 4 COMPLETE ‚Äî Ready for Step 5: Web Search Integration")
    return retriever, results


if __name__ == "__main__":
    retriever, results = run_enhanced_answer_generation()

üöÄ STEP 4: ENHANCED ANSWER GENERATION (Codespaces)
üìÇ Found retrieval system under: /workspaces/Rag-Knowledge-Assiatant/notebooks/data/processed
üìÇ Loading retrieval system from data/processed/retrieval_system ...
ü§ñ Loading embedding model: all-MiniLM-L6-v2
‚úÖ Retrieval system loaded | 3022 chunks | 384D

üß™ TESTING ENHANCED RETRIEVAL SYSTEM

üîç Query: What is physics?

üìù What is physics? ‚Üí Key definitions for Physics (Science & Engineering):

‚Ä¢ Physics: scientific study of matter


Title: Physics
Domain: Sci...
‚è±Ô∏è  Retrieval time: 0.014s
Sources: 4

üîç Query: Define photosynthesis

üìù Define photosynthesis ‚Üí ...
‚è±Ô∏è  Retrieval time: 0.011s
Sources: 0

üîç Query: How does machine learning work?

üìù How does machine learning work? ‚Üí Key definitions for Machine learning (Technology & Computing):

‚Ä¢ Machine learning (ML): field of study in artificial in...
‚è±Ô∏è  Retrieval time: 0.011s
Sources: 3

üîç Query: Explain DNA structure

üìù Explain DN