# In [3]:
# EMBEDDING CREATION & VECTOR DATABASE (Codespaces Ready)

import pandas as pd
import numpy as np
import pickle
from pathlib import Path
from sentence_transformers import SentenceTransformer
import faiss
from collections import Counter
import time
import os
from typing import List, Dict

# -------------------------------------------------------------------
# Utility: Auto-detect the hybrid chunks file
# -------------------------------------------------------------------
def get_hybrid_chunk_path(filename="hybrid_chunks.pkl"):
    """Locate the hybrid chunks file relative to the current working directory.

    The directories ``data/processed``, ``data`` and ``.`` are probed in that
    order and the first existing match wins.

    Args:
        filename: Name of the chunks file to look for.

    Returns:
        The (relative) path of the first match, as a string.

    Raises:
        FileNotFoundError: If ``filename`` exists in none of the probed
            directories.
    """
    possible_dirs = ["data/processed", "data", "."]
    for d in possible_dirs:
        p = Path(d) / filename
        if p.exists():
            print(f"üìÇ Hybrid chunks found: {p.resolve()}")
            return str(p)
    # Report the name that was actually searched for, not the default one.
    raise FileNotFoundError(f"‚ùå {filename} not found in {possible_dirs}")

# -------------------------------------------------------------------
# Embedding + FAISS Creator
# -------------------------------------------------------------------
class EmbeddingCreator:
    """Create embeddings and a FAISS inner-product index for hybrid chunks.

    Intended call order: ``load_hybrid_chunks`` -> ``create_embeddings`` ->
    ``create_vector_index`` -> ``test_retrieval`` / ``save_retrieval_system``.

    Attributes:
        model: Loaded ``SentenceTransformer``, or ``None`` when both model
            loads failed and the TF-IDF fallback is used instead.
        model_name: Name of the model in use, or ``"tfidf_fallback"``.
        tfidf_vectorizer: Fitted ``TfidfVectorizer`` when the TF-IDF fallback
            produced the embeddings, otherwise ``None``.
        chunks: Chunk dicts loaded from the pickle (each needs a ``"text"``).
        embeddings: float32 array with one row per chunk, or ``None``.
        index: FAISS index over the L2-normalised embeddings, or ``None``.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Load ``model_name`` on CPU, falling back to a second model, then TF-IDF.

        Loading never raises: failures are printed and the instance degrades
        to the next option (deliberate best-effort behaviour).
        """
        print(f"ü§ñ Loading embedding model: {model_name}")
        try:
            self.model = SentenceTransformer(model_name, device="cpu")
            self.model_name = model_name
            print("‚úÖ Model loaded successfully (CPU mode).")
        except Exception as e:
            print(f"‚ùå Failed to load {model_name}: {e}")
            print("üí° Using fallback model paraphrase-MiniLM-L6-v2")
            try:
                self.model = SentenceTransformer("paraphrase-MiniLM-L6-v2", device="cpu")
                self.model_name = "paraphrase-MiniLM-L6-v2"
                print("‚úÖ Fallback model loaded.")
            except Exception as e2:
                print(f"‚ùå Both models failed: {e2}")
                self.model = None
                self.model_name = "tfidf_fallback"

        # Kept so queries can be embedded in the same space as the chunks
        # when the TF-IDF fallback is active.
        self.tfidf_vectorizer = None
        self.chunks = []
        self.embeddings = None
        self.index = None

    # -----------------------------------------------------------
    # 1. Load hybrid chunks
    # -----------------------------------------------------------
    def load_hybrid_chunks(self, filename=None) -> Dict:
        """Load the pickled chunk bundle and store its ``"chunks"`` list.

        Args:
            filename: Path to the pickle. When falsy, the path is resolved
                with ``get_hybrid_chunk_path()``.

        Returns:
            The full unpickled dict (``"chunks"`` plus optional ``"stats"``).

        Raises:
            FileNotFoundError: If no file can be located/opened.
            KeyError: If the bundle has no ``"chunks"`` entry.
        """
        filename = filename or get_hybrid_chunk_path()
        print(f"üì¶ Loading hybrid chunks from {filename}...")
        # SECURITY: pickle.load executes arbitrary code embedded in the file;
        # only load chunk files produced by a trusted pipeline step.
        with open(filename, "rb") as f:
            data = pickle.load(f)

        self.chunks = data["chunks"]
        stats = data.get("stats", {})
        print(f"‚úÖ Loaded {len(self.chunks)} chunks from {stats.get('total_articles', '?')} articles")
        print(f"üéØ High priority chunks: {stats.get('priority_distribution', {}).get('HIGH', 0)}")
        return data

    # -----------------------------------------------------------
    # 2. Create embeddings
    # -----------------------------------------------------------
    def create_embeddings(self, batch_size: int = 32) -> np.ndarray:
        """Embed every chunk's ``"text"`` and store the result in ``self.embeddings``.

        Args:
            batch_size: Number of texts encoded per model call (must be >= 1).

        Returns:
            float32 array with one row per chunk.

        Raises:
            ValueError: If no chunks are loaded or ``batch_size`` is < 1.
        """
        if not self.chunks:
            raise ValueError("‚ùå No chunks loaded. Please run load_hybrid_chunks() first.")
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        texts = [c["text"] for c in self.chunks]
        print(f"üîÑ Creating embeddings for {len(texts)} chunks using {self.model_name} ...")

        if self.model is None:
            return self._create_tfidf_embeddings(texts)

        embeddings = []
        total_batches = (len(texts) + batch_size - 1) // batch_size
        start = time.time()

        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i : i + batch_size]
            batch_id = i // batch_size + 1
            print(f"   ‚è≥ Batch {batch_id}/{total_batches} ({len(batch_texts)} chunks)")
            batch_embeddings = self.model.encode(batch_texts, show_progress_bar=False)
            embeddings.append(batch_embeddings)

        self.embeddings = np.vstack(embeddings).astype("float32")
        elapsed = time.time() - start
        # A coarse clock can report 0.0s for tiny inputs; avoid ZeroDivisionError.
        speed = len(texts) / elapsed if elapsed > 0 else float("inf")

        print(f"‚úÖ Embeddings created | Shape: {self.embeddings.shape}")
        print(f"‚è±Ô∏è  Time: {elapsed:.1f}s | Speed: {speed:.1f} chunks/s")
        return self.embeddings

    # -----------------------------------------------------------
    # 3. TF-IDF fallback (no transformer)
    # -----------------------------------------------------------
    def _create_tfidf_embeddings(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` with a TF-IDF vectorizer (at most 1000 features).

        The fitted vectorizer is kept on ``self.tfidf_vectorizer`` so that
        queries can later be projected into the same vector space.
        """
        print("‚ö†Ô∏è  Using TF-IDF fallback embeddings")
        from sklearn.feature_extraction.text import TfidfVectorizer

        vectorizer = TfidfVectorizer(max_features=1000, stop_words="english")
        tfidf = vectorizer.fit_transform(texts)
        self.tfidf_vectorizer = vectorizer
        self.embeddings = tfidf.toarray().astype("float32")
        print(f"‚úÖ TF-IDF embeddings created: {self.embeddings.shape}")
        return self.embeddings

    # -----------------------------------------------------------
    # 4. Create FAISS index
    # -----------------------------------------------------------
    def create_vector_index(self) -> "faiss.Index":
        """Build a flat inner-product FAISS index over the embeddings.

        A copy of the embeddings is L2-normalised before being added, so the
        inner product equals cosine similarity; ``self.embeddings`` itself is
        left unnormalised.

        Returns:
            The populated index (also stored on ``self.index``).

        Raises:
            ValueError: If embeddings have not been created yet.
        """
        if self.embeddings is None:
            raise ValueError("‚ùå No embeddings found. Run create_embeddings() first.")

        dim = self.embeddings.shape[1]
        print(f"üîç Creating FAISS index (dimension={dim})")

        self.index = faiss.IndexFlatIP(dim)
        emb_norm = self.embeddings.copy()
        faiss.normalize_L2(emb_norm)
        self.index.add(emb_norm)
        print(f"‚úÖ FAISS index built | Vectors: {self.index.ntotal}")
        print(f"üíæ Approx memory: {(self.index.ntotal * dim * 4) / 1024 / 1024:.2f} MB")
        return self.index

    def _embed_queries(self, queries: List[str]) -> np.ndarray:
        """Embed ``queries`` with the same encoder that produced the chunk embeddings.

        Returns a C-contiguous float32 array, which is what
        ``faiss.normalize_L2`` and ``index.search`` operate on.

        Raises:
            ValueError: If neither a transformer model nor a fitted TF-IDF
                vectorizer is available.
        """
        vectorizer = getattr(self, "tfidf_vectorizer", None)
        if self.model is not None:
            vectors = self.model.encode(queries)
        elif vectorizer is not None:
            vectors = vectorizer.transform(queries).toarray()
        else:
            raise ValueError(
                "No query encoder available: the transformer model failed to load "
                "and no TF-IDF vectorizer has been fitted. Run create_embeddings() first."
            )
        return np.ascontiguousarray(vectors, dtype="float32")

    # -----------------------------------------------------------
    # 5. Test retrieval with sample queries
    # -----------------------------------------------------------
    def test_retrieval(self, queries: List[str], k: int = 5):
        """Print the top-``k`` chunks for each query as a sanity check.

        Args:
            queries: Query strings to search for.
            k: Number of neighbours requested per query.

        Raises:
            ValueError: If the index has not been built, or no encoder is
                available to embed the queries.
        """
        if self.index is None:
            raise ValueError("‚ùå No FAISS index. Run create_vector_index() first.")

        print("üß™ Testing Retrieval System")
        print("=" * 50)
        for query in queries:
            print(f"\nüîç Query: {query}")
            q_emb = self._embed_queries([query])
            faiss.normalize_L2(q_emb)
            scores, indices = self.index.search(q_emb, k)
            for rank, (score, idx) in enumerate(zip(scores[0], indices[0]), start=1):
                # FAISS pads with -1 when fewer than k vectors are available;
                # a negative index would otherwise silently select the last chunk.
                if idx < 0 or idx >= len(self.chunks):
                    continue
                ch = self.chunks[idx]
                priority = ch.get("priority", "?")
                title = (ch.get("metadata") or {}).get("title", "?")
                chunk_type = ch.get("chunk_type", "?")
                print(f"   {rank}. [{priority}] {title} ({chunk_type})")
                print(f"      Score: {score:.3f}")
                print(f"      {ch['text'][:100]}...\n")

    # -----------------------------------------------------------
    # 6. Save everything
    # -----------------------------------------------------------
    def save_retrieval_system(self, out_prefix="data/processed/retrieval_system"):
        """Write embeddings, FAISS index and metadata next to ``out_prefix``.

        Three files are produced: ``<out_prefix>_embeddings.npy``,
        ``<out_prefix>_index.faiss`` and ``<out_prefix>_metadata.pkl``. The
        metadata additionally carries a ``"tfidf_vectorizer"`` entry when the
        TF-IDF fallback produced the embeddings.

        Args:
            out_prefix: Path prefix shared by the three output files; its
                parent directory is created if needed.

        Returns:
            Dict mapping ``"embeddings"``, ``"index"`` and ``"metadata"`` to
            the written file paths.

        Raises:
            ValueError: If embeddings or the index have not been created.
        """
        if self.embeddings is None:
            raise ValueError("No embeddings to save. Run create_embeddings() first.")
        if self.index is None:
            raise ValueError("No FAISS index to save. Run create_vector_index() first.")

        # Create the directory the caller actually asked for, not a fixed one.
        Path(out_prefix).parent.mkdir(parents=True, exist_ok=True)

        emb_file = f"{out_prefix}_embeddings.npy"
        index_file = f"{out_prefix}_index.faiss"
        meta_file = f"{out_prefix}_metadata.pkl"

        np.save(emb_file, self.embeddings)
        faiss.write_index(self.index, index_file)

        metadata = {
            "chunks": self.chunks,
            "model_name": self.model_name,
            "embedding_dimension": self.embeddings.shape[1],
            "total_chunks": len(self.chunks),
        }
        vectorizer = getattr(self, "tfidf_vectorizer", None)
        if vectorizer is not None:
            # Without the fitted vectorizer a reloaded TF-IDF system cannot embed queries.
            metadata["tfidf_vectorizer"] = vectorizer
        with open(meta_file, "wb") as f:
            pickle.dump(metadata, f)

        total_mb = sum(os.path.getsize(f) for f in [emb_file, index_file, meta_file]) / 1024 / 1024
        print(f"üíæ Saved retrieval system ‚Üí {out_prefix}_*  ({total_mb:.2f} MB total)")
        return {"embeddings": emb_file, "index": index_file, "metadata": meta_file}

# -------------------------------------------------------------------
# Runner for Codespaces
# -------------------------------------------------------------------
def run_embedding_creation():
    """Run step 3 end to end and hand back the populated ``EmbeddingCreator``.

    Steps, in order: load the hybrid chunks, embed them in batches of 32,
    build the FAISS index, print the top-3 hits for a few sanity queries,
    and save the retrieval system with its default output prefix.
    """
    banner = "üöÄ STEP 3: EMBEDDING CREATION & VECTOR DATABASE (Codespaces)"
    print(banner)
    print("=" * 70)

    pipeline = EmbeddingCreator()
    pipeline.load_hybrid_chunks()
    pipeline.create_embeddings(batch_size=32)
    pipeline.create_vector_index()

    # A handful of generic questions, just to eyeball retrieval quality.
    sanity_queries = [
        "What is physics?",
        "Define photosynthesis",
        "Explain DNA structure",
        "What is artificial intelligence?",
    ]
    pipeline.test_retrieval(sanity_queries, k=3)

    pipeline.save_retrieval_system()

    done_message = "\n‚úÖ STEP 3 COMPLETE! Ready for Step 4: Answer Generation."
    print(done_message)
    return pipeline

# Execute the full step-3 pipeline when this code is run directly rather than imported.
# NOTE(review): the "In [3]:" marker and captured output around this code suggest it
# was exported from a notebook cell, where __name__ is also "__main__" - confirm.
if __name__ == "__main__":
    # The returned EmbeddingCreator is bound to a module-level name,
    # presumably so later steps can reuse it - verify against callers.
    creator = run_embedding_creation()

üöÄ STEP 3: EMBEDDING CREATION & VECTOR DATABASE (Codespaces)
ü§ñ Loading embedding model: all-MiniLM-L6-v2
‚úÖ Model loaded successfully (CPU mode).
üìÇ Hybrid chunks found: /workspaces/Rag-Knowledge-Assiatant/notebooks/data/processed/hybrid_chunks.pkl
üì¶ Loading hybrid chunks from data/processed/hybrid_chunks.pkl...
‚úÖ Loaded 3022 chunks from 150 articles
üéØ High priority chunks: 280
üîÑ Creating embeddings for 3022 chunks using all-MiniLM-L6-v2 ...
   ‚è≥ Batch 1/95 (32 chunks)
   ‚è≥ Batch 2/95 (32 chunks)
   ‚è≥ Batch 3/95 (32 chunks)
   ‚è≥ Batch 4/95 (32 chunks)
   ‚è≥ Batch 5/95 (32 chunks)
   ‚è≥ Batch 6/95 (32 chunks)
   ‚è≥ Batch 7/95 (32 chunks)
   ‚è≥ Batch 8/95 (32 chunks)
   ‚è≥ Batch 9/95 (32 chunks)
   ‚è≥ Batch 10/95 (32 chunks)
   ‚è≥ Batch 11/95 (32 chunks)
   ‚è≥ Batch 12/95 (32 chunks)
   ‚è≥ Batch 13/95 (32 chunks)
   ‚è≥ Batch 14/95 (32 chunks)
   ‚è≥ Batch 15/95 (32 chunks)
   ‚è≥ Batch 16/95 (32 chunks)
   ‚è≥ Batch 17/95 (32 chunks)
   ‚è≥ Batch 18/9