In [1]:
#!/usr/bin/env python3
"""
semantic_search_engine.py

A local semantic search engine for Reddit life advice, with RAG summaries powered by Hugging Face.
"""

import json
import os
import pickle
import textwrap
from pathlib import Path
from typing import Dict, List

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

try:
    from tqdm import tqdm
except ImportError:
    # tqdm is optional: without it, iteration still works, just without a progress bar.
    def tqdm(x, *args, **kwargs):
        """Stand-in for ``tqdm.tqdm`` that hands back the iterable untouched.

        Extra positional and keyword arguments (``desc=...`` and friends) are
        accepted and ignored so call sites need no changes.
        """
        return x


class SemanticSearchEngine:
    def __init__(self,
                 model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
                 index_dir: str | Path = "index",
                 use_gpu: bool = False):
        self.index_dir = Path(index_dir)
        self.index_path = self.index_dir / "faiss.index"
        self.meta_path = self.index_dir / "metadata.pkl"
        self.index_dir.mkdir(parents=True, exist_ok=True)

        self.model = SentenceTransformer(model_name)
        if use_gpu:
            self.model = self.model.to("cuda")

        self.index: faiss.Index | None = None
        self.metadata: List[Dict] = []

        # Local RAG model setup (Flan-T5)
        print("🔍 Loading local RAG model (flan-t5-large)...")
        self.rag_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
        self.rag_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
        self.rag_pipeline = pipeline("text2text-generation", model=self.rag_model, tokenizer=self.rag_tokenizer)

    @staticmethod
    def _normalise(emb: np.ndarray) -> np.ndarray:
        norm = np.linalg.norm(emb, axis=1, keepdims=True)
        return emb / np.maximum(norm, 1e-12)

    def build(self, docs: List[str], metas: List[Dict], hnsw_m: int = 32):
        if len(docs) != len(metas):
            raise ValueError("Mismatch between docs and metas")

        print(f"📦 Encoding {len(docs)} documents...")
        embeddings = self.model.encode(docs, batch_size=128, show_progress_bar=True, convert_to_numpy=True)
        embeddings = self._normalise(embeddings.astype('float32'))

        dim = embeddings.shape[1]
        self.index = faiss.IndexHNSWFlat(dim, hnsw_m, faiss.METRIC_INNER_PRODUCT)
        self.index.hnsw.efConstruction = 200

        print("📌 Adding vectors to index...")
        self.index.add(embeddings)
        self.metadata = metas
        self.save()
        print(f"✅ Index built with {self.index.ntotal} vectors → {self.index_path.resolve()}")

    def save(self):
        faiss.write_index(self.index, str(self.index_path))
        with open(self.meta_path, "wb") as f:
            pickle.dump(self.metadata, f)

    def load(self):
        if self.index is None:
            self.index = faiss.read_index(str(self.index_path))
        if not self.metadata:
            with open(self.meta_path, "rb") as f:
                self.metadata = pickle.load(f)

    def search(self, query: str, top_k: int = 5) -> List[Dict]:
        self.load()
        q_emb = self.model.encode([query], convert_to_numpy=True)
        q_emb = self._normalise(q_emb.astype('float32'))

        scores, idxs = self.index.search(q_emb, top_k)
        results = []
        for score, idx in zip(scores[0], idxs[0]):
            item = self.metadata[idx].copy()
            item["score"] = float(score)
            results.append(item)
        return results

    def local_rag_answer(self, query: str, context_k: int = 5) -> str:
        contexts = self.search(query, top_k=context_k)
        context_texts = [
            f"Title: {c['title']}\nAdvice: {c.get('top_comment', '')}"
            for c in contexts if c.get("top_comment")
        ]

        prompt_body = "\n\n".join(context_texts)
        prompt = (
            f"Based on the Reddit advice below, answer the QUESTION in 3–5 bullet points.\n\n"
            f"REDDIT ADVICE:\n{prompt_body}\n\nQUESTION: {query}"
        )

        # Truncate to model's limit (512 tokens)
        inputs = self.rag_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

        result = self.rag_pipeline(prompt, max_length=256, do_sample=True, temperature=0.7)
        return textwrap.fill(result[0]["generated_text"], 100)


def parse_reddit_jsonl(path: str) -> tuple[List[str], List[Dict]]:
    """Read a JSONL dump of Reddit posts into parallel document/metadata lists.

    Each non-blank line must be a JSON object. The fields ``title``,
    ``selftext`` and ``comments`` (a list of strings) are joined into the
    document text; missing or ``null`` fields are treated as empty.

    Args:
        path: Path to the UTF-8 encoded JSONL file.

    Returns:
        ``(docs, metas)`` where ``docs[i]`` is the text to embed and
        ``metas[i]`` holds ``title``, ``url``, ``post_score``, ``subreddit``,
        ``created_utc`` and ``top_comment`` for the same post.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    docs, metas = [], []
    with open(path, "r", encoding="utf-8") as fh:
        for line in tqdm(fh, desc="Reading JSONL"):
            if not line.strip():
                # Tolerate blank separator/trailing lines instead of crashing.
                continue
            item = json.loads(line)
            # `or` also covers explicit JSON nulls, which .get()'s default does not.
            title = (item.get("title") or "").strip()
            selftext = (item.get("selftext") or "").strip()
            comments = item.get("comments") or []
            comment_str = " ".join(comments).strip()
            # Skip empty parts so absent fields do not leave doubled spaces.
            full_text = " ".join(part for part in (title, selftext, comment_str) if part)

            docs.append(full_text)
            metas.append({
                "title": title or full_text[:60] + "…",
                "url": item.get("url"),
                "post_score": item.get("score", 0),
                "subreddit": item.get("subreddit", ""),
                "created_utc": item.get("created_utc", ""),
                "top_comment": comments[0] if comments else "",
            })
    return docs, metas


# Script entry point: build the index from the JSONL dataset, then run one demo query.
if __name__ == "__main__":
    # Path is relative to the current working directory.
    dataset_path = "reddit_wisdom_data.jsonl"

    # Constructing the engine loads both the embedding model and the RAG model.
    engine = SemanticSearchEngine(index_dir="index")
    docs, metas = parse_reddit_jsonl(dataset_path)
    # build() encodes every document and writes the index + metadata to disk.
    engine.build(docs, metas)
    print("🔍 Search engine index built and saved.")

    # Smoke test: generate a RAG summary for a sample question.
    query = "How do I stay motivated to exercise?"
    print("\n💬 Local RAG summary:")
    print(engine.local_rag_answer(query))


  from .autonotebook import tqdm as notebook_tqdm


🔍 Loading local RAG model (flan-t5-large)...


Device set to use mps:0
Reading JSONL: 14it [00:00, 8041.67it/s]


📦 Encoding 14 documents...


Batches: 100%|██████████| 1/1 [00:02<00:00,  2.59s/it]


📌 Adding vectors to index...
✅ Index built with 14 vectors → /Users/martinkrawtzow/Library/CloudStorage/OneDrive-MichaelMöhleundRainerBraker/Studium/Master/Semester2/Social Media Analytics/SMA_Capstone/index/faiss.index
🔍 Search engine index built and saved.

💬 Local RAG summary:
Tip: Ask yourself what you want to achieve or why you want to exercise. Try to look at your goals as
goals, instead of adversities.
