In [9]:
#!/usr/bin/env python3
"""
semantic_search_engine.py

A modular semantic search engine for a Reddit life-advice corpus.

This version is configured for interactive use in notebooks.
It loads a dataset from a hardcoded path and builds the FAISS index.
"""
from __future__ import annotations

import argparse
import json
import os
import pickle
from pathlib import Path
from typing import Dict, List

from __future__ import annotations
import json, os, pickle, textwrap
from pathlib import Path
from typing import Dict, List

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

try:
    from tqdm import tqdm
except ImportError:
    # tqdm is an optional dependency: without it we simply iterate with no
    # progress bar instead of failing at import time.
    def tqdm(x, *args, **kwargs):
        """Stand-in for ``tqdm.tqdm`` that returns the iterable unchanged.

        Any extra positional or keyword arguments (e.g. ``desc=...``) are
        accepted and ignored so call sites do not need to change.
        """
        return x

class SemanticSearchEngine:
    def __init__(self,
                 model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
                 index_dir: str | Path = "index",
                 use_gpu: bool = False):
        self.index_dir = Path(index_dir)
        self.index_path = self.index_dir / "faiss.index"
        self.meta_path = self.index_dir / "metadata.pkl"
        self.index_dir.mkdir(parents=True, exist_ok=True)

        self.model = SentenceTransformer(model_name)
        if use_gpu:
            self.model = self.model.to("cuda")

        self.index: faiss.Index | None = None
        self.metadata: List[Dict] = []

    @staticmethod
    def _normalise(emb: np.ndarray) -> np.ndarray:
        norm = np.linalg.norm(emb, axis=1, keepdims=True)
        return emb / np.maximum(norm, 1e-12)

    def build(self, docs: List[str], metas: List[Dict], hnsw_m: int = 32):
        if len(docs) != len(metas):
            raise ValueError("Mismatch between docs and metas")

        print(f"Encoding {len(docs)} documents...")
        embeddings = self.model.encode(docs, batch_size=128, show_progress_bar=True, convert_to_numpy=True)
        embeddings = self._normalise(embeddings.astype('float32'))

        dim = embeddings.shape[1]
        self.index = faiss.IndexHNSWFlat(dim, hnsw_m, faiss.METRIC_INNER_PRODUCT)
        self.index.hnsw.efConstruction = 200

        self.index.add(embeddings)
        self.metadata = metas
        self.save()
        print(f"✓ Index built with {self.index.ntotal} vectors.")

    def save(self):
        faiss.write_index(self.index, str(self.index_path))
        with open(self.meta_path, "wb") as f:
            pickle.dump(self.metadata, f)

    def load(self):
        if self.index is None:
            self.index = faiss.read_index(str(self.index_path))
        if not self.metadata:
            with open(self.meta_path, "rb") as f:
                self.metadata = pickle.load(f)

    def search(self, query: str, top_k: int = 5) -> List[Dict]:
        self.load()
        q_emb = self.model.encode([query], convert_to_numpy=True)
        q_emb = self._normalise(q_emb.astype('float32'))

        scores, idxs = self.index.search(q_emb, top_k)
        results = []
        for score, idx in zip(scores[0], idxs[0]):
            item = self.metadata[idx].copy()
            item["score"] = float(score)
            results.append(item)
        return results

    def rag_answer(self, query: str, context_k: int = 5, model: str = "gpt-3.5-turbo-0125",
                   openai_api_key: str | None = None) -> str:
        import openai

        openai.api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
        if not openai.api_key:
            raise EnvironmentError("OPENAI_API_KEY not set")

        contexts = self.search(query, top_k=context_k)
        concatenated = "\n\n".join(
            f"Title: {c['title']}\nAdvice: {c.get('top_comment', 'N/A')}\nURL: {c['url']}"
            for c in contexts
        )

        system_prompt = "You are a friendly assistant who provides concise, actionable life advice."
        user_prompt = (
            f"Based on the Reddit advice below, answer the QUESTION in 3–5 bullet points.\n\n"
            f"REDDIT ADVICE:\n{concatenated}\n\nQUESTION: {query}"
        )

        response = openai.ChatCompletion.create(
            model=model,
            messages=[{"role": "system", "content": system_prompt},
                      {"role": "user", "content": user_prompt}],
            temperature=0.7,
        )
        return textwrap.fill(response.choices[0].message.content.strip(), 100)


def parse_reddit_jsonl(path: str) -> tuple[List[str], List[Dict]]:
    """Read a JSONL dump of Reddit posts into parallel document/metadata lists.

    Each non-blank line must be a JSON object. The keys read are ``title``,
    ``selftext``, ``comments`` (joined as strings), ``url``, ``score``,
    ``subreddit`` and ``created_utc``; missing or ``null`` text fields are
    treated as empty.

    Args:
        path: Path to the UTF-8 encoded JSONL file.

    Returns:
        ``(docs, metas)`` where ``docs[i]`` is the searchable text for post
        ``i`` (title, body and comments joined by single spaces) and
        ``metas[i]`` is its display metadata.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    docs: List[str] = []
    metas: List[Dict] = []
    with open(path, "r", encoding="utf-8") as fh:
        for line in tqdm(fh, desc="Reading JSONL"):
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            item = json.loads(line)

            # `or ""` / `or []` also covers explicit JSON nulls, which
            # dict.get's default does not.
            title = (item.get("title") or "").strip()
            selftext = (item.get("selftext") or "").strip()
            comments = item.get("comments") or []
            comment_str = " ".join(comments).strip()

            # Skip empty parts so a missing body does not leave double spaces.
            full_text = " ".join(part for part in (title, selftext, comment_str) if part)

            if title:
                display_title = title
            elif len(full_text) > 60:
                display_title = full_text[:60] + "…"
            else:
                # Nothing was cut off, so no ellipsis.
                display_title = full_text

            metas.append({
                "title": display_title,
                "url": item.get("url"),
                "post_score": item.get("score", 0),
                "subreddit": item.get("subreddit", ""),
                "created_utc": item.get("created_utc"),
                "top_comment": comments[0] if comments else "",
            })
            docs.append(full_text)
    return docs, metas


if __name__ == "__main__":
    # Build the index from the bundled dataset. `engine` is left in the
    # notebook namespace on purpose so later cells can query it.
    dataset_path = "reddit_wisdom_data.jsonl"
    engine = SemanticSearchEngine(index_dir="index")

    docs, metas = parse_reddit_jsonl(dataset_path)
    engine.build(docs, metas, hnsw_m=16)
    print("Search engine index built and saved.")

    # Sample search
    results = engine.search("How do I become more confident?", top_k=5)
    for rank, hit in enumerate(results, start=1):
        summary = f"{rank}. {hit['title']} → {hit['url']} (score: {hit['score']:.3f})"
        print(summary)


Reading JSONL: 260it [00:00, 61859.38it/s]


Encoding 260 documents...


Batches: 100%|██████████| 3/3 [00:01<00:00,  1.79it/s]

✓ Index built with 260 vectors.
Search engine index built and saved.
1. LPT If you're an introvert, you don’t need to force yourself to become an extrovert to make friends—just embrace your natural traits. → https://www.reddit.com/r/LifeProTips/comments/1jchdg1/lpt_if_youre_an_introvert_you_dont_need_to_force/ (score: 0.416)
2. LPT - When preparing for a job interview, record yourself. → https://www.reddit.com/r/LifeProTips/comments/1j6jklq/lpt_when_preparing_for_a_job_interview_record/ (score: 0.369)
3. LPT: If you're learning any new skill, don't compare your beginning to someone else's middle. Everyone starts somewhere, and consistent practice matters more than natural talent. → https://www.reddit.com/r/LifeProTips/comments/1khlczv/lpt_if_youre_learning_any_new_skill_dont_compare/ (score: 0.341)
4. LPT Request: How to deal with stress before giving a presentation? → https://www.reddit.com/r/LifeProTips/comments/1k7mdc5/lpt_request_how_to_deal_with_stress_before_giving/ (score: 0.330




In [12]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Load the pretrained google/flan-t5-large seq2seq model and its tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")

# Wrap both in a text2text-generation pipeline used for local RAG answers.
rag_pipeline = pipeline(task="text2text-generation", tokenizer=tokenizer, model=model)

def local_rag_answer(query: str, context_k: int = 5, max_input_tokens: int = 512) -> str:
    """Answer ``query`` with the local seq2seq pipeline, grounded on retrieved posts.

    The retrieved context is trimmed to a token budget *before* the prompt is
    assembled, so the instruction and the trailing ``QUESTION:`` line always
    fit within ``max_input_tokens``. Relying only on the pipeline's
    ``truncation=True`` cuts the end of an over-long prompt, which drops the
    question itself.

    Args:
        query: The user's question.
        context_k: Number of retrieved posts to include as context.
        max_input_tokens: Input token limit to fit the prompt into.

    Returns:
        The generated text. Generation samples (``do_sample=True``), so the
        output varies between calls.
    """
    contexts = engine.search(query, top_k=context_k)

    context_texts = [
        f"Title: {c['title']}\nAdvice: {c.get('top_comment', '')}"
        for c in contexts
    ]
    prompt_body = "\n\n".join(context_texts)

    instruction = (
        "Based on the Reddit advice below, answer the QUESTION in 3–5 bullet points.\n\n"
        "REDDIT ADVICE:\n"
    )
    question = f"\n\nQUESTION: {query}"

    # Tokens consumed by the fixed parts of the prompt (special tokens included).
    fixed_tokens = len(tokenizer(instruction + question)["input_ids"])
    # Decoding and re-encoding a token prefix can drift by a few tokens, so
    # keep a small safety margin.
    safety_margin = 16
    budget = max_input_tokens - fixed_tokens - safety_margin

    if budget <= 0:
        # The question alone (nearly) fills the window; send it without context.
        prompt_body = ""
    else:
        # Ask for one token more than the budget purely to detect overflow.
        body_ids = tokenizer(
            prompt_body, add_special_tokens=False, truncation=True, max_length=budget + 1
        )["input_ids"]
        if len(body_ids) > budget:
            prompt_body = tokenizer.decode(body_ids[:budget], skip_special_tokens=True)

    prompt = f"{instruction}{prompt_body}{question}"

    # truncation=True is kept only as a last-resort guard against residual overflow.
    result = rag_pipeline(prompt, max_length=256, do_sample=True, temperature=0.7, truncation=True)
    return result[0]['generated_text']



# Smoke test: ask the local RAG helper a sample question.
# Output varies between runs because local_rag_answer samples (do_sample=True).
print(local_rag_answer("How do I become more confident?"))



Device set to use mps:0


Advice: ### This post has been marked as safe. Upvoting/downvoting this comment will have no effect. ---


In [13]:
# Second sample question, answered via the same local_rag_answer helper.
print(local_rag_answer("How do I connect with people?"))

Advice: Consistency is key to starting a friendship, that's where school or work can help, because you are forced to be with eachother 5 days a week. Title: LPT If you're an introvert, you don't need to force yourself to become an extrovert to make friends—just embrace your natural traits.
