In [4]:
!pip install llama-parse langchain-text-splitters
!pip install pymupdf
!pip install -U sentence-transformers faiss-cpu
!pip install -U sentence-transformers faiss-cpu transformers accelerate einops

# !pip install gradio==4.0.0



In [5]:
import torch
import os
from google.colab import drive
import json
from pathlib import Path
import pandas as pd
import glob
from google.colab import userdata
import time
import numpy as np
from tqdm import tqdm # For progress bar
import fitz # To work with pdf documents

from llama_parse import LlamaParse
from langchain_text_splitters import RecursiveCharacterTextSplitter
from collections import Counter
from sentence_transformers import SentenceTransformer
import faiss # FAISS (Facebook AI Similarity Search) for efficient similarity search and clustering of dense vectors
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


In [6]:
# import torch
if torch.cuda.is_available():
    print(f"Success! GPU Detected: {torch.cuda.get_device_name(0)}")
else:
    print("Failure. Running on CPU.")

Success! GPU Detected: NVIDIA A100-SXM4-80GB


In [7]:
import os
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
dataset_path = "/content/drive/MyDrive/NLP/dataset/Reddit/"
dataset_path

'/content/drive/MyDrive/NLP/dataset/Reddit/'

# Reddit data preprocessing

In [None]:
SUBREDDITs = ["AskEconomics", "Economics", "investing", "StockMarket", "stocks", "wallstreetbets"]
for SUBREDDIT in SUBREDDITs:
  POSTS_FILE = f"{dataset_path}r_{SUBREDDIT}_posts.jsonl"
  COMMENTS_FILE = f"{dataset_path}r_{SUBREDDIT}_comments.jsonl"
  OUT_FILE = f"{dataset_path}r_{SUBREDDIT}_processed.jsonl"

  print(f"\nProcessing r/{SUBREDDIT} ...")

  def extract_post_fields(obj):
      keep = [
          "name", "title", "selftext", "url",
          "ups", "score", "upvote_ratio",
          "subreddit", "subreddit_name_prefixed",
          "created_utc"
      ]
      return {k: obj.get(k, None) for k in keep}

  def extract_comment_fields(obj):
      keep = [
          "name", "body", "link_id", "parent_id",
          "ups", "score", "upvote_ratio",
          "subreddit", "subreddit_name_prefixed",
          "created_utc"
      ]
      return {k: obj.get(k, None) for k in keep}

  #  1. Load posts
  posts = {}  # name (t3_*) -> post dict

  with open(POSTS_FILE, "r") as f:
      for line in f:
          raw = json.loads(line)
          p = extract_post_fields(raw)
          # Only store if it has a title (sanity check)
          if p.get("name") and p.get("title") is not None:
              posts[p["name"]] = p

  print("Loaded posts:", len(posts))

  # 2. Load comments
  comments_by_link = {}  # link_id (t3_*) -> list of comments

  with open(COMMENTS_FILE, "r") as f:
      for line in f:
          raw = json.loads(line)
          c = extract_comment_fields(raw)
          if not c.get("name") or not c.get("body"):
              continue
          link_id = c.get("link_id")
          if not link_id:
              continue
          comments_by_link.setdefault(link_id, []).append(c)

  print("Loaded comment groups (by post):", len(comments_by_link))

  # 3. Format one thread (post + comments + replies) into text

  def format_thread_text(post, comment_list):
      text_parts = []

      title = post.get("title", "") or ""
      selftext = post.get("selftext", "") or ""
      url = post.get("url", "") or ""

      text_parts.append(f"[POST] {title}")
      if url:
          text_parts.append(f"URL: {url}")
      if selftext.strip():
          text_parts.append("\n" + selftext.strip())

      if comment_list:
          text_parts.append("\n\n[COMMENTS]")

      # index comments by parent → children
      by_parent = {}
      for c in comment_list:
          parent = c.get("parent_id")
          by_parent.setdefault(parent, []).append(c)

      # top-level comments: parent_id == link_id
      for c in comment_list:
          parent_id = c.get("parent_id")
          link_id = c.get("link_id")
          if parent_id == link_id:
              body = (c.get("body") or "").strip()
              if not body:
                  continue
              text_parts.append(f"- {body}")

              # replies
              children = by_parent.get(c.get("name"), [])
              for r in children:
                  rbody = (r.get("body") or "").strip()
                  if not rbody:
                      continue
                  text_parts.append(f"  ↳ Reply: {rbody}")

      return "\n".join(text_parts).strip()

  # 4. Build processed thread docs

  processed_docs = []

  for post_id, post in posts.items():
      comment_list = comments_by_link.get(post_id, [])
      full_text = format_thread_text(post, comment_list)

      doc = {
          "id": f"reddit_thread_{post_id}",
          "source": "reddit",
          "subreddit": post.get("subreddit"),
          "subreddit_name_prefixed": post.get("subreddit_name_prefixed"),
          "title": post.get("title"),
          "url": post.get("url"),
          "text": full_text,
          "meta": {
              "ups": post.get("ups"),
              "score": post.get("score"),
              "upvote_ratio": post.get("upvote_ratio"),
              "created_utc": post.get("created_utc"),
              "post_id": post.get("name"),
              "num_comments": len(comment_list),
          }
      }
      processed_docs.append(doc)

  print("Processed thread docs:", len(processed_docs))

  # 5. Save to processed file

  with open(OUT_FILE, "w") as f:
      for d in processed_docs:
          f.write(json.dumps(d) + "\n")

  print(f"Saved {len(processed_docs)} threads to {OUT_FILE}")



Processing r/AskEconomics ...
Loaded posts: 31298
Loaded comment groups (by post): 32005
Processed thread docs: 31298
Saved 31298 threads to /content/drive/MyDrive/NLP/dataset/Reddit/r_AskEconomics_processed.jsonl

Processing r/Economics ...
Loaded posts: 38358
Loaded comment groups (by post): 33095
Processed thread docs: 38358
Saved 38358 threads to /content/drive/MyDrive/NLP/dataset/Reddit/r_Economics_processed.jsonl

Processing r/investing ...
Loaded posts: 32303
Loaded comment groups (by post): 30742
Processed thread docs: 32303
Saved 32303 threads to /content/drive/MyDrive/NLP/dataset/Reddit/r_investing_processed.jsonl

Processing r/StockMarket ...
Loaded posts: 15424
Loaded comment groups (by post): 8940
Processed thread docs: 15424
Saved 15424 threads to /content/drive/MyDrive/NLP/dataset/Reddit/r_StockMarket_processed.jsonl

Processing r/stocks ...
Loaded posts: 42758
Loaded comment groups (by post): 22429
Processed thread docs: 42758
Saved 42758 threads to /content/drive/MyDr

In [8]:
reddit_dataset = "/content/drive/MyDrive/NLP/dataset/Reddit/*_processed.jsonl"
top10stock_dataset = "/content/drive/MyDrive/NLP/dataset/top10Stocks/*.csv"
books_dataset = "/content/drive/MyDrive/NLP/dataset/books/*.pdf"
papers_dataset = "/content/drive/MyDrive/NLP/dataset/papers/*.pdf"

In [9]:
df = pd.read_csv("/content/drive/MyDrive/NLP/dataset/top10Stocks/META.csv")
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,24-Nov-25,598.72,615.4,597.63,614.69,614.69,10708266
1,21-Nov-25,588.5,598.12,581.86,594.25,594.25,20977200
2,20-Nov-25,603.5,606.72,583.35,589.15,589.15,20603000
3,19-Nov-25,593.72,595.33,581.25,590.32,590.32,24744700
4,18-Nov-25,591.6,603.66,583.78,597.69,597.69,25500600


# CSV file preprocessing

In [10]:
top10stock_dataset = "/content/drive/MyDrive/NLP/dataset/top10Stocks/"

def preprocess_stock_csv(csv_path, ticker):
    # Read CSV, allow numbers with commas (e.g., 19,062,811)
    df = pd.read_csv(csv_path, thousands=",")

    # Strip whitespace from column names: 'Close ' -> 'Close'
    df.columns = df.columns.str.strip()

    # Parse dates like "24-Nov-25"
    df['Date'] = pd.to_datetime(
        df['Date'].astype(str).str.strip(),
        format="%d-%b-%y",
        errors="coerce"
    )

    processed_docs = []

    for _, row in df.iterrows():
        # Skip invalid dates
        if pd.isna(row['Date']):
            continue

        # Skip dividend rows (any cell containing the word "Dividend") because some rows have dividends
        if "Dividend" in str(row.values):
            continue

        # Ensure required numeric columns exist and are not NaN
        required_cols = ["Open", "High", "Low", "Close", "Adj Close", "Volume"]
        if any((col not in row) or pd.isna(row[col]) for col in required_cols):
            continue

        date = row['Date'].strftime("%Y-%m-%d")
        open_p = float(row["Open"])
        high_p = float(row["High"])
        low_p  = float(row["Low"])
        close_p = float(row["Close"])
        adj_close = float(row["Adj Close"])
        vol = int(row["Volume"])

        text = (
            f"On {row['Date'].strftime('%d %b %Y')}, {ticker} closed at ${close_p:.2f} "
            f"(Open: {open_p:.2f}, High: {high_p:.2f}, Low: {low_p:.2f}). "
            f"Trading volume was {vol:,} shares."
        )

        doc = {
            "id": f"{ticker}_{date}",
            "source": "stocks",
            "ticker": ticker,
            "date": date,
            "text": text,
            "meta": {
                "open": open_p,
                "high": high_p,
                "low": low_p,
                "close": close_p,
                "adj_close": adj_close,
                "volume": vol
            }
        }

        processed_docs.append(doc)

    return processed_docs


# Process all 10 stocks & combine into one JSONL

stocks = ["AMZN","MSFT","NVDA","AVGO","ERIE","GOOGL","META","NOW","PYPL","CMG"]

all_docs = []

for stock in stocks:
    csv_path = f"{top10stock_dataset}{stock}.csv"
    print(f"Processing: {csv_path}")
    docs = preprocess_stock_csv(csv_path, stock)
    all_docs.extend(docs)

output_file = f"{top10stock_dataset}top10stocks_processed.jsonl"

with open(output_file, "w") as f:
    for d in all_docs:
        f.write(json.dumps(d) + "\n")

print(f"\nSaved {len(all_docs)} stock documents → {output_file}")


Processing: /content/drive/MyDrive/NLP/dataset/top10Stocks/AMZN.csv
Processing: /content/drive/MyDrive/NLP/dataset/top10Stocks/MSFT.csv
Processing: /content/drive/MyDrive/NLP/dataset/top10Stocks/NVDA.csv
Processing: /content/drive/MyDrive/NLP/dataset/top10Stocks/AVGO.csv
Processing: /content/drive/MyDrive/NLP/dataset/top10Stocks/ERIE.csv
Processing: /content/drive/MyDrive/NLP/dataset/top10Stocks/GOOGL.csv
Processing: /content/drive/MyDrive/NLP/dataset/top10Stocks/META.csv
Processing: /content/drive/MyDrive/NLP/dataset/top10Stocks/NOW.csv
Processing: /content/drive/MyDrive/NLP/dataset/top10Stocks/PYPL.csv
Processing: /content/drive/MyDrive/NLP/dataset/top10Stocks/CMG.csv

Saved 2500 stock documents → /content/drive/MyDrive/NLP/dataset/top10Stocks/top10stocks_processed.jsonl


`LLama cloud API key:` llx-hMB9rZehfhhGgS6WyyjxRJrZfVPWaDVysS3dtyzkcDvH4fDZ

# PDF files preprocessing

In [None]:
# Paths
books_dataset = "/content/drive/MyDrive/NLP/dataset/books/*.pdf"
papers_dataset = "/content/drive/MyDrive/NLP/dataset/papers/*.pdf"

# Output JSONL
pdf_output_file = "/content/drive/MyDrive/NLP/dataset/pdf_processed.jsonl"

# LlamaParse setup, generate API key from LlamaCloud
LLAMA_PARSE_API_KEY = "llx-hMB9rZehfhhGgS6WyyjxRJrZfVPWaDVysS3dtyzkcDvH4fDZ"  # "API_KEY"

parsing_instruction = """
Extract all content from this financial/investing document in clean, structured Markdown.

Requirements:
- Preserve headings, subheadings, bullet points, numbered lists, and tables.
- Do NOT summarize or add interpretations.
- Do NOT rewrite or rephrase; extract text faithfully.
- If images, ignore them.
- Keep numeric values, financial terms, and table content intact.
- Try to be precise while answering the questions
"""

parser = LlamaParse(
    api_key=LLAMA_PARSE_API_KEY,
    result_type="markdown",
    # parsing_instruction=parsing_instruction,
    system_prompt=parsing_instruction,
    max_timeout=5000,
)

# Text splitter for RAG chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
    separators=["\n## ", "\n# ", "\n\n", "\n", " ", ""],
)

# Helper to parse + chunk a single PDF

def process_single_pdf(pdf_path: str, category: str):
    """
    pdf_path: full path to the PDF
    category: 'book' or 'paper' (or any label you want)
    """
    print(f"Parsing PDF: {pdf_path}")

    # LlamaParse sync call
    llama_docs = parser.load_data(pdf_path)

    # Some PDFs may be split into multiple internal docs → join them
    full_text = "\n\n".join(doc.text for doc in llama_docs)

    # Split into chunks for RAG
    chunks = splitter.split_text(full_text)
    print(f"  -> {len(chunks)} chunks")

    filename = os.path.basename(pdf_path)

    processed = []
    for i, chunk in enumerate(chunks):
        doc_id = f"pdf_{category}_{os.path.splitext(filename)[0]}_chunk_{i+1:03}"

        processed.append({
            "id": doc_id,
            "source": "pdf",
            "category": category,          # 'book' or 'paper'
            "file_name": filename,
            "text": chunk,
            "meta": {
                "category": category,
                "file_path": pdf_path,
            }
        })

    return processed

# Collect all PDFs

book_files = glob.glob(books_dataset)
paper_files = glob.glob(papers_dataset)

print("Found book PDFs:", len(book_files))
print("Found paper PDFs:", len(paper_files))

all_pdf_docs = []

# Process books
for pdf_path in book_files:
    all_pdf_docs.extend(process_single_pdf(pdf_path, category="book"))

# Process papers
for pdf_path in paper_files:
    all_pdf_docs.extend(process_single_pdf(pdf_path, category="paper"))

print(f"\nTotal PDF chunks: {len(all_pdf_docs)}")

# Save everything to JSONL

os.makedirs(os.path.dirname(pdf_output_file), exist_ok=True)

with open(pdf_output_file, "w") as f:
    for doc in all_pdf_docs:
        f.write(json.dumps(doc) + "\n")

print(f"Saved all PDF chunks to: {pdf_output_file}")

In [12]:
# Path
books_pattern = "/content/drive/MyDrive/NLP/dataset/books/*.pdf"
papers_pattern = "/content/drive/MyDrive/NLP/dataset/papers/*.pdf"

pdf_output_file = "/content/drive/MyDrive/NLP/dataset/pdf_processed.jsonl"

# PDF text extraction function
def extract_text_from_pdf(pdf_path: str) -> str:
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Could not open {pdf_path}: {e}")
        return ""

    texts = []
    for page in doc:
        texts.append(page.get_text("text"))
    doc.close()

    return "\n\n".join(texts).strip()

# RAG text splitter
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
    separators=["\n## ", "\n# ", "\n\n", "\n", " ", ""],
)

# Process one PDF
def process_single_pdf(pdf_path, category):
    print(f"\nProcessing {pdf_path}")

    full_text = extract_text_from_pdf(pdf_path)

    if not full_text:
        print("No text extracted, skipping.")
        return []

    chunks = splitter.split_text(full_text)
    print(f"  -> {len(chunks)} chunks")

    base = os.path.splitext(os.path.basename(pdf_path))[0]

    docs = []
    for i, chunk in enumerate(chunks):
        docs.append({
            "id": f"pdf_{category}_{base}_chunk_{i+1:03}",
            "source": "pdf",
            "category": category,
            "file_name": os.path.basename(pdf_path),
            "text": chunk,
            "meta": {
                "file": pdf_path,
                "category": category
            }
        })

    return docs

# Collect all PDFs
books = glob.glob(books_pattern)
papers = glob.glob(papers_pattern)

print("Found books:", len(books))
print("Found papers:", len(papers))

all_pdf_docs = []

for pdf in books:
    all_pdf_docs.extend(process_single_pdf(pdf, "book"))

for pdf in papers:
    all_pdf_docs.extend(process_single_pdf(pdf, "paper"))

print(f"\nTotal chunks generated: {len(all_pdf_docs)}")

# Save to JSONL
with open(pdf_output_file, "w") as f:
    for d in all_pdf_docs:
        f.write(json.dumps(d) + "\n")

print(f"Saved output → {pdf_output_file}")


Found books: 13
Found papers: 9

Processing /content/drive/MyDrive/NLP/dataset/books/The Intelligent Investor.pdf
  -> 1165 chunks

Processing /content/drive/MyDrive/NLP/dataset/books/one-up-on-wall-street.pdf
  -> 502 chunks

Processing /content/drive/MyDrive/NLP/dataset/books/the-psychology-of-money.pdf
  -> 323 chunks

Processing /content/drive/MyDrive/NLP/dataset/books/the_most_important_thing.pdf
  -> 432 chunks

Processing /content/drive/MyDrive/NLP/dataset/books/A Random Walk Down Wall Street_ The Time-Tested Strategy for Successful Investing (Eleventh Edition).pdf
  -> 686 chunks

Processing /content/drive/MyDrive/NLP/dataset/books/William_J_Oneil_-_How_To_Make_Money_In_Stocks.pdf
  -> 418 chunks

Processing /content/drive/MyDrive/NLP/dataset/books/guide-for-beginners--2024-30-01-11-53-37.pdf
  -> 103 chunks

Processing /content/drive/MyDrive/NLP/dataset/books/Common Stocks and Uncommon Profits - Philip A. Fisher.pdf
  -> 572 chunks

Processing /content/drive/MyDrive/NLP/datase

# Creating a corpus with all the data combined.

In [13]:
processed_dir = "/content/drive/MyDrive/NLP/dataset/"

os.makedirs(processed_dir, exist_ok=True)

corpus_path = os.path.join(processed_dir, "corpus.jsonl")

def load_jsonl(path):
    docs = []
    with open(path, "r") as f:
        for i, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
                if "text" in obj and isinstance(obj["text"], str) and obj["text"].strip():
                    docs.append(obj)
            except json.JSONDecodeError as e:
                print(f"JSON error in {path}, line {i}: {e}")
    print(f"Loaded {len(docs):5d} docs from {os.path.basename(path)}")
    return docs

# Get all .jsonl files in the directory (except an existing corpus.jsonl)
jsonl_files = glob.glob(os.path.join(processed_dir, "*.jsonl"))
jsonl_files = [f for f in jsonl_files if os.path.basename(f) != "corpus.jsonl"]

print("Found JSONL files:")
for f in jsonl_files:
    print("  -", os.path.basename(f))

all_docs = []
for f in jsonl_files:
    all_docs.extend(load_jsonl(f))

print(f"\nTotal docs from all sources: {len(all_docs)}")

# Quick stats by source field (if present)
source_counts = Counter(doc.get("source", "unknown") for doc in all_docs)
print("\nDocs per source:")
for s, c in source_counts.items():
    print(f"  {s}: {c}")

# Save unified corpus
with open(corpus_path, "w") as f:
    for d in all_docs:
        f.write(json.dumps(d, ensure_ascii=False) + "\n")

print(f"\n Unified corpus saved to: {corpus_path}")


Found JSONL files:
  - pdf_processed.jsonl
  - top10stocks_processed.jsonl
  - r_AskEconomics_processed.jsonl
  - r_Economics_processed.jsonl
  - r_investing_processed.jsonl
  - r_wallstreetbets_processed.jsonl
  - r_stocks_processed.jsonl
  - r_StockMarket_processed.jsonl
Loaded  7650 docs from pdf_processed.jsonl
Loaded  2500 docs from top10stocks_processed.jsonl
Loaded 31298 docs from r_AskEconomics_processed.jsonl
Loaded 38358 docs from r_Economics_processed.jsonl
Loaded 32303 docs from r_investing_processed.jsonl
Loaded 119351 docs from r_wallstreetbets_processed.jsonl
Loaded 42758 docs from r_stocks_processed.jsonl
Loaded 15424 docs from r_StockMarket_processed.jsonl

Total docs from all sources: 289642

Docs per source:
  pdf: 7650
  stocks: 2500
  reddit: 279492

 Unified corpus saved to: /content/drive/MyDrive/NLP/dataset/corpus.jsonl


In [9]:
corpus_path = "/content/drive/MyDrive/NLP/dataset/corpus.jsonl"

print("Showing first 5 pretty-format documents:\n")

with open(corpus_path, "r") as f:
    for i in range(5):
        line = f.readline().strip()
        if not line:
            break
        obj = json.loads(line)
        print(f"--- Document {i+1} ---")
        print(json.dumps(obj, indent=2, ensure_ascii=False))
        print()


Showing first 5 pretty-format documents:

--- Document 1 ---
{
  "id": "pdf_book_The Intelligent Investor_chunk_001",
  "source": "pdf",
  "category": "book",
  "file_name": "The Intelligent Investor.pdf",
  "text": "THE \nINTELLIGENT\nINVESTOR\nA B O O K  O F  P R A C T I C A L  C O U N S E L\nREVISED EDITION\nB E NJAM I N G RAHAM\nUpdated with New Commentary by Jason Zweig\n\n\nTo E.M.G.\n\n\nThrough chances various, through all \nvicissitudes, we make our way. . . .\nAeneid\n\n\nContents\nEpigraph\niii\nPreface to the Fourth Edition, by Warren E. Buffett\nA Note About Benjamin Graham, by Jason Zweig\nx\nIntroduction: What This Book Expects to Accomplish\n1\nCOMMENTARY ON THE INTRODUCTION\n12\n1.\nInvestment versus Speculation: Results to Be \nExpected by the Intelligent Investor\n18\nCOMMENTARY ON CHAPTER 1\n35\n2.\nThe Investor and Inflation\n47\nCOMMENTARY ON CHAPTER 2\n58\n3.\nA Century of Stock-Market History: \nThe Level of Stock Prices in Early 1972\n65\nCOMMENTARY ON CHAPTER 

In [None]:
!mkdir -p /content/drive/MyDrive/NLP/dataset/index_bge_large

In [11]:
corpus_path = "/content/drive/MyDrive/NLP/dataset/corpus.jsonl"
index_dir   = "/content/drive/MyDrive/NLP/dataset/index_bge_large/"


In [12]:
docs = []
with open(corpus_path, "r") as f:
    for line in f:
        obj = json.loads(line)
        # Require at least 'id' and 'text'
        if "id" in obj and "text" in obj and obj["text"].strip():
            docs.append(obj)

print("Total docs loaded:", len(docs))
print("Example doc:")
print(json.dumps(docs[0], indent=2)[:1000])

Total docs loaded: 289642
Example doc:
{
  "id": "pdf_book_The Intelligent Investor_chunk_001",
  "source": "pdf",
  "category": "book",
  "file_name": "The Intelligent Investor.pdf",
  "text": "THE \nINTELLIGENT\nINVESTOR\nA B O O K  O F  P R A C T I C A L  C O U N S E L\nREVISED EDITION\nB E NJAM I N G RAHAM\nUpdated with New Commentary by Jason Zweig\n\n\nTo E.M.G.\n\n\nThrough chances various, through all \nvicissitudes, we make our way. . . .\nAeneid\n\n\nContents\nEpigraph\niii\nPreface to the Fourth Edition, by Warren E. Buffett\nA Note About Benjamin Graham, by Jason Zweig\nx\nIntroduction: What This Book Expects to Accomplish\n1\nCOMMENTARY ON THE INTRODUCTION\n12\n1.\nInvestment versus Speculation: Results to Be \nExpected by the Intelligent Investor\n18\nCOMMENTARY ON CHAPTER 1\n35\n2.\nThe Investor and Inflation\n47\nCOMMENTARY ON CHAPTER 2\n58\n3.\nA Century of Stock-Market History: \nThe Level of Stock Prices in Early 1972\n65\nCOMMENTARY ON CHAPTER 3\n80\n4.\nGeneral Por

### bge-base-en-v1.5 is a BAAI general embedding (BGE) model that transforms any given English text into a compact vector.

In [13]:
# BGE large English model
embed_model_name = "BAAI/bge-large-en-v1.5"

embed_model = SentenceTransformer(embed_model_name)
embed_model.max_seq_length = 512  # typical; we can tweak if needed

print("Embedding dim:", embed_model.get_sentence_embedding_dimension())


Embedding dim: 1024


In [None]:
os.makedirs(index_dir, exist_ok=True)

# 1. Reload docs (in case you restarted)
docs = []
with open(corpus_path, "r") as f:
    for line in f:
        obj = json.loads(line)
        if "id" in obj and "text" in obj and obj["text"].strip():
            docs.append(obj)

print("Total docs:", len(docs))

# 2. Collect texts and minimal metadata
texts = [d["text"] for d in docs]
metadatas = [
    {
        "id": d.get("id"),
        "source": d.get("source", "unknown"),
        "ticker": d.get("ticker"),
        "subreddit": d.get("subreddit") or d.get("meta", {}).get("subreddit"),
        "file_name": d.get("file_name") or d.get("meta", {}).get("file_name"),
    }
    for d in docs
]

# 3. Embed in batches to avoid OOM
batch_size = 64
emb_list = []

for i in tqdm(range(0, len(texts), batch_size)):
    batch_texts = texts[i:i+batch_size]
    batch_embs = embed_model.encode(
        batch_texts,
        batch_size=len(batch_texts),
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=True,  # BGE works best with normalized vectors
    )
    emb_list.append(batch_embs)

embeddings = np.vstack(emb_list)
print("Embeddings shape:", embeddings.shape)

# 4. Build FAISS index (L2 on normalized = cosine sim)
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)  # inner product; works with normalized embeddings
index.add(embeddings)

print("Index size:", index.ntotal)

# 5. Save FAISS index and metadata + texts
faiss_index_path = os.path.join(index_dir, "faiss_index_bge_large.bin")
meta_path        = os.path.join(index_dir, "metadata.jsonl")

faiss.write_index(index, faiss_index_path)
print("Saved FAISS index to:", faiss_index_path)

with open(meta_path, "w") as f:
    for doc, meta, text in zip(docs, metadatas, texts):
        record = {
            "id": meta["id"],
            "source": meta["source"],
            "ticker": meta["ticker"],
            "subreddit": meta["subreddit"],
            "file_name": meta["file_name"],
            "text": text,
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print("Saved metadata to:", meta_path)


Total docs: 289642


100%|██████████| 4526/4526 [1:40:15<00:00,  1.33s/it]


Embeddings shape: (289642, 1024)
Index size: 289642
Saved FAISS index to: /content/drive/MyDrive/NLP/dataset/index_bge_large/faiss_index_bge_large.bin
Saved metadata to: /content/drive/MyDrive/NLP/dataset/index_bge_large/metadata.jsonl


In [None]:
# Reload index & metadata (for later sessions too)
index = faiss.read_index(faiss_index_path)

meta_records = []
with open(meta_path, "r") as f:
    for line in f:
        meta_records.append(json.loads(line))

def search(query, k=5):
    q_emb = embed_model.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    D, I = index.search(q_emb, k)  # distances, indices
    results = []
    for score, idx in zip(D[0], I[0]):
        rec = meta_records[idx]
        results.append({
            "score": float(score),
            "source": rec.get("source"),
            "id": rec.get("id"),
            "ticker": rec.get("ticker"),
            "subreddit": rec.get("subreddit"),
            "file_name": rec.get("file_name"),
            "text": rec.get("text")[:400] + "..."
        })
    return results

# Try a test query
query = "What are long-term investment strategies for tech stocks?"
hits = search(query, k=3)
for i, h in enumerate(hits, 1):
    print(f"\n=== Hit {i} (score={h['score']:.4f}, source={h['source']}) ===")
    print(h["text"])



=== Hit 1 (score=0.7589, source=pdf) ===
Facebook, and many other tech (and some non-tech) stocks.
If you buy the QQQ and hold it for the long-term, you will
be able to proﬁt from the long-term growth of the tech
industry.
You've probably also heard of indexing. It consists of buying
an index (usually using an ETF like the SPY or QQQ), and
holding it for the long-term. Indexing is a form of "passive
investing." Passive investing refers t...

=== Hit 2 (score=0.7362, source=reddit) ===
[POST] Would you rather hodl AMAT or LRCX long term?
URL: https://www.reddit.com/r/investing/comments/1n091r0/would_you_rather_hodl_amat_or_lrcx_long_term/

Basically title. Close to 37% of net worth in these two. 

Considering diversifying due to holding a lot of portfolio weight in these two companies. But they’ve been doing well the last 5-10..

Sell one or the other?

Or do you advise to maybe...

=== Hit 3 (score=0.7258, source=reddit) ===
[POST] How Do Indian Investors Pick Stocks for Long-Term Inv

# Phase 4: Implement Multi-LLM RAG Inference

In [14]:
corpus_path      = "/content/drive/MyDrive/NLP/dataset/corpus.jsonl"
index_dir        = "/content/drive/MyDrive/NLP/dataset/index_bge_large/"
faiss_index_path = index_dir + "faiss_index_bge_large.bin"
meta_path        = index_dir + "metadata.jsonl"


In [15]:
# Paths
index_dir        = "/content/drive/MyDrive/NLP/dataset/index_bge_large/"
faiss_index_path = os.path.join(index_dir, "faiss_index_bge_large.bin")
meta_path        = os.path.join(index_dir, "metadata.jsonl")

# Load FAISS index
index = faiss.read_index(faiss_index_path)
print("FAISS index loaded. Size:", index.ntotal)

# Load metadata (one record per vector)
meta_records = []
with open(meta_path, "r") as f:
    for line in f:
        meta_records.append(json.loads(line))

print("Metadata records:", len(meta_records))

# Load BGE-large embedding model
embed_model_name = "BAAI/bge-large-en-v1.5"
embed_model = SentenceTransformer(embed_model_name)
embed_model.max_seq_length = 512

print("Embedding dim:", embed_model.get_sentence_embedding_dimension())


FAISS index loaded. Size: 289642
Metadata records: 289642
Embedding dim: 1024


In [16]:
def retrieve(query: str, k: int = 5):
    """
    Given a natural language query, return top-k retrieved docs from FAISS index.
    """
    q_emb = embed_model.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True,  # important for IP/cosine
    )

    D, I = index.search(q_emb, k)  # distances, indices
    results = []
    for score, idx in zip(D[0], I[0]):
        rec = meta_records[idx]
        results.append({
            "score": float(score),
            "source": rec.get("source", "unknown"),
            "id": rec.get("id"),
            "ticker": rec.get("ticker"),
            "subreddit": rec.get("subreddit"),
            "file_name": rec.get("file_name"),
            "text": rec.get("text")
        })
    return results

# Quick sanity-check
hits = retrieve("What are good long-term investing principles?", k=3)
for i, h in enumerate(hits, 1):
    print(f"\n=== Hit {i} (score={h['score']:.4f}, source={h['source']}) ===")
    print(h["text"][:400], "...")



=== Hit 1 (score=0.7313, source=pdf) ===
run by strong management.
2.
Limit yourself to the number of companies you can truly
understand. Ten to twenty is good, more than twenty is
asking for trouble.
3.
Pick the very best of your good companies, and put the
bulk of your investment there.
4.
Think long-term: five to ten years, minimum.
5.
Volatility happens. Carry on. ...

=== Hit 2 (score=0.7173, source=pdf) ===
traded T-bonds heavily from the long side. There is no question that that was my best trade and 
longest trend ever. 
What are the elements of good trading? 
The most important thing is to have a method for staying with your winners and getting rid of your 
losers. 
What do you do to make sure that you stay with a winning position to exploit the longer-term 
trend? How do you avoid the temptation  ...

=== Hit 3 (score=0.7172, source=pdf) ===
broader than it is currently. I would continually learn the basic principles
of sound investing which are Ben Graham’s, affected in a s

In [17]:
def build_rag_prompt(query: str, retrieved_docs, max_docs: int = 5) -> str:
    """
    Build a RAG-style prompt from the user question + retrieved context.
    """
    # Limit number of docs (even if k is larger)
    retrieved_docs = retrieved_docs[:max_docs]

    context_blocks = []
    for i, doc in enumerate(retrieved_docs, 1):
        header = f"[Document {i} | source={doc.get('source','?')}"
        if doc.get("ticker"):
            header += f", ticker={doc['ticker']}"
        if doc.get("subreddit"):
            header += f", subreddit={doc['subreddit']}"
        if doc.get("file_name"):
            header += f", file={doc['file_name']}"
        header += "]"
        context_blocks.append(header + "\n" + doc["text"])

    context = "\n\n".join(context_blocks)

    system_msg = (
        "You are an AI assistant that answers questions about stock markets, "
        "investing strategies, and financial concepts.\n"
        "Use ONLY the information in the provided context when possible.\n"
        "If the context is insufficient, say you are not certain instead of guessing.\n"
        "Do NOT give personalized financial advice; respond in general, educational terms."
    )

    prompt = (
        f"<system>\n{system_msg}\n</system>\n\n"
        f"<context>\n{context}\n</context>\n\n"
        f"<question>\n{query}\n</question>\n\n"
        f"Now provide a concise, well-structured answer based on the context."
    )

    return prompt


In [18]:
def load_hf_llm(model_id: str):
    """
    Load a HuggingFace causal LM + tokenizer and wrap in a generation pipeline.

    Uses accelerate (device_map='auto') when GPU is available,
    and plain CPU pipeline otherwise.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Loading model: {model_id} on {device} ...")

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    if torch.cuda.is_available():
        # Let accelerate decide device placement
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
        # DO NOT pass 'device=' here when using accelerate
        gen_pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
        )
    else:
        # CPU fallback
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float32,
        )
        gen_pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device=-1,  # CPU
        )

    return gen_pipe


`HuggingFace token:` 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



In [19]:
!hf auth login
#!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `hf auth whoami` to get more information or `hf auth logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineGrained).
The tok

In [20]:
MISTRAL_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
PHI_MODEL_ID ="microsoft/Phi-3-mini-4k-instruct"
LLAMA_MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# GEMMA_MODEL_ID    = "google/gemma-2-2b-it"

In [27]:
def rag_answer(query: str, gen_pipe, k: int = 5, max_new_tokens: int = 384):
    # 1. Retrieve context
    retrieved_docs = retrieve(query, k=k)

    # 2. Build RAG prompt
    prompt = build_rag_prompt(query, retrieved_docs, max_docs=k)

    # 3. Generate with the given model
    out = gen_pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
        eos_token_id=gen_pipe.tokenizer.eos_token_id,
    )[0]["generated_text"]

    return {
        "prompt": prompt,
        "retrieved_docs": retrieved_docs,
        "output": out,
    }


## Mistral

In [None]:
mistral_pipe = load_hf_llm(MISTRAL_MODEL_ID)
# phi_pipe = load_hf_llm(PHI_MODEL_ID)
# llama_pipe   = load_hf_llm(LLAMA_MODEL_ID)

Loading model: mistralai/Mistral-7B-Instruct-v0.3 on cuda ...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


In [None]:
user_query = "What are the main risks of investing heavily in a single tech stock like NVDA?"

# print(">>> Running RAG with Llama...")
# llama_res = rag_answer(user_query, llama_pipe, k=5)

print("\n>>> Running RAG with Mistral...")
mistral_res = rag_answer(user_query, mistral_pipe, k=5)

# print("\n>>> Running RAG with Phi...")
# phi_res = rag_answer(user_query, phi_pipe, k=5)

# Print just the answers (you can also inspect prompt/retrieved docs)
def pretty_print_result(name, res):
    print(f"\n==================== {name} ====================")
    print(res["output"])

# pretty_print_result("LLAMA", llama_res)
pretty_print_result("MISTRAL", mistral_res)
# pretty_print_result("PHI",   phi_res)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



>>> Running RAG with Mistral...

<system>
You are an AI assistant that answers questions about stock markets, investing strategies, and financial concepts.
Use ONLY the information in the provided context when possible.
If the context is insufficient, say you are not certain instead of guessing.
Do NOT give personalized financial advice; respond in general, educational terms.
</system>

<context>
[Document 1 | source=reddit, subreddit=investing]
[POST] 19, university student seeking advice
URL: https://www.reddit.com/r/investing/comments/1myj77x/19_university_student_seeking_advice/

Hey guys just a quick question,

I’m a uni student and I’ve been holding a few Nvidia shares that I picked up during the crash and they’ve done really well for me. Right now, I can afford to buy about one NVDA share a month, and I’m keen to keep going since I genuinely believe in the company. That said, I’m not sure if I should start diversifying instead. What do you guys think?


[COMMENTS]
- Eggs belong

## PHI

In [29]:
phi_pipe = load_hf_llm(PHI_MODEL_ID)
user_query = "What are the main risks of investing heavily in a single tech stock like NVDA?"

# print(">>> Running RAG with Llama...")
# llama_res = rag_answer(user_query, llama_pipe, k=5)

# print("\n>>> Running RAG with Mistral...")
# mistral_res = rag_answer(user_query, mistral_pipe, k=5)

print("\n>>> Running RAG with Phi...")
phi_res = rag_answer(user_query, phi_pipe, k=5)

# Print just the answers (you can also inspect prompt/retrieved docs)
def pretty_print_result(name, res):
    print(f"\n==================== {name} ====================")
    print(res["output"])

# pretty_print_result("LLAMA", llama_res)
# pretty_print_result("MISTRAL", mistral_res)
pretty_print_result("PHI",   phi_res)


Loading model: microsoft/Phi-3-mini-4k-instruct on cuda ...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0



>>> Running RAG with Phi...


This is a friendly reminder - the current text generation call has exceeded the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.



<system>
You are an AI assistant that answers questions about stock markets, investing strategies, and financial concepts.
Use ONLY the information in the provided context when possible.
If the context is insufficient, say you are not certain instead of guessing.
Do NOT give personalized financial advice; respond in general, educational terms.
</system>

<context>
[Document 1 | source=reddit, subreddit=investing]
[POST] 19, university student seeking advice
URL: https://www.reddit.com/r/investing/comments/1myj77x/19_university_student_seeking_advice/

Hey guys just a quick question,

I’m a uni student and I’ve been holding a few Nvidia shares that I picked up during the crash and they’ve done really well for me. Right now, I can afford to buy about one NVDA share a month, and I’m keen to keep going since I genuinely believe in the company. That said, I’m not sure if I should start diversifying instead. What do you guys think?


[COMMENTS]
- Eggs belong in multiple baskets.
- You curre

## LLAMA

In [None]:
llama_pipe   = load_hf_llm(LLAMA_MODEL_ID)
user_query = "What are the main risks of investing heavily in a single tech stock like NVDA?"

print(">>> Running RAG with Llama...")
llama_res = rag_answer(user_query, llama_pipe, k=5)

# print("\n>>> Running RAG with Mistral...")
# mistral_res = rag_answer(user_query, mistral_pipe, k=5)

# print("\n>>> Running RAG with Phi...")
# phi_res = rag_answer(user_query, phi_pipe, k=5)

# Print just the answers (you can also inspect prompt/retrieved docs)
def pretty_print_result(name, res):
    print(f"\n==================== {name} ====================")
    print(res["output"])

# pretty_print_result("LLAMA", llama_res)
pretty_print_result("MISTRAL", mistral_res)
# pretty_print_result("PHI",   phi_res)


Loading model: meta-llama/Meta-Llama-3.1-8B-Instruct on cuda ...




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


>>> Running RAG with Llama...

<system>
You are an AI assistant that answers questions about stock markets, investing strategies, and financial concepts.
Use ONLY the information in the provided context when possible.
If the context is insufficient, say you are not certain instead of guessing.
Do NOT give personalized financial advice; respond in general, educational terms.
</system>

<context>
[Document 1 | source=reddit, subreddit=investing]
[POST] 19, university student seeking advice
URL: https://www.reddit.com/r/investing/comments/1myj77x/19_university_student_seeking_advice/

Hey guys just a quick question,

I’m a uni student and I’ve been holding a few Nvidia shares that I picked up during the crash and they’ve done really well for me. Right now, I can afford to buy about one NVDA share a month, and I’m keen to keep going since I genuinely believe in the company. That said, I’m not sure if I should start diversifying instead. What do you guys think?


[COMMENTS]
- Eggs belong in

# Phase 5: Evaluation

In [31]:
EVAL_QUESTIONS = [
    {
        "id": "q1",
        "category": "concept",
        "question": "What is dollar-cost averaging and why do investors use it?",
        "reference_answer": "Dollar-cost averaging means investing a fixed amount at regular intervals regardless of price. It reduces timing risk, smooths entry points, and helps avoid emotional decision making."
    },
    {
        "id": "q2",
        "category": "risk",
        "question": "What are the main risks of investing heavily in a single stock like NVDA over the long term?",
        "reference_answer": "The main risks include company-specific risk, industry cyclicality, valuation compression, potential regulation of AI chips, and lack of diversification."
    },

    # ----------------------------- #
    #        STRATEGY / CONCEPT     #
    # ----------------------------- #
    {
        "id": "q3",
        "category": "concept",
        "question": "What is the difference between growth investing and value investing?",
        "reference_answer": "Growth investing focuses on companies with high expected earnings expansion, while value investing targets stocks priced below their intrinsic value based on fundamentals."
    },
    {
        "id": "q4",
        "category": "concept",
        "question": "How does diversification reduce portfolio risk?",
        "reference_answer": "Diversification spreads exposure across different assets so that poor performance in one does not significantly affect the entire portfolio."
    },
    {
        "id": "q5",
        "category": "concept",
        "question": "What does it mean for a stock to be overvalued or undervalued?",
        "reference_answer": "A stock is overvalued when its price exceeds its intrinsic value based on fundamentals, and undervalued when priced below the value indicated by earnings, cash flow, or assets."
    },

    # ----------------------------- #
    #        MARKET BEHAVIOR        #
    # ----------------------------- #
    {
        "id": "q6",
        "category": "market",
        "question": "How do market corrections typically affect long-term investors?",
        "reference_answer": "Corrections temporarily reduce portfolio value but historically provide buying opportunities and have little long-term impact if investors stay disciplined."
    },
    {
        "id": "q7",
        "category": "market",
        "question": "What are common indicators that investors use to evaluate market sentiment?",
        "reference_answer": "Common indicators include volatility indexes, trading volume, put-call ratios, moving averages, and qualitative sentiment from news and social media."
    },

    # ----------------------------- #
    #     STOCK HISTORICAL TRENDS   #
    # ----------------------------- #
    {
        "id": "q8",
        "category": "historical",
        "question": "Based on historical price trends, what factors often cause sharp movements in large-cap tech stocks like MSFT or AAPL?",
        "reference_answer": "Major drivers include earnings reports, product launches, macroeconomic news, interest rate shifts, and sector-wide sentiment changes."
    },
    {
        "id": "q9",
        "category": "historical",
        "question": "How should an investor interpret volume spikes in a stock's trading data?",
        "reference_answer": "Volume spikes often indicate strong buying or selling conviction and can signal trend reversals or confirmation depending on price movement."
    },

    # ----------------------------- #
    #          RISK ANALYSIS        #
    # ----------------------------- #
    {
        "id": "q10",
        "category": "risk",
        "question": "Why is relying solely on short-term news articles a risky basis for investment decisions?",
        "reference_answer": "Short-term news can be emotional, incomplete, or speculative and may not reflect a company's long-term fundamentals."
    },
    {
        "id": "q11",
        "category": "risk",
        "question": "What risks do investors face during periods of high interest rates?",
        "reference_answer": "High interest rates increase borrowing costs, reduce corporate earnings, slow economic growth, and typically reduce valuations for growth stocks."
    },

    # ----------------------------- #
    #        FORECASTING-STYLE      #
    # ----------------------------- #
    {
        "id": "q12",
        "category": "forecasting",
        "question": "What factors influence the future price movement of a stock like AAPL?",
        "reference_answer": "Key factors include revenue growth, margin trends, product demand, competitive pressure, macro conditions, and investor sentiment. Future prices cannot be guaranteed."
    },
    {
        "id": "q13",
        "category": "forecasting",
        "question": "Based on historical patterns, what general trends are observable in NVDA price movements?",
        "reference_answer": "NVDA has shown strong upward momentum driven by AI demand cycles, but also periods of high volatility due to valuation resets and chip market cyclicality."
    },
    {
        "id": "q14",
        "category": "forecasting",
        "question": "What does historical data suggest about expected volatility in TSLA?",
        "reference_answer": "TSLA historically demonstrates high volatility driven by sentiment, earnings surprises, production metrics, and analyst expectations."
    },
    {
        "id": "q15",
        "category": "forecasting",
        "question": "Why is it important not to rely on deterministic predictions for future stock prices?",
        "reference_answer": "Markets reflect unpredictable factors such as macro shocks, regulation, earnings surprises, and investor psychology, making exact forecasts unreliable."
    },

    # ----------------------------- #
    #       INVEST / SHOULD I        #
    # ----------------------------- #
    {
        "id": "q16",
        "category": "invest_or_not",
        "question": "Should I invest in NVDA right now?",
        "reference_answer": "Investing decisions depend on risk tolerance, diversification, valuation, and long-term goals. NVDA has strong growth drivers but also high volatility."
    },
    {
        "id": "q17",
        "category": "invest_or_not",
        "question": "Is MSFT a good investment at current prices?",
        "reference_answer": "MSFT is financially strong with predictable cash flows, but valuation and market conditions should be reviewed before making any decisions."
    },
    {
        "id": "q18",
        "category": "invest_or_not",
        "question": "Should an investor buy AMZN based on recent price trends?",
        "reference_answer": "Recent trends show growth tied to cloud services and retail, but investors should also consider competition, valuation, and risk tolerance."
    },
    {
        "id": "q19",
        "category": "invest_or_not",
        "question": "Is PYPL an attractive long-term investment?",
        "reference_answer": "PYPL offers digital payment exposure but faces competitive pressure and margin uncertainty. Investors should evaluate fundamentals and diversification needs."
    },

    # ----------------------------- #
    #         REDDIT ANALYSIS        #
    # ----------------------------- #
    {
        "id": "q20",
        "category": "reddit_sentiment",
        "question": "How reliable is social media sentiment on platforms like Reddit for making investment decisions?",
        "reference_answer": "Reddit sentiment can capture community hype or fear but is often biased, emotionally driven, and not a substitute for financial analysis."
    },
    {
        "id": "q21",
        "category": "reddit_sentiment",
        "question": "What patterns in Reddit comments indicate strong bullish sentiment?",
        "reference_answer": "Bullish sentiment appears as high upvotes, positive language, repeated mentions of momentum, and community agreement on growth narratives."
    },

    # ----------------------------- #
    #         PORTFOLIO SKILLS       #
    # ----------------------------- #
    {
        "id": "q22",
        "category": "portfolio",
        "question": "How should investors think about balancing risk and reward in a long-term portfolio?",
        "reference_answer": "Balancing involves mixing asset classes, diversifying sectors, managing allocation based on risk tolerance, and periodically rebalancing."
    },
    {
        "id": "q23",
        "category": "portfolio",
        "question": "What role do low-cost index funds play in a long-term investment strategy?",
        "reference_answer": "Low-cost index funds provide broad diversification, low fees, and strong long-term performance compared to most active strategies."
    },

    # ----------------------------- #
    #            MACRO               #
    # ----------------------------- #
    {
        "id": "q24",
        "category": "macro",
        "question": "How do inflation and interest rates typically affect stock prices?",
        "reference_answer": "Higher inflation and rates generally compress valuations and slow economic growth, while lower rates tend to support higher stock prices."
    }
]


In [24]:
def eval_model_on_questions(
    model_name: str,
    gen_pipe,
    questions,
    k: int = 5,
    out_path: str = "/content/drive/MyDrive/NLP/eval/eval_results.jsonl"
):
    """
    Run RAG evaluation for a given model over a list of questions.
    Saves one JSONL line per (question, model) with:
      - model_name
      - question_id, question_text, category
      - answer
      - retrieval metadata
      - latency
    """
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    with open(out_path, "w") as f:
        for q in questions:
            qid   = q["id"]
            qtext = q["question"]
            qcat  = q.get("category")
            ref   = q.get("reference_answer")

            t0 = time.time()
            # rag_answer(query, gen_pipe, k)
            res = rag_answer(qtext, gen_pipe, k=k)
            latency = time.time() - t0

            # res should contain: "prompt", "retrieved_docs", "output"
            answer = res["output"]
            retrieved_docs = res["retrieved_docs"]

            record = {
                "model": model_name,
                "question_id": qid,
                "category": qcat,
                "question": qtext,
                "reference_answer": ref,
                "answer": answer,
                "latency_sec": latency,
                "retrieved": [
                    {
                        "score": d.get("score"),
                        "source": d.get("source"),
                        "ticker": d.get("ticker"),
                        "subreddit": d.get("subreddit"),
                        "file_name": d.get("file_name"),
                    }
                    for d in retrieved_docs
                ],
            }

            f.write(json.dumps(record, ensure_ascii=False) + "\n")
            print(f"[{model_name}] Done {qid} in {latency:.2f}s")

    print(f"Saved eval results for {model_name} to: {out_path}")


## PHI for answering predefined question

In [32]:
phi_out = "/content/drive/MyDrive/NLP/eval/phi3_eval.jsonl"
eval_model_on_questions("phi-3-mini-4k-instruct", phi_pipe, EVAL_QUESTIONS, k=5, out_path=phi_out)

[phi-3-mini-4k-instruct] Done q1 in 17.87s
[phi-3-mini-4k-instruct] Done q2 in 17.90s
[phi-3-mini-4k-instruct] Done q3 in 15.24s
[phi-3-mini-4k-instruct] Done q4 in 15.77s


Token indices sequence length is longer than the specified maximum sequence length for this model (4528 > 4096). Running this sequence through the model will result in indexing errors


[phi-3-mini-4k-instruct] Done q5 in 15.17s
[phi-3-mini-4k-instruct] Done q6 in 18.02s
[phi-3-mini-4k-instruct] Done q7 in 17.60s
[phi-3-mini-4k-instruct] Done q8 in 17.64s


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[phi-3-mini-4k-instruct] Done q9 in 17.57s
[phi-3-mini-4k-instruct] Done q10 in 16.16s
[phi-3-mini-4k-instruct] Done q11 in 15.78s
[phi-3-mini-4k-instruct] Done q12 in 17.65s
[phi-3-mini-4k-instruct] Done q13 in 17.23s
[phi-3-mini-4k-instruct] Done q14 in 17.80s
[phi-3-mini-4k-instruct] Done q15 in 15.20s
[phi-3-mini-4k-instruct] Done q16 in 9.83s
[phi-3-mini-4k-instruct] Done q17 in 19.03s
[phi-3-mini-4k-instruct] Done q18 in 15.36s
[phi-3-mini-4k-instruct] Done q19 in 15.27s
[phi-3-mini-4k-instruct] Done q20 in 16.41s
[phi-3-mini-4k-instruct] Done q21 in 15.16s
[phi-3-mini-4k-instruct] Done q22 in 15.03s
[phi-3-mini-4k-instruct] Done q23 in 15.12s
[phi-3-mini-4k-instruct] Done q24 in 17.83s
Saved eval results for phi-3-mini-4k-instruct to: /content/drive/MyDrive/NLP/eval/phi3_eval.jsonl


In [33]:
# mistral_df = pd.read_json(mistral_out, lines=True)
phi_df     = pd.read_json(phi_out, lines=True)

phi_df.head()


Unnamed: 0,model,question_id,category,question,reference_answer,answer,latency_sec,retrieved
0,phi-3-mini-4k-instruct,q1,concept,What is dollar-cost averaging and why do inves...,Dollar-cost averaging means investing a fixed ...,<system>\nYou are an AI assistant that answers...,17.873575,"[{'score': 0.8252652883529661, 'source': 'pdf'..."
1,phi-3-mini-4k-instruct,q2,risk,What are the main risks of investing heavily i...,"The main risks include company-specific risk, ...",<system>\nYou are an AI assistant that answers...,17.904859,"[{'score': 0.7251821756362911, 'source': 'redd..."
2,phi-3-mini-4k-instruct,q3,concept,What is the difference between growth investin...,Growth investing focuses on companies with hig...,<system>\nYou are an AI assistant that answers...,15.235076,"[{'score': 0.8625766038894651, 'source': 'pdf'..."
3,phi-3-mini-4k-instruct,q4,concept,How does diversification reduce portfolio risk?,Diversification spreads exposure across differ...,<system>\nYou are an AI assistant that answers...,15.765523,"[{'score': 0.79614782333374, 'source': 'pdf', ..."
4,phi-3-mini-4k-instruct,q5,concept,What does it mean for a stock to be overvalued...,A stock is overvalued when its price exceeds i...,<system>\nYou are an AI assistant that answers...,15.17047,"[{'score': 0.751691877841949, 'source': 'reddi..."


In [34]:
print("Phi-3 avg latency:", phi_df["latency_sec"].mean())


Phi-3 avg latency: 16.31786240140597


In [55]:
# Use a small embedding model for evaluation
eval_embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def semantic_similarity(a, b):
    if not a or not b:
        return None
    emb = eval_embed_model.encode([a, b], convert_to_numpy=True, normalize_embeddings=True)
    return float(np.dot(emb[0], emb[1]))  # cosine since normalized

def add_similarity_column(df):
    sims = []
    for _, row in df.iterrows():
        ref = row.get("reference_answer")
        ans = row.get("answer")
        sims.append(semantic_similarity(ref, ans))
    df["semantic_sim_ref"] = sims
    return df

# mistral_df = add_similarity_column(mistral_df)
phi_df     = add_similarity_column(phi_df)

# print("Mistral avg semantic similarity:", np.nanmean(mistral_df["semantic_sim_ref"]))
print("Phi avg semantic similarity:", np.nanmean(phi_df["semantic_sim_ref"]))


Phi avg semantic similarity: 0.4764531559000413


In [36]:
# Did retrieval actually use the right source?

def source_stats(df):
    all_sources = []
    for _, row in df.iterrows():
        for r in row["retrieved"]:
            all_sources.append(r.get("source", "unknown"))
    return pd.Series(all_sources).value_counts()

# print("Mistral retrieval sources:")
# print(source_stats(mistral_df))

print("\nPhi retrieval sources:")
print(source_stats(phi_df))



Phi retrieval sources:
pdf       60
reddit    53
stocks     7
Name: count, dtype: int64


## MISTRAL for answering predefined question

In [46]:
mistral_pipe = load_hf_llm(MISTRAL_MODEL_ID)
mistral_out = "/content/drive/MyDrive/NLP/eval/mistral_eval.jsonl"
eval_model_on_questions("Mistral-7B-Instruct-v0.3", mistral_pipe, EVAL_QUESTIONS, k=5, out_path=mistral_out)

Loading model: mistralai/Mistral-7B-Instruct-v0.3 on cuda ...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q1 in 8.09s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q2 in 13.53s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q3 in 4.92s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q4 in 6.54s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q5 in 5.16s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q6 in 13.30s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q7 in 9.71s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q8 in 3.97s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q9 in 3.48s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q10 in 6.53s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q11 in 6.98s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q12 in 12.57s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q13 in 3.31s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q14 in 4.24s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q15 in 3.92s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q16 in 3.71s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q17 in 6.29s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q18 in 5.28s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q19 in 6.17s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q20 in 8.67s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q21 in 9.37s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q22 in 4.91s


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Mistral-7B-Instruct-v0.3] Done q23 in 4.72s
[Mistral-7B-Instruct-v0.3] Done q24 in 8.14s
Saved eval results for Mistral-7B-Instruct-v0.3 to: /content/drive/MyDrive/NLP/eval/mistral_eval.jsonl


In [47]:
mistral_df = pd.read_json(mistral_out, lines=True)
mistral_df.head()

Unnamed: 0,model,question_id,category,question,reference_answer,answer,latency_sec,retrieved
0,Mistral-7B-Instruct-v0.3,q1,concept,What is dollar-cost averaging and why do inves...,Dollar-cost averaging means investing a fixed ...,<system>\nYou are an AI assistant that answers...,8.089717,"[{'score': 0.8252652883529661, 'source': 'pdf'..."
1,Mistral-7B-Instruct-v0.3,q2,risk,What are the main risks of investing heavily i...,"The main risks include company-specific risk, ...",<system>\nYou are an AI assistant that answers...,13.527252,"[{'score': 0.7251821756362911, 'source': 'redd..."
2,Mistral-7B-Instruct-v0.3,q3,concept,What is the difference between growth investin...,Growth investing focuses on companies with hig...,<system>\nYou are an AI assistant that answers...,4.921662,"[{'score': 0.8625766038894651, 'source': 'pdf'..."
3,Mistral-7B-Instruct-v0.3,q4,concept,How does diversification reduce portfolio risk?,Diversification spreads exposure across differ...,<system>\nYou are an AI assistant that answers...,6.537095,"[{'score': 0.79614782333374, 'source': 'pdf', ..."
4,Mistral-7B-Instruct-v0.3,q5,concept,What does it mean for a stock to be overvalued...,A stock is overvalued when its price exceeds i...,<system>\nYou are an AI assistant that answers...,5.158865,"[{'score': 0.751691877841949, 'source': 'reddi..."


In [48]:
print("Mistral avg latency:", mistral_df["latency_sec"].mean())

Mistral avg latency: 6.812098026275635


In [49]:
# Use a small embedding model for evaluation
eval_embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def semantic_similarity(a, b):
    if not a or not b:
        return None
    emb = eval_embed_model.encode([a, b], convert_to_numpy=True, normalize_embeddings=True)
    return float(np.dot(emb[0], emb[1]))  # cosine since normalized

def add_similarity_column(df):
    sims = []
    for _, row in df.iterrows():
        ref = row.get("reference_answer")
        ans = row.get("answer")
        sims.append(semantic_similarity(ref, ans))
    df["semantic_sim_ref"] = sims
    return df

mistral_df     = add_similarity_column(mistral_df)

print("Mistral avg semantic similarity:", np.nanmean(mistral_df["semantic_sim_ref"]))


Mistral avg semantic similarity: 0.4764531559000413


In [50]:
# Did retrieval actually use the right source?

def source_stats(df):
    all_sources = []
    for _, row in df.iterrows():
        for r in row["retrieved"]:
            all_sources.append(r.get("source", "unknown"))
    return pd.Series(all_sources).value_counts()

print("Mistral retrieval sources:")
print(source_stats(mistral_df))

Mistral retrieval sources:
pdf       60
reddit    53
stocks     7
Name: count, dtype: int64


## Meta Llama for answering predefined question

In [51]:
llama_pipe  = load_hf_llm(LLAMA_MODEL_ID)
llama_out = "/content/drive/MyDrive/NLP/eval/llama_eval.jsonl"
eval_model_on_questions("Meta-Llama-3.1-8B-Instruct", llama_pipe, EVAL_QUESTIONS, k=5, out_path=llama_out)

Loading model: meta-llama/Meta-Llama-3.1-8B-Instruct on cuda ...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q1 in 14.87s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q2 in 14.76s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q3 in 11.59s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q4 in 14.69s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q5 in 14.52s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q6 in 14.89s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q7 in 14.63s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q8 in 15.08s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q9 in 14.75s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q10 in 14.62s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q11 in 14.61s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q12 in 14.69s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q13 in 14.60s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q14 in 14.73s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q15 in 14.66s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q16 in 15.06s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q17 in 14.97s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q18 in 14.58s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q19 in 14.61s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q20 in 14.95s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q21 in 14.76s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q22 in 14.76s


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Meta-Llama-3.1-8B-Instruct] Done q23 in 14.74s
[Meta-Llama-3.1-8B-Instruct] Done q24 in 14.86s
Saved eval results for Meta-Llama-3.1-8B-Instruct to: /content/drive/MyDrive/NLP/eval/llama_eval.jsonl


In [52]:
llama_df = pd.read_json(llama_out, lines=True)
llama_df.head()

Unnamed: 0,model,question_id,category,question,reference_answer,answer,latency_sec,retrieved
0,Meta-Llama-3.1-8B-Instruct,q1,concept,What is dollar-cost averaging and why do inves...,Dollar-cost averaging means investing a fixed ...,<system>\nYou are an AI assistant that answers...,14.872003,"[{'score': 0.8252652883529661, 'source': 'pdf'..."
1,Meta-Llama-3.1-8B-Instruct,q2,risk,What are the main risks of investing heavily i...,"The main risks include company-specific risk, ...",<system>\nYou are an AI assistant that answers...,14.760299,"[{'score': 0.7251821756362911, 'source': 'redd..."
2,Meta-Llama-3.1-8B-Instruct,q3,concept,What is the difference between growth investin...,Growth investing focuses on companies with hig...,<system>\nYou are an AI assistant that answers...,11.586913,"[{'score': 0.8625766038894651, 'source': 'pdf'..."
3,Meta-Llama-3.1-8B-Instruct,q4,concept,How does diversification reduce portfolio risk?,Diversification spreads exposure across differ...,<system>\nYou are an AI assistant that answers...,14.688414,"[{'score': 0.79614782333374, 'source': 'pdf', ..."
4,Meta-Llama-3.1-8B-Instruct,q5,concept,What does it mean for a stock to be overvalued...,A stock is overvalued when its price exceeds i...,<system>\nYou are an AI assistant that answers...,14.515445,"[{'score': 0.751691877841949, 'source': 'reddi..."


In [53]:
print("Llama3.1 avg latency:", llama_df["latency_sec"].mean())

Llama3.1 avg latency: 14.624101102352142


In [54]:
# Use a small embedding model for evaluation
eval_embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def semantic_similarity(a, b):
    if not a or not b:
        return None
    emb = eval_embed_model.encode([a, b], convert_to_numpy=True, normalize_embeddings=True)
    return float(np.dot(emb[0], emb[1]))  # cosine since normalized

def add_similarity_column(df):
    sims = []
    for _, row in df.iterrows():
        ref = row.get("reference_answer")
        ans = row.get("answer")
        sims.append(semantic_similarity(ref, ans))
    df["semantic_sim_ref"] = sims
    return df

llama_df     = add_similarity_column(llama_df)

# print("Mistral avg semantic similarity:", np.nanmean(mistral_df["semantic_sim_ref"]))
print("Llama avg semantic similarity:", np.nanmean(llama_df["semantic_sim_ref"]))

Llama avg semantic similarity: 0.4764531559000413


In [56]:
# Did retrieval actually use the right source?

def source_stats(df):
    all_sources = []
    for _, row in df.iterrows():
        for r in row["retrieved"]:
            all_sources.append(r.get("source", "unknown"))
    return pd.Series(all_sources).value_counts()

print("\nLlama retrieval sources:")
print(source_stats(llama_df))


Llama retrieval sources:
pdf       60
reddit    53
stocks     7
Name: count, dtype: int64


# Taking user prompt

In [22]:
MODEL_PIPES = {
    "mistral": mistral_pipe,
    "phi": phi_pipe,
    "llama": llama_pipe
}

In [68]:
def build_rag_prompt_user_prompt(query: str, retrieved_docs, max_docs: int = 5) -> str:
    """
    Build a clean RAG prompt that includes:
    - system instructions
    - retrieved context
    - user question
    - an ANSWER: placeholder
    """

    retrieved_docs = retrieved_docs[:max_docs]

    # Combine context text only (no headers, no source metadata)
    context_text = "\n\n".join([doc["text"] for doc in retrieved_docs])

    system_msg = (
        "You are an AI assistant specializing in stock markets, investing strategies, and financial concepts.\n"
        "Use ONLY the provided context to answer the user's question.\n"
        "If the answer is not found in the context, say: 'The retrieved context does not contain enough information.'\n"
        "Do NOT provide personalized financial advice.\n"
        "Provide a concise, clear, well-structured response."
    )

    prompt = f"""
SYSTEM:
{system_msg}

CONTEXT:
{context_text}

QUESTION:
{query}

ANSWER:
""".strip()

    return prompt


def rag_answer_user_prompt(query: str, gen_pipe, k: int = 5, max_new_tokens: int = 384):

    # Retrieve context
    retrieved_docs = retrieve(query, k=k)

    # Build prompt
    prompt = build_rag_prompt_user_prompt(query, retrieved_docs, max_docs=k)

    # Model generate
    full_output = gen_pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
        eos_token_id=gen_pipe.tokenizer.eos_token_id,
    )[0]["generated_text"]

    # Extract answer after "ANSWER:"
    if "ANSWER:" in full_output:
        answer = full_output.split("ANSWER:", 1)[1].strip()
    else:
        answer = full_output[len(prompt):].strip()

    return answer


def ask_rag(query: str, model_name: str = "mistral", k: int = 5):

    model_name = model_name.lower()
    if model_name not in MODEL_PIPES:
        raise ValueError(f"Unknown model '{model_name}'. Available: {list(MODEL_PIPES.keys())}")

    gen_pipe = MODEL_PIPES[model_name]

    answer = rag_answer_user_prompt(query, gen_pipe, k=k)

    print(f"\n=============================")
    print(f"   MODEL: {model_name.upper()}")
    print(f"=============================\n")

    print(answer)
    return answer


In [69]:
def interactive_rag():
    print("\nAvailable models:", ", ".join(MODEL_PIPES.keys()))
    print("Type 'quit' to exit.\n")

    while True:
        model_name = input(f"Choose a model {list(MODEL_PIPES.keys())}: ").strip().lower()

        if model_name in ["quit", "exit", "q"]:
            print("Exiting RAG chat.")
            break

        if model_name not in MODEL_PIPES:
            print("Invalid model. Try again.\n")
            continue

        question = input("Enter your question: ").strip()
        if question.lower() in ["quit", "exit", "q"]:
            print("Exiting RAG chat.")
            break

        # Call your rag_answer wrapper
        ask_rag(question, model_name=model_name)
        print("\n--------------------------------------\n")


In [70]:
interactive_rag()


Available models: mistral, phi, llama
Type 'quit' to exit.

Choose a model ['mistral', 'phi', 'llama']: phi
Enter your question: I don't know about trading. What is meant by trading?

   MODEL: PHI

Trading refers to the buying and selling of financial instruments, such as stocks, bonds, commodities, or currencies, with the aim of making a profit. It involves analyzing market trends, identifying potential investment opportunities, and executing trades based on one's trading strategy.

Trading can be done through various platforms, such as online brokerages, trading apps, or even directly through stock exchanges. Traders can choose to trade on a short-term basis, aiming to profit from price fluctuations, or on a long-term basis, aiming to benefit from the overall growth of the market.

It's important to note that trading involves risk, and it's crucial to have a solid understanding of the market, the instruments being traded, and the potential risks involved. Many traders also rely on 