In [None]:
import subprocess, importlib.util, os
from pathlib import Path

# Environment sanity check: reports which dependencies, Ollama models and
# input PDFs are available. It only prints; nothing is installed or modified.
print("🔍 Checking environment...")

# Check major libraries (every third-party module imported by the later cells)
packages = [
    "faiss", "sentence_transformers", "pypdf", "langchain", "langchain_ollama",
    "langchain_text_splitters", "numpy",
]
for pkg in packages:
    try:
        spec = importlib.util.find_spec(pkg)
    except (ImportError, ValueError):
        # find_spec may raise rather than return None (e.g. a module already in
        # sys.modules whose __spec__ is None); report that as missing too.
        spec = None
    print(f" {pkg} - {'Installed' if spec else 'Missing'}")

# Check Ollama installation
try:
    # The timeout keeps this cell from blocking forever if the CLI hangs.
    raw_listing = subprocess.check_output(["ollama", "list"], timeout=30)
    output = raw_listing.decode("utf-8", errors="replace").strip()
    print("\nOllama installed. Available models:")
    print(output)
except (OSError, subprocess.SubprocessError) as e:
    # OSError: executable not found / not runnable.
    # SubprocessError: non-zero exit status or timeout.
    print(f" Ollama not found or not running: {e}")

# Check PDF file
data_dir = Path("data")
pdfs = list(data_dir.glob("*.pdf"))
if not pdfs:
    print("No PDF found in data/. Please add your 10-K PDF file.")
else:
    print(f"Found PDF: {pdfs[0].name}")

print("\nEnvironment check complete.")


In [None]:
from pathlib import Path
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import ChatOllama
import numpy as np
import faiss
import pickle
import os

# Input PDFs are looked up in data/; generated artifacts go to store/,
# which is created here when it does not exist yet.
DATA_DIR, STORE_DIR = Path("data"), Path("store")
STORE_DIR.mkdir(exist_ok=True)

# Take the first PDF that the directory listing yields; PDF_PATH stays None
# when data/ contains no PDF at all, which stops the notebook.
PDF_PATH = None
for PDF_PATH in DATA_DIR.glob("*.pdf"):
    break
if PDF_PATH is None:
    raise SystemExit("Please add a 10-K PDF inside 'data/' before running.")

print(f" Loaded PDF: {PDF_PATH.name}")


In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF into a single string.

    Args:
        pdf_path: Location of the PDF; it is converted with ``str`` before
            being passed to ``PdfReader``.

    Returns:
        The page texts in page order, each followed by a newline. A page whose
        extraction raises is reported (with the reason) and left out.
    """
    print("Extracting text from PDF...")
    reader = PdfReader(str(pdf_path))
    # Collect per-page strings and join once instead of growing one string
    # with += on every page.
    page_texts = []
    for i, page in enumerate(reader.pages):
        try:
            # A page that yields no text (None or "") contributes an empty line
            # rather than triggering a TypeError on None + "\n".
            page_texts.append((page.extract_text() or "") + "\n")
        except Exception as exc:
            # Best effort: one unreadable page must not abort the whole
            # document, but the reason is shown so the failure is visible.
            print(f"Skipped page {i}: {exc}")
    text = "".join(page_texts)
    print(f"Extracted {len(text)//1000}K characters of text.")
    return text

# Pull the raw text out of the selected PDF; later cells work on `text`.
text = extract_text_from_pdf(PDF_PATH)


In [None]:
def chunk_text(text, chunk_size=800, overlap=100):
    """Split ``text`` into overlapping pieces.

    Args:
        text: The string to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        The list produced by ``RecursiveCharacterTextSplitter.split_text``.
    """
    pieces = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    ).split_text(text)
    # Report how many pieces came out so the cell output shows the scale.
    print(f"Created {len(pieces)} chunks.")
    return pieces

# Split the extracted text using chunk_text's default size and overlap.
chunks = chunk_text(text)


In [None]:
print("Creating embeddings and FAISS index...")

# With no chunks (e.g. a PDF that yielded no extractable text) the steps below
# would fail with an opaque shape error, so stop here with a clear message.
if not chunks:
    raise ValueError(
        "No text chunks to embed. The PDF may contain no extractable text."
    )

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(chunks, convert_to_numpy=True, show_progress_bar=True)

# Normalize for cosine similarity: on unit-length rows the inner product used
# by IndexFlatIP equals the cosine. The epsilon avoids division by zero.
embeddings = embeddings / (np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12)

# One index row per chunk, in the same order as `chunks`.
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

# Save for reusability: the index and the chunk list are written side by side
# in STORE_DIR so positions in one correspond to positions in the other.
faiss.write_index(index, str(STORE_DIR / "faiss.index"))
with open(STORE_DIR / "chunks.pkl", "wb") as f:
    pickle.dump(chunks, f)

print(f"FAISS index built and saved with {index.ntotal} vectors.")


In [None]:
def retrieve(query, k=5):
    """Return the chunks most similar to ``query``.

    Args:
        query: Question text; it is embedded with the module-level ``model``.
        k: Maximum number of chunks to return.

    Returns:
        A list of at most ``k`` strings from ``chunks``, in the order returned
        by ``index.search``. The list is empty when ``k`` is not positive or
        the index holds no vectors.
    """
    # Never request more neighbours than the index contains: FAISS pads the
    # missing slots with label -1, and chunks[-1] would silently hand back the
    # last chunk for each of them.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    q_vec = model.encode([query], convert_to_numpy=True)
    # Same unit-length normalization as applied to the indexed vectors.
    q_vec = q_vec / (np.linalg.norm(q_vec, axis=1, keepdims=True) + 1e-12)
    _, neighbour_ids = index.search(q_vec, k)
    # Defensive filter: drop any remaining -1 placeholders.
    return [chunks[i] for i in neighbour_ids[0] if i >= 0]

# Smoke test: run one sample question through retrieve() and show the first
# 400 characters of the top-ranked chunk.
_sample_hits = retrieve("What are Tesla's main risks?")
_top_hit = _sample_hits[0]
print(_top_hit[:400])


In [None]:
# Instructions placed ahead of the retrieved context in every prompt.
SYSTEM_PROMPT = """You are an expert financial analyst.
Answer using ONLY the provided 10-K context.
If not found, reply 'Not available in the 10-K report.'
Always cite [source: company 10-K]."""

# Chat model handle: the "llama3" model via ChatOllama, temperature 0.2.
llm = ChatOllama(
    temperature=0.2,
    model="llama3",
)

def answer(query, k=5):
    """Answer ``query`` from the retrieved 10-K chunks.

    Args:
        query: The user's question.
        k: Number of chunks requested from ``retrieve``.

    Returns:
        The ``content`` of the model's reply, or the fixed fallback sentence
        named in ``SYSTEM_PROMPT`` when retrieval finds nothing.
    """
    retrieved = retrieve(query, k)
    if not retrieved:
        # Without context the prompt rules allow only this reply, so skip the
        # model call entirely.
        return "Not available in the 10-K report."

    context = "\n\n".join(retrieved)
    # Send the instructions in the system role and the context plus question in
    # the human role, instead of one undifferentiated string, so the chat model
    # can tell the rules apart from user-supplied text.
    messages = [
        ("system", SYSTEM_PROMPT),
        ("human", f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"),
    ]
    resp = llm.invoke(messages)
    return resp.content


In [None]:
# Interactive question loop. It ends on 'exit' / 'quit', on end of input, or
# when input() is interrupted.
while True:
    try:
        q = input("Ask (or type 'exit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        # No more input, or the prompt was interrupted: leave cleanly instead
        # of ending the cell with a traceback.
        print("Goodbye!")
        break
    if q.lower() in ["exit", "quit"]:
        print("Goodbye!")
        break
    if not q:
        # Blank line: ask again rather than sending an empty question.
        continue
    try:
        reply = answer(q)
    except Exception as exc:
        # One failed question (e.g. the model backend being unreachable)
        # should not end the whole session.
        print(f"Could not answer: {exc}\n")
        continue
    print("\nAnswer:\n", reply, "\n" + "-"*60 + "\n")
