In [2]:
import subprocess, importlib.util, os
from pathlib import Path

print(" Checking environment...")

# Check major libraries
packages = [
    "faiss", "sentence_transformers", "pypdf", "langchain", "langchain_ollama"
]
for pkg in packages:
    spec = importlib.util.find_spec(pkg)
    print(f" {pkg} - {'Installed' if spec else 'Missing'}")

# Check Ollama installation
try:
    output = subprocess.check_output(["ollama", "list"]).decode("utf-8").strip()
    print("\nOllama installed. Available models:")
    print(output)
except Exception as e:
    print(f" Ollama not found or not running: {e}")

# Check PDF file
data_dir = Path("data")
pdfs = list(data_dir.glob("*.pdf"))
if not pdfs:
    print("No PDF found in data/. Please add your 10-K PDF file.")
else:
    print(f"Found PDF: {pdfs[0].name}")

print("\nEnvironment check complete.")


 Checking environment...
 faiss - Installed
 sentence_transformers - Installed
 pypdf - Installed
 langchain - Installed
 langchain_ollama - Installed

Ollama installed. Available models:
NAME             ID              SIZE      MODIFIED          
llama3:latest    365c0bd3c000    4.7 GB    About an hour ago
Found PDF: tsla-20241231.pdf

Environment check complete.


In [3]:
from pathlib import Path
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import ChatOllama
import numpy as np
import faiss
import pickle
import os

DATA_DIR = Path("data")
STORE_DIR = Path("store")
STORE_DIR.mkdir(exist_ok=True)

PDF_PATH = next(DATA_DIR.glob("*.pdf"), None)
if PDF_PATH is None:
    raise SystemExit("Please add a 10-K PDF inside 'data/' before running.")

print(f" Loaded PDF: {PDF_PATH.name}")


 Loaded PDF: tsla-20241231.pdf


In [4]:
def extract_text_from_pdf(pdf_path):
    print("Extracting text from PDF...")
    reader = PdfReader(str(pdf_path))
    text = ""
    for i, page in enumerate(reader.pages):
        try:
            text += page.extract_text() + "\n"
        except Exception:
            print(f"Skipped page {i}")
    print(f"Extracted {len(text)//1000}K characters of text.")
    return text

text = extract_text_from_pdf(PDF_PATH)


Extracting text from PDF...
Extracted 399K characters of text.


In [5]:
def chunk_text(text, chunk_size=800, overlap=100):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    chunks = splitter.split_text(text)
    print(f"Created {len(chunks)} chunks.")
    return chunks

chunks = chunk_text(text)


Created 549 chunks.


In [6]:
print("Creating embeddings and FAISS index...")

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(chunks, convert_to_numpy=True, show_progress_bar=True)

# Normalize for cosine similarity
embeddings = embeddings / (np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12)

index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

# Save for reusability
faiss.write_index(index, str(STORE_DIR / "faiss.index"))
with open(STORE_DIR / "chunks.pkl", "wb") as f:
    pickle.dump(chunks, f)

print(f"FAISS index built and saved with {index.ntotal} vectors.")


Creating embeddings and FAISS index...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

FAISS index built and saved with 549 vectors.


In [7]:
def retrieve(query, k=5):
    q_vec = model.encode([query], convert_to_numpy=True)
    q_vec = q_vec / (np.linalg.norm(q_vec, axis=1, keepdims=True) + 1e-12)
    D, I = index.search(q_vec, k)
    return [chunks[i] for i in I[0]]

# quick test
print(retrieve("What are Tesla's main risks?")[0][:400])


manufacturing, marketing, sales and delivery, service, installation, technology and support personnel, especially to support our
planned high-volume product sales, market and geographical expansion and technological innovations. If we are not successful in
managing these risks, our business, financial condition and operating results may be harmed.
Employees may leave Tesla or choose other employer


In [8]:
llm = ChatOllama(model="llama3", temperature=0.2)

SYSTEM_PROMPT = """You are an expert financial analyst.
Answer using ONLY the provided 10-K context.
If not found, reply 'Not available in the 10-K report.'
Always cite [source: company 10-K]."""

def answer(query, k=5):
    retrieved = retrieve(query, k)
    context = "\n\n".join(retrieved)
    prompt = f"{SYSTEM_PROMPT}\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"
    resp = llm.invoke(prompt)
    return resp.content


In [9]:
# Simple interactive question–answer box for Jupyter / VS Code
from ipywidgets import Text, Button, VBox, Output
from IPython.display import display, Markdown, clear_output

question_box = Text(
    value='',
    placeholder='Type your question here...',
    description='Question:',
    style={'description_width': 'initial'},
    layout={'width': '70%'}
)

ask_button = Button(description='Ask', button_style='info')
exit_button = Button(description='Exit', button_style='danger')
output_area = Output()

def on_ask_clicked(b):
    q = question_box.value.strip()
    if not q:
        with output_area:
            clear_output()
            display(Markdown("Please enter a question."))
        return
    if q.lower() in ['exit', 'quit']:
        on_exit_clicked(b)
        return

    with output_area:
        clear_output(wait=True)
        display(Markdown(f"### Question: {q}"))
        display(Markdown("Processing..."))

    try:
        ans = answer(q)
        with output_area:
            clear_output(wait=True)
            display(Markdown(f"### Question: {q}"))
            display(Markdown(f"**Answer:**\n\n{ans}\n\n---"))
    except Exception as e:
        with output_area:
            clear_output(wait=True)
            display(Markdown(f"Error: {e}"))
    question_box.value = ''

def on_exit_clicked(b):
    with output_area:
        clear_output()
        display(Markdown("Session ended. You can close this cell or rerun it later."))

ask_button.on_click(on_ask_clicked)
exit_button.on_click(on_exit_clicked)

display(VBox([question_box, ask_button, exit_button, output_area]))


VBox(children=(Text(value='', description='Question:', layout=Layout(width='70%'), placeholder='Type your ques…