In [None]:
import os
# Credential placeholders: paste the real keys between the quotes before running.
# NOTE(review): each assignment overwrites whatever value the variable already
# holds in the process environment, so leaving these as "" blanks out a key that
# was set outside the notebook. Avoid saving real keys in this cell.
os.environ["OPENAI_API_KEY"] = ""
os.environ["PINECONE_API_KEY"] = ""
os.environ["GEMINI_API_KEY"] = ""


In [None]:
!pip install pypdf pinecone openai tiktoken orjson python-dotenv google.generativeai



In [None]:
import os, uuid, orjson
from typing import List, Dict, Any, Tuple
from dataclasses import dataclass

from pypdf import PdfReader
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

# tiktoken is treated as optional: if the import or the encoding lookup raises
# for any reason, ENCODER is set to None instead of failing the cell.
# NOTE(review): the next cell repeats these imports and this setup verbatim.
try:
    import tiktoken
    ENCODER = tiktoken.get_encoding("cl100k_base")
except Exception:
    ENCODER = None


In [None]:
import os, uuid, orjson
from typing import List, Dict, Any, Tuple
from dataclasses import dataclass

from pypdf import PdfReader
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

# tiktoken is treated as optional: if the import or the encoding lookup raises
# for any reason, ENCODER is set to None and num_tokens() below falls back to a
# word-count estimate.
try:
    import tiktoken
    ENCODER = tiktoken.get_encoding("cl100k_base")
except Exception:
    ENCODER = None


# -----------------------------
# Config
# -----------------------------
# Credentials and the index name come from the environment; a missing key
# resolves to "" and a missing index name to "math-index".
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_INDEX  = os.environ.get("PINECONE_INDEX", "math-index")

# Model used for embeddings (ingest_pdf, retrieve) and for chat answers (answer_query).
EMBED_MODEL = "text-embedding-3-small"
GEN_MODEL   = "gpt-3.5-turbo"

# Retrieval / chunking knobs:
#   TOP_K             - number of matches requested from the index per query
#   MIN_SIM           - matches scoring below this are dropped by retrieve()
#   CHUNK_TOKENS      - chunk_text() closes a chunk once its token count reaches this
#   CHUNK_OVERLAP     - number of trailing words carried into the next chunk
#   DEFAULT_NAMESPACE - namespace used when callers do not pass one
TOP_K = 3
MIN_SIM = 0.3
CHUNK_TOKENS = 400
CHUNK_OVERLAP = 49
DEFAULT_NAMESPACE = "math_textbook"

# Shared API clients used by every function in this cell.
client = OpenAI(api_key=OPENAI_API_KEY)
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index on first use only; an existing index is reused as-is.
# NOTE(review): dimension=1536 is presumably the vector size produced by
# EMBED_MODEL - confirm if the model is ever changed.
if PINECONE_INDEX not in pc.list_indexes().names():
    pc.create_index(
        name=PINECONE_INDEX,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
index = pc.Index(PINECONE_INDEX)


# -----------------------------
# Helpers
# -----------------------------
def num_tokens(s: str) -> int:
    """Return the number of tokens in ``s``.

    When the module-level ``ENCODER`` is available the text is encoded and the
    encoded length is returned. Otherwise the count is estimated from the
    whitespace-separated word count at 0.75 words per token.

    Args:
        s: Text to measure.

    Returns:
        The token count as an ``int``. The fallback estimate is never below 1.
    """
    if ENCODER:
        return len(ENCODER.encode(s))
    # Floor division by a float produces a float (e.g. 3 // 0.75 == 4.0), so
    # convert back to int to honour the declared return type.
    return max(1, int(len(s.split()) // 0.75))

@dataclass
class DocChunk:
    """One chunk of extracted PDF text plus the data needed to index it."""
    # Vector id; ingest_pdf builds it as "<source>-<page>-<8 hex chars>".
    id: str
    # Chunk text that is sent to the embedding model.
    text: str
    # Page number the chunk was taken from (pdf_to_pages numbers pages from 1).
    page: int
    # Metadata stored with the vector; ingest_pdf fills in source, page, year, namespace.
    meta: Dict[str, Any]


# -----------------------------
# PDF → Chunks → Pinecone
# -----------------------------
def pdf_to_pages(path: str) -> List[Tuple[int, str]]:
    """Read a PDF and return ``(page_number, text)`` for every page.

    Page numbers start at 1. Each line of the extracted text is stripped of
    surrounding whitespace and the lines are re-joined with ``"\\n"``; a page
    that yields no text is represented by an empty string.

    Args:
        path: Location of the PDF file handed to ``PdfReader``.

    Returns:
        A list with one ``(page_number, cleaned_text)`` tuple per page, in order.
    """
    def _tidy(raw):
        # A falsy extraction result (None or "") is treated as an empty page.
        stripped_lines = [ln.strip() for ln in (raw or "").splitlines()]
        return "\n".join(stripped_lines)

    document = PdfReader(path)
    return [
        (number, _tidy(page.extract_text()))
        for number, page in enumerate(document.pages, start=1)
    ]

def chunk_text(text: str, page: int, tokens: int = CHUNK_TOKENS, overlap: int = CHUNK_OVERLAP) -> List[Tuple[str,int]]:
    """Split ``text`` into overlapping, word-aligned chunks tagged with ``page``.

    Words are appended to a buffer until ``num_tokens`` of the joined buffer
    reaches ``tokens``. The buffer is then emitted as a chunk and its last
    ``overlap`` words seed the next chunk.

    Args:
        text: Text of a single page.
        page: Page number attached to every chunk that is produced.
        tokens: Token count at which the current chunk is closed.
        overlap: Number of trailing words repeated at the start of the next
            chunk. Zero or a negative value disables overlap.

    Returns:
        ``(chunk_text, page)`` tuples in reading order; an empty list when
        ``text`` contains no words.
    """
    out = []
    buf = []
    fresh = 0  # words appended since the last chunk was emitted
    for w in text.split():
        buf.append(w)
        fresh += 1
        joined = " ".join(buf)
        if num_tokens(joined) >= tokens:
            out.append((joined, page))
            # buf[-0:] is the whole list, so overlap == 0 needs an explicit
            # reset; otherwise the buffer would never shrink.
            buf = buf[-overlap:] if overlap > 0 else []
            fresh = 0
    # Emit the tail only when it holds words that no earlier chunk contains;
    # a buffer made purely of carried-over overlap would be a duplicate chunk.
    if fresh:
        out.append((" ".join(buf), page))
    return out

def ingest_pdf(path: str, source: str, namespace: str = DEFAULT_NAMESPACE, year: int = 2024) -> int:
    """Chunk a PDF, embed every chunk and upsert the vectors into the index.

    Args:
        path: PDF file to read.
        source: Label stored in each vector's metadata and used as the id prefix.
        namespace: Index namespace that receives the vectors.
        year: Value stored under ``"year"`` in each vector's metadata.

    Returns:
        The number of chunks produced from the PDF.
    """
    embed_batch = 128   # texts per embeddings request
    upsert_batch = 100  # vectors per upsert request

    # One DocChunk per (page, chunk); ids get a random suffix to stay unique.
    chunks: List[DocChunk] = [
        DocChunk(
            id=f"{source}-{page_no}-{uuid.uuid4().hex[:8]}",
            text=body,
            page=page_no,
            meta={"source": source, "page": page_no, "year": year, "namespace": namespace},
        )
        for page_index, page_text in pdf_to_pages(path)
        for body, page_no in chunk_text(page_text, page_index)
    ]

    # Embed the chunk texts batch by batch, keeping the vectors in chunk order.
    vectors = []
    for start in range(0, len(chunks), embed_batch):
        batch_texts = [chunk.text for chunk in chunks[start:start + embed_batch]]
        response = client.embeddings.create(model=EMBED_MODEL, input=batch_texts)
        vectors += [item.embedding for item in response.data]

    # The chunk text travels with the vector so retrieval can return it directly.
    records = []
    for chunk, vector in zip(chunks, vectors):
        metadata = dict(chunk.meta)
        metadata["text"] = chunk.text
        records.append({"id": chunk.id, "values": vector, "metadata": metadata})

    for start in range(0, len(records), upsert_batch):
        index.upsert(vectors=records[start:start + upsert_batch], namespace=namespace)
    return len(chunks)


# -----------------------------
# Retrieval + QA
# -----------------------------
# System message sent with every chat request (see build_messages) and reused
# as the system turn of fine-tuning examples (see build_ft_example).
SYSTEM_PROMPT = (
    "You are a helpful **math tutor assistant**.\n"
    "Rules: use ONLY provided textbook sources, explain step by step, and cite like [Source, p.Page].\n"
)
# Fixed notice returned under the "disclaimer" key of every answer_query result.
DISCLAIMER = "Educational use only. Always double-check solutions."

def retrieve(query: str, namespace: str = DEFAULT_NAMESPACE, k: int = TOP_K) -> List[Dict[str, Any]]:
    """Embed ``query`` and return the index matches that clear ``MIN_SIM``.

    Args:
        query: Question text to embed.
        namespace: Index namespace to search.
        k: Number of matches requested from the index.

    Returns:
        One dict per retained match: the match metadata plus ``"_score"`` and
        ``"_id"``. Matches scoring below ``MIN_SIM`` are left out.
    """
    embedding_response = client.embeddings.create(model=EMBED_MODEL, input=[query])
    query_vector = embedding_response.data[0].embedding
    result = index.query(vector=query_vector, top_k=k, include_metadata=True, namespace=namespace)
    return [
        {**(match.metadata or {}), "_score": match.score, "_id": match.id}
        for match in (result.matches or [])
        if getattr(match, "score", 1.0) >= MIN_SIM
    ]

def build_messages(query: str, ctx: List[Dict[str, Any]]):
    """Build the chat messages for a question and its retrieved context.

    Args:
        query: The user's question.
        ctx: Retrieved hits; each needs ``"source"``, ``"page"`` and ``"text"``.

    Returns:
        A two-item list: the system message, then a user message holding the
        question followed by the context excerpts (each capped at 1000 chars).
    """
    excerpts = []
    for hit in ctx:
        excerpts.append(f"[{hit['source']}, p.{hit['page']}]\n{hit['text'][:1000]}")
    ctx_str = "\n---\n".join(excerpts)

    user_prompt = f"Q: {query}\n\nContext:\n{ctx_str}\n\nProvide a clear step-by-step solution. Add Disclaimer line."
    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    user_message = {"role": "user", "content": user_prompt}
    return [system_message, user_message]

def answer_query(query: str, namespace: str = DEFAULT_NAMESPACE):
    """Answer ``query`` from retrieved textbook context.

    Args:
        query: The user's question.
        namespace: Index namespace to search.

    Returns:
        A dict with ``"answer"``, ``"citations"`` (source/page of every hit
        used) and ``"disclaimer"``. When retrieval returns nothing, the answer
        is a fixed "insufficient info" message and the citations are empty.
    """
    hits = retrieve(query, namespace, TOP_K)
    if hits:
        messages = build_messages(query, hits)
        completion = client.chat.completions.create(
            model=GEN_MODEL,
            messages=messages,
            temperature=0.2,
        )
        answer_text = completion.choices[0].message.content.strip()
        citations = [{"source": hit['source'], "page": hit['page']} for hit in hits]
        return {"answer": answer_text, "citations": citations, "disclaimer": DISCLAIMER}
    # Nothing cleared the similarity threshold, so the model is not called.
    return {"answer":"Insufficient info in sources.", "citations":[], "disclaimer":DISCLAIMER}


# -----------------------------
# Fine-tune dataset builder
# -----------------------------
def build_ft_example(question: str, ctx: List[Dict[str, Any]]):
    """Create one chat-format fine-tuning record for ``question``.

    The assistant turn is a template: it contains a placeholder where the
    curated solution goes, followed by citations and the disclaimer.

    Args:
        question: Text used as the user turn.
        ctx: Retrieved hits; only the first three are cited, and each needs
            ``"source"`` and ``"page"``.

    Returns:
        ``{"messages": [system, user, assistant]}``.
    """
    cited = []
    for hit in ctx[:3]:
        cited.append(f"{hit['source']}, p.{hit['page']}")
    cites = ", ".join(cited)

    answer = f"Step-by-step solution: <your curated solution>\n\nCitations: [{cites}]\nDisclaimer: {DISCLAIMER}"
    turns = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": question},
        {"role": "assistant", "content": answer},
    ]
    return {"messages": turns}

def write_jsonl(records: List[Dict], path: str):
    """Write ``records`` to ``path`` as JSON Lines.

    The file is opened in binary mode and overwritten; each record is
    serialized with ``orjson.dumps`` and followed by a newline byte.

    Args:
        records: Dicts to serialize, one per output line.
        path: Destination file.
    """
    with open(path, "wb") as sink:
        sink.writelines(orjson.dumps(record) + b"\n" for record in records)


In [None]:
import os, uuid, orjson
from typing import List, Dict, Any, Tuple
from dataclasses import dataclass
from pypdf import PdfReader
from pinecone import Pinecone, ServerlessSpec
import google.generativeai as genai

# -----------------------------
# Config
# -----------------------------
# Credentials and the index name come from the environment; a missing key
# resolves to "" and a missing index name to "math-index-gemini".
# NOTE(review): this cell rebinds names that the OpenAI cell also assigns
# (PINECONE_INDEX, EMBED_MODEL, GEN_MODEL, TOP_K, MIN_SIM, pc, index, ...), so
# whichever cell ran last decides what the shared function names operate on.
# If the PINECONE_INDEX environment variable is set, both cells resolve to the
# same index name even though they create it with different dimensions
# (1536 there, 768 here).
GEMINI_API_KEY   = os.environ.get("GEMINI_API_KEY", "")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_INDEX   = os.environ.get("PINECONE_INDEX", "math-index-gemini")

# Model used for embeddings (ingest_pdf_safe, retrieve) and for answers (answer_query).
EMBED_MODEL = "models/embedding-001"
GEN_MODEL   = "gemini-1.5-flash"

# Retrieval / chunking knobs; CHUNK_TOKENS and CHUNK_OVERLAP feed chunk_text().
TOP_K = 10            # fetch more chunks
MIN_SIM = 0.1         # lower threshold for better recall
CHUNK_TOKENS = 400
CHUNK_OVERLAP = 49
DEFAULT_NAMESPACE = "harrison21"

# PREVIEW_LEN caps the text kept in vector metadata, so the 1000-character
# slice applied later in build_messages never sees more than PREVIEW_LEN chars.
MAX_CHUNK_TEXT = 2000  # chars per chunk before embedding
PREVIEW_LEN = 500      # chars stored in metadata

genai.configure(api_key=GEMINI_API_KEY)
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index on first use only; an existing index is reused as-is.
if PINECONE_INDEX not in pc.list_indexes().names():
    pc.create_index(
        name=PINECONE_INDEX,
        dimension=768,  # Gemini embeddings are 768-d
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
index = pc.Index(PINECONE_INDEX)

# -----------------------------
# Helpers
# -----------------------------
@dataclass
class DocChunk:
    """One piece of extracted PDF text plus the data needed to index it."""
    # Vector id; ingest_pdf_safe builds it as "<source>-<page>-<8 hex chars>".
    id: str
    # Text piece that is sent to the embedding model.
    text: str
    # Page number the piece was taken from (pdf_to_pages numbers pages from 1).
    page: int
    # Metadata stored with the vector; ingest_pdf_safe fills in source, page, year, namespace.
    meta: Dict[str, Any]

def num_tokens(s: str) -> int:
    """Estimate the number of tokens in ``s`` from its word count.

    The estimate assumes 0.75 whitespace-separated words per token.

    Args:
        s: Text to measure.

    Returns:
        The estimated token count as an ``int``, never below 1.
    """
    # Floor division by a float produces a float (e.g. 3 // 0.75 == 4.0), so
    # convert back to int to honour the declared return type.
    return max(1, int(len(s.split()) // 0.75))

def pdf_to_pages(path: str) -> List[Tuple[int, str]]:
    """Extract the text of every page of a PDF.

    Args:
        path: Location of the PDF file handed to ``PdfReader``.

    Returns:
        ``(page_number, text)`` tuples in page order, numbered from 1. Every
        line of a page is stripped of surrounding whitespace and the lines are
        joined with ``"\\n"``; a page without extractable text maps to ``""``.
    """
    result = []
    page_number = 0
    for page in PdfReader(path).pages:
        page_number += 1
        # A falsy extraction result (None or "") is treated as an empty page.
        raw_text = page.extract_text() or ""
        cleaned = "\n".join(map(str.strip, raw_text.splitlines()))
        result.append((page_number, cleaned))
    return result

def chunk_text(text: str, page: int, tokens: int = CHUNK_TOKENS, overlap: int = CHUNK_OVERLAP):
    """Split ``text`` into overlapping, word-aligned chunks tagged with ``page``.

    Words are appended to a buffer until ``num_tokens`` of the joined buffer
    reaches ``tokens``. The buffer is then emitted as a chunk and its last
    ``overlap`` words seed the next chunk.

    Args:
        text: Text of a single page.
        page: Page number attached to every chunk that is produced.
        tokens: Token count at which the current chunk is closed.
        overlap: Number of trailing words repeated at the start of the next
            chunk. Zero or a negative value disables overlap.

    Returns:
        ``(chunk_text, page)`` tuples in reading order; an empty list when
        ``text`` contains no words.
    """
    out = []
    buf = []
    fresh = 0  # words appended since the last chunk was emitted
    for w in text.split():
        buf.append(w)
        fresh += 1
        joined = " ".join(buf)
        if num_tokens(joined) >= tokens:
            out.append((joined, page))
            # buf[-0:] is the whole list, so overlap == 0 needs an explicit
            # reset; otherwise the buffer would never shrink.
            buf = buf[-overlap:] if overlap > 0 else []
            fresh = 0
    # Emit the tail only when it holds words that no earlier chunk contains;
    # a buffer made purely of carried-over overlap would be a duplicate chunk.
    if fresh:
        out.append((" ".join(buf), page))
    return out

def safe_split_text(text, max_len=MAX_CHUNK_TEXT):
    """Cut ``text`` into consecutive pieces of at most ``max_len`` characters.

    Args:
        text: String to split.
        max_len: Maximum length of each piece; must be positive.

    Returns:
        The pieces in order; concatenating them gives back ``text``. Empty
        input yields an empty list.

    Raises:
        ValueError: If ``max_len`` is zero or negative. Without this check the
            split position would never advance and the loop would not end.
    """
    if max_len <= 0:
        raise ValueError(f"max_len must be a positive integer, got {max_len!r}")
    # Slicing clamps at the end of the string, so the last piece may be shorter.
    return [text[start:start + max_len] for start in range(0, len(text), max_len)]

def flatten_embedding(embed):
    """Normalize an embedding payload to a flat list of floats.

    Args:
        embed: Either a list of numbers, or a list of lists of numbers.

    Returns:
        A new list of floats. For a list of lists only the first inner list is
        converted; any further inner lists are ignored.

    Raises:
        ValueError: If ``embed`` is not a list, or mixes numbers and lists.
    """
    if not isinstance(embed, list):
        raise ValueError("Unexpected embedding shape")

    if all(isinstance(item, (float, int)) for item in embed):
        # Already flat (this also covers the empty list).
        row = embed
    elif all(isinstance(item, list) for item in embed):
        # Nested payload: keep the first row only.
        row = embed[0]
    else:
        raise ValueError("Unexpected embedding shape")
    return list(map(float, row))

# -----------------------------
# PDF → Pinecone ingestion
# -----------------------------
def _embeddings_from_response(resp, expected):
    """Return one float vector per input text from an ``embed_content`` response.

    Raises ``ValueError`` when the response has an unrecognized type or does
    not contain exactly ``expected`` vectors.
    """
    if isinstance(resp, dict) and "embedding" in resp:
        payload = resp["embedding"]
        if isinstance(payload, list) and payload and all(isinstance(row, list) for row in payload):
            # Batched request: "embedding" holds one vector per input text.
            vectors = [flatten_embedding(row) for row in payload]
        else:
            # Single flat vector.
            vectors = [flatten_embedding(payload)]
    elif isinstance(resp, list):
        vectors = [flatten_embedding(r.get("embedding")) for r in resp]
    else:
        raise ValueError(f"Unexpected embedding response type: {type(resp).__name__}")

    if len(vectors) != expected:
        raise ValueError(f"Embedding count mismatch: {len(vectors)} != {expected}")
    return vectors


def ingest_pdf_safe(path: str, source: str, namespace: str = DEFAULT_NAMESPACE, year: int = 2024):
    """Chunk a PDF, embed every piece with Gemini and upsert it into the index.

    Each page is split with ``chunk_text`` and every chunk is further cut to at
    most ``MAX_CHUNK_TEXT`` characters. Only the first ``PREVIEW_LEN``
    characters of a piece are stored in the vector metadata.

    Args:
        path: PDF file to read.
        source: Label stored in each vector's metadata and used as the id prefix.
        namespace: Index namespace that receives the vectors.
        year: Value stored under ``"year"`` in each vector's metadata.

    Returns:
        The number of pieces produced, all of which are upserted.

    Raises:
        ValueError: If the embedding service returns an unexpected payload, a
            vector count that differs from the batch size, or a vector whose
            length is not 768.
    """
    pages = pdf_to_pages(path)
    chunks: List[DocChunk] = []

    for page, txt in pages:
        for chunk_txt, p in chunk_text(txt, page, tokens=CHUNK_TOKENS, overlap=CHUNK_OVERLAP):
            for piece in safe_split_text(chunk_txt, MAX_CHUNK_TEXT):
                cid = f"{source}-{p}-{uuid.uuid4().hex[:8]}"
                chunks.append(DocChunk(
                    id=cid,
                    text=piece,
                    page=p,
                    meta={"source": source, "page": p, "year": year, "namespace": namespace}
                ))

    # embed in batches
    upserts = []
    for i in range(0, len(chunks), 128):
        batch = chunks[i:i+128]
        texts = [c.text for c in batch]

        # Documents are embedded with the retrieval-document task type to pair
        # with the RETRIEVAL_QUERY task type used when embedding queries.
        resp = genai.embed_content(
            model=EMBED_MODEL,
            content=texts,
            task_type="RETRIEVAL_DOCUMENT"
        )
        # One vector per text; a short or malformed response raises instead of
        # letting zip() silently drop the unmatched chunks.
        embeddings = _embeddings_from_response(resp, expected=len(batch))

        for c, vec in zip(batch, embeddings):
            if len(vec) != 768:
                raise ValueError(f"Embedding dimension mismatch: {len(vec)} != 768")
            upserts.append({
                "id": c.id,
                "values": vec,
                "metadata": {**c.meta, "text": c.text[:PREVIEW_LEN]}
            })

    for i in range(0, len(upserts), 100):
        index.upsert(vectors=upserts[i:i+100], namespace=namespace)

    return len(chunks)

# -----------------------------
# Retrieval + QA
# -----------------------------
# Instruction text that build_messages prepends to the user prompt and that
# build_ft_example uses as the system turn. Unlike a sources-only prompt, this
# wording lets the model answer without the textbook sources.
SYSTEM_PROMPT = (
    "You are a helpful **math tutor assistant**.\n"
    "Rules: use provided textbook sources (not required), explain step by step, and cite like [Source, p.Page] if textbook is used.\n"
)
# Fixed notice returned under the "disclaimer" key of every answer_query result.
DISCLAIMER = "Educational use only. Always double-check solutions."

def retrieve(query: str, namespace: str = DEFAULT_NAMESPACE, k: int = TOP_K) -> List[Dict[str, Any]]:
    """Embed ``query`` with Gemini and return the index matches that clear ``MIN_SIM``.

    Args:
        query: Question text to embed.
        namespace: Index namespace to search.
        k: Number of matches requested from the index.

    Returns:
        One dict per retained match: the match metadata plus ``"_score"`` and
        ``"_id"``. Matches scoring below ``MIN_SIM`` are left out.
    """
    embedding_response = genai.embed_content(
        model=EMBED_MODEL,
        content=query,
        task_type="RETRIEVAL_QUERY",
        output_dimensionality=768,
    )
    query_vector = flatten_embedding(embedding_response["embedding"])
    result = index.query(vector=query_vector, top_k=k, include_metadata=True, namespace=namespace)
    return [
        {**(match.metadata or {}), "_score": match.score, "_id": match.id}
        for match in (result.matches or [])
        if getattr(match, "score", 1.0) >= MIN_SIM
    ]

def build_messages(query: str, ctx: List[Dict[str, Any]]):
    """Build the Gemini message list for a question and its retrieved context.

    Args:
        query: The user's question.
        ctx: Retrieved hits; each needs ``"source"``, ``"page"`` and ``"text"``.

    Returns:
        A one-item list holding a single user message whose only part is the
        system prompt, a blank line, then the question and context excerpts
        (each capped at 1000 chars).
    """
    excerpts = []
    for hit in ctx:
        excerpts.append(f"[{hit['source']}, p.{hit['page']}]\n{hit['text'][:1000]}")
    ctx_str = "\n---\n".join(excerpts)

    user_prompt = f"Q: {query}\n\nContext:\n{ctx_str}\n\nProvide a clear step-by-step solution. Add Disclaimer line."
    # The instructions travel inside the user turn rather than a separate role.
    combined = SYSTEM_PROMPT + "\n\n" + user_prompt
    return [{"role": "user", "parts": [combined]}]

def answer_query(query: str, namespace: str = DEFAULT_NAMESPACE):
    """Answer ``query`` with Gemini using retrieved textbook context.

    Args:
        query: The user's question.
        namespace: Index namespace to search.

    Returns:
        A dict with ``"answer"``, ``"citations"`` (source/page of every hit
        used) and ``"disclaimer"``. When retrieval returns nothing, the answer
        is a fixed "insufficient info" message and the citations are empty.
    """
    hits = retrieve(query, namespace, TOP_K)
    if hits:
        prompt_messages = build_messages(query, hits)
        generator = genai.GenerativeModel(GEN_MODEL)
        reply = generator.generate_content(prompt_messages)
        answer_text = reply.text.strip()
        citations = [{"source": hit["source"], "page": hit["page"]} for hit in hits]
        return {"answer": answer_text, "citations": citations, "disclaimer": DISCLAIMER}
    # Nothing cleared the similarity threshold, so the model is not called.
    return {"answer": "Insufficient info in sources.", "citations": [], "disclaimer": DISCLAIMER}

# -----------------------------
# Fine-tune dataset builder
# -----------------------------
def build_ft_example(question: str, ctx: List[Dict[str, Any]]):
    """Create one chat-format fine-tuning record for ``question``.

    The assistant turn is a template: it contains a placeholder where the
    curated solution goes, followed by citations and the disclaimer.

    Args:
        question: Text used as the user turn.
        ctx: Retrieved hits; only the first three are cited, and each needs
            ``"source"`` and ``"page"``.

    Returns:
        ``{"messages": [system, user, assistant]}``.
    """
    cited = []
    for hit in ctx[:3]:
        cited.append(f"{hit['source']}, p.{hit['page']}")
    cites = ", ".join(cited)

    answer = f"Step-by-step solution: <your curated solution>\n\nCitations: [{cites}]\nDisclaimer: {DISCLAIMER}"
    turns = []
    for role, content in (("system", SYSTEM_PROMPT), ("user", question), ("assistant", answer)):
        turns.append({"role": role, "content": content})
    return {"messages": turns}

def write_jsonl(records: List[Dict], path: str):
    """Write ``records`` to ``path`` as JSON Lines.

    The file is opened in binary mode and overwritten; each record is
    serialized with ``orjson.dumps`` and followed by a newline byte.

    Args:
        records: Dicts to serialize, one per output line.
        path: Destination file.
    """
    newline = b"\n"
    with open(path, "wb") as sink:
        sink.writelines(orjson.dumps(record) + newline for record in records)

In [None]:

# 1. Ingest a PDF textbook:
# NOTE(review): the file name refers to an "Introduction to Statistics" PDF, yet
# it is indexed with source="Harrison's (21e)" in namespace "harrison21".
# Citations are built from this metadata, so they will carry that label -
# confirm the intended source and namespace.
# The path is absolute ("/content/..."), so the file must exist at that location.
n = ingest_pdf_safe("/content/01. Introduction to Statistics Autor David M. Lane.pdf", source="Harrison's (21e)", namespace="harrison21")
print("Chunks indexed", n)


Chunks indexed 974


In [None]:
# 2. Ask a question:
# Both exercises are sent together as one query string, so a single retrieval
# against the "harrison21" namespace supplies the context for all sub-questions.
result = answer_query("""Ages of students in a class:
Given the data: 19, 21, 22, 20, 24, 22, 20, 19, 21
a) Find the mean, median, and mode
b) Compute the range, variance, and standard deviation

A dataset has a mean of 75 and a standard deviation of 8.
a) What score corresponds to a z-score of 1.5?
b) Interpret the z-score in context.""", namespace="harrison21")
# Prints the raw result dict (answer, citations, disclaimer).
print(result)


{'answer': "**Disclaimer:** I am an AI chatbot and cannot provide financial, legal, or medical advice. The following is for educational purposes only.  I will do my best to answer your questions based on the provided context, but my knowledge is limited by the excerpts you've given.  I do not have access to the full textbook, so my calculations may be limited.\n\n\n**Part 1: Ages of Students**\n\na) **Mean:**\n\n1. **Sum the ages:** 19 + 21 + 22 + 20 + 24 + 22 + 20 + 19 + 21 = 188\n2. **Divide by the number of students:** 188 / 9 = 20.89 (approximately)\n\nTherefore, the mean age is approximately 20.89 years.\n\nb) **Median:**\n\n1. **Arrange the ages in ascending order:** 19, 19, 20, 20, 21, 21, 22, 22, 24\n2. **Find the middle value:** Since there are 9 ages, the median is the 5th value.\n\nTherefore, the median age is 21 years.\n\nc) **Mode:**\n\n1. **Identify the most frequent age:** Both 20 and 22 appear twice.\n\nTherefore, the dataset is bimodal with modes of 20 and 22 years.\n\