In [45]:
!pip -q install -U langchain-google-genai langchain langchain-community langchain-text-splitters langchain-chroma chromadb pypdf


In [49]:
import os, getpass
os.environ["OPENAI_API_KEY"] = getpass.getpass("Paste FULL OpenAI secret key (starts with sk-): ").strip()
print("Length:", len(os.environ["OPENAI_API_KEY"]), "| Starts with:", os.environ["OPENAI_API_KEY"][:3])


Paste FULL OpenAI secret key (starts with sk-): ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
Length: 164 | Starts with: sk-


In [50]:
from langchain_openai import OpenAIEmbeddings
emb = OpenAIEmbeddings(model="text-embedding-3-large")
print(len(emb.embed_query("hello")))


3072


In [51]:
from langchain_openai import OpenAIEmbeddings
import os

emb = OpenAIEmbeddings(model="text-embedding-3-large", api_key=os.environ["OPENAI_API_KEY"])
print("Embedding dim:", len(emb.embed_query("hello")))


Embedding dim: 3072


In [52]:
from langchain_chroma import Chroma

DB_DIR = "chroma_db"
COLLECTION = "docuchat_openai"

embeddings = OpenAIEmbeddings(model="text-embedding-3-large", api_key=os.environ["OPENAI_API_KEY"])

vs = Chroma(
    collection_name=COLLECTION,
    embedding_function=embeddings,
    persist_directory=DB_DIR,
)


InternalError: Database error: error returned from database: (code: 14) unable to open database file

In [53]:
import os, shutil

DB_DIR = "/content/chroma_db"   # writable in Colab

# remove old/corrupt db if it exists
if os.path.exists(DB_DIR):
    shutil.rmtree(DB_DIR)

os.makedirs(DB_DIR, exist_ok=True)
print("DB_DIR ready:", DB_DIR)


DB_DIR ready: /content/chroma_db


In [54]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
import os

COLLECTION = "docuchat_openai"

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=os.environ["OPENAI_API_KEY"]
)

vs = Chroma(
    collection_name=COLLECTION,
    embedding_function=embeddings,
    persist_directory=DB_DIR,
)
print("‚úÖ Chroma created")


‚úÖ Chroma created


In [55]:
from google.colab import drive
drive.mount("/content/drive")

DB_DIR = "/content/drive/MyDrive/docuchat/chroma_db"
import os
os.makedirs(DB_DIR, exist_ok=True)
print("DB_DIR:", DB_DIR)


Mounted at /content/drive
DB_DIR: /content/drive/MyDrive/docuchat/chroma_db


In [56]:
import os
print(os.getcwd())
print(os.listdir("/content")[:10])


/content
['.config', 'chroma_db', '.ipynb_checkpoints', 'drive', 'sample_data']


In [57]:
import os, shutil

DB_DIR = "/content/chroma_db"  # writable in Colab
if os.path.exists(DB_DIR):
    shutil.rmtree(DB_DIR)
os.makedirs(DB_DIR, exist_ok=True)

print("‚úÖ DB_DIR ready:", DB_DIR)


‚úÖ DB_DIR ready: /content/chroma_db


In [58]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
import os

COLLECTION = "docuchat_openai"

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=os.environ["OPENAI_API_KEY"]
)

vs = Chroma(
    collection_name=COLLECTION,
    embedding_function=embeddings,
    persist_directory=DB_DIR,
)

print("‚úÖ Chroma initialized")


‚úÖ Chroma initialized


In [60]:
from uuid import uuid4
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
import os, shutil

BATCH = 64
ids = [str(uuid4()) for _ in range(len(chunks))]

try:
    for i in range(0, len(chunks), BATCH):
        vs.add_documents(chunks[i:i+BATCH], ids=ids[i:i+BATCH])
        print(f"Added {min(i+BATCH, len(chunks))}/{len(chunks)}")
    print("‚úÖ Ingest done")
except Exception as e:
    if "readonly database" in str(e):
        print("üî¥ Detected 'readonly database' error. Attempting to re-initialize Chroma and retry...")
        # Ensure DB_DIR is clean and accessible before re-initialization
        if os.path.exists(DB_DIR):
            shutil.rmtree(DB_DIR)
        os.makedirs(DB_DIR, exist_ok=True)
        print(f"DB_DIR {DB_DIR} re-created for retry.")

        # Re-initialize Chroma
        embeddings = OpenAIEmbeddings(
            model="text-embedding-3-large",
            api_key=os.environ["OPENAI_API_KEY"]
        )
        vs = Chroma(
            collection_name=COLLECTION,
            embedding_function=embeddings,
            persist_directory=DB_DIR,
        )
        print("‚úÖ Chroma re-initialized. Retrying ingest...")

        # Retry the operation
        try:
            for i in range(0, len(chunks), BATCH):
                vs.add_documents(chunks[i:i+BATCH], ids=ids[i:i+BATCH])
                print(f"Added {min(i+BATCH, len(chunks))}/{len(chunks)}")
            print("‚úÖ Ingest done after retry.")
        except Exception as retry_e:
            print(f"‚ùå Ingest failed again after re-initialization: {retry_e}")
            print("üî¥ Please consider restarting the Colab runtime if the issue persists.")
    else:
        print(f"‚ùå An unexpected error occurred during ingest: {e}")

üî¥ Detected 'readonly database' error. Attempting to re-initialize Chroma and retry...
DB_DIR /content/chroma_db re-created for retry.
‚úÖ Chroma re-initialized. Retrying ingest...
‚ùå Ingest failed again after re-initialization: Error updating collection: Database error: error returned from database: (code: 1032) attempt to write a readonly database
üî¥ Please consider restarting the Colab runtime if the issue persists.


In [61]:
import gc, os, uuid, pathlib

# drop any old Chroma objects still holding the DB open
try:
    del vs
except NameError:
    pass
gc.collect()

DB_DIR = f"/content/chroma_db_{uuid.uuid4().hex[:8]}"
pathlib.Path(DB_DIR).mkdir(parents=True, exist_ok=True)

# make sure it's writable
os.chmod(DB_DIR, 0o777)

print("‚úÖ Using DB_DIR:", DB_DIR)


‚úÖ Using DB_DIR: /content/chroma_db_a85bb1fe


In [62]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
import os

COLLECTION = "docuchat_openai"

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=os.environ["OPENAI_API_KEY"]
)

vs = Chroma(
    collection_name=COLLECTION,
    embedding_function=embeddings,
    persist_directory=DB_DIR,
)

print("‚úÖ Chroma initialized")


‚úÖ Chroma initialized


In [63]:
from uuid import uuid4

BATCH = 64
ids = [str(uuid4()) for _ in range(len(chunks))]

for i in range(0, len(chunks), BATCH):
    vs.add_documents(chunks[i:i+BATCH], ids=ids[i:i+BATCH])
    print(f"Added {min(i+BATCH, len(chunks))}/{len(chunks)}")

print("‚úÖ Ingest done")


Added 64/628
Added 128/628
Added 192/628
Added 256/628
Added 320/628
Added 384/628
Added 448/628
Added 512/628
Added 576/628
Added 628/628
‚úÖ Ingest done


In [64]:
!pip -q install faiss-cpu


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m23.7/23.7 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [65]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
import os

embeddings = OpenAIEmbeddings(model="text-embedding-3-large", api_key=os.environ["OPENAI_API_KEY"])
vs = FAISS.from_documents(chunks, embeddings)   # ‚úÖ no disk DB
retriever = vs.as_retriever(search_kwargs={"k": 4})
print("‚úÖ FAISS vector store ready")


‚úÖ FAISS vector store ready


In [66]:
!pip -q install faiss-cpu


In [67]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
import os

embeddings = OpenAIEmbeddings(model="text-embedding-3-large", api_key=os.environ["OPENAI_API_KEY"])
vs = FAISS.from_documents(chunks, embeddings)
retriever = vs.as_retriever(search_kwargs={"k": 4})

print("‚úÖ FAISS ready")


‚úÖ FAISS ready


In [71]:
from pathlib import Path
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

def format_docs(docs):
    blocks = []
    for d in docs:
        src = Path(d.metadata.get("source","unknown")).name
        page = d.metadata.get("page", None)
        page_tag = f":p{int(page)+1}" if page is not None else ""
        blocks.append(f"[{src}{page_tag}]\n{d.page_content}")
    return "\n\n".join(blocks)

def format_sources(docs):
    seen, out = set(), []
    for d in docs:
        src = Path(d.metadata.get("source","unknown")).name
        page = d.metadata.get("page", None)
        tag = f"{src}:p{int(page)+1}" if page is not None else src
        if tag not in seen:
            seen.add(tag)
            out.append(tag)
    return out

llm = ChatOpenAI(model="gpt-4.1-mini", temperature=0, api_key=os.environ["OPENAI_API_KEY"])

rewrite_prompt = ChatPromptTemplate.from_messages([
    ("system", "Rewrite the latest user question into a standalone question using chat history. Return ONLY the question."),
    MessagesPlaceholder("history"),
    ("human", "{input}")
])
rewrite_chain = rewrite_prompt | llm | StrOutputParser()

answer_prompt = ChatPromptTemplate.from_messages([
    ("system", "Answer using ONLY the provided context. If not in context, say: 'I don't know based on the documents.'"),
    MessagesPlaceholder("history"),
    ("human", "Question: {question}\n\nContext:\n{context}\n\nAnswer:")
])

def ask(q, session_id="default"):
    out = chat.invoke({"input": q}, config={"configurable": {"session_id": session_id}})
    return out["output"], format_sources(out["docs"])


store = {}
def get_history(session_id: str):
    if session_id not in store:
        store[session_id] = InMemoryChatMessageHistory()
    return store[session_id]

chat = RunnableWithMessageHistory(
    rag_chain,
    get_history,
    input_messages_key="input",
    history_messages_key="history",
)

def ask(q, session_id="default"):
    out = chat.invoke({"input": q}, config={"configurable": {"session_id": session_id}})
    return out["answer"], format_sources(out["docs"])

print("‚úÖ Chat ready. Try: answer, sources = ask('Summarize my docs')")

‚úÖ Chat ready. Try: answer, sources = ask('Summarize my docs')


In [70]:
answer, sources = ask("Summarize the main topics across my PDFs.")
print(answer)
print("Sources:", sources)




I don't know based on the documents.
Sources: ['Lecture2A-Univariate Discrete Distributions-1.pdf:p10', 'Lecture2A-Univariate Discrete Distributions-1.pdf:p3', 'Lecture3A-Descriptive and Inferential statistics.pdf:p34', 'Lecture2B-Univariate Continuous Distributions.pdf:p4']


In [72]:
retriever = vs.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 12, "fetch_k": 40}
)


In [73]:
answer_prompt = ChatPromptTemplate.from_messages([
    ("system",
     "Use the provided context to answer. If the context is incomplete, say what you CAN conclude from it "
     "and what you cannot. Do NOT invent details."),
    MessagesPlaceholder("history"),
    ("human", "Question: {question}\n\nContext:\n{context}\n\nAnswer:")
])


In [75]:
def debug_retrieve(q):
    docs = retriever.invoke(q)
    for i, d in enumerate(docs, 1):
        print(f"\n--- {i}. {Path(d.metadata.get('source','?')).name} | page={d.metadata.get('page', '?')} ---")
        print(d.page_content[:400])
    return docs

_ = debug_retrieve("Summarize the main topics across my PDFs")


--- 1. Lecture3A-Descriptive and Inferential statistics.pdf | page=33 ---
Summary on Descriptive Statistics
ÔÇß Descriptive Statistics describe a sample through calculations of summary statistics such as mean, 
median, variance, standard deviation, percentiles, skewness, kurtosis etc. 
ÔÇß They provide information on the nature of the variables that represent the population.
ÔÇß The sample can also be described graphically for visual ease
ÔÇß Descriptive statistics are an attem

--- 2. Lecture2A-Univariate Discrete Distributions-1.pdf | page=9 ---
Download ClassData.csv and Binomial.R from Canvas 
10
Click
Click
Clicking here will download the file to your browser‚Äôs Downloads folder. Copy 
from there into a folder ‚ÄúMyRFiles‚Äù on the Desktop.
Do the Same for ‚ÄúBinomial.R‚Äù.
1
2

--- 3. Lecture4B-Multiple Regression.pdf | page=19 ---
LECTURE 4B-2 ‚Äì CORRELATIONS 
AMONG PREDICTORS AND PREDICTOR 
SELECTION
20

--- 4. Lecture4C-Testing Multiple Regression Assumptions.pdf | page=49 

In [76]:
answer, sources = ask("Give me the main topics covered in these lectures (high level).")
print(answer)
print("Sources:", sources)




The main high-level topics covered in these lectures are:

1. Descriptive and Inferential Statistics  
2. Correlation and Regression  
3. Univariate Discrete Distributions (including Bernoulli, Binomial, Multinomial, Geometric, Negative Binomial, and Poisson Distributions)  
4. Testing Multiple Regression Assumptions (such as Linearity)
Sources: ['Lecture3A-Descriptive and Inferential statistics.pdf:p1', 'Lecture4A-Correlation and Regression.pdf:p1', 'Lecture2A-Univariate Discrete Distributions-1.pdf:p1', 'Lecture4C-Testing Multiple Regression Assumptions.pdf:p3']


In [77]:
retriever = vs.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 12, "fetch_k": 40}
)


In [78]:
answer, sources = ask("Summarize the key concepts from Lecture 2A and 2B.")
print(answer)
print("Citations:", sources)




Lecture 2A covers Univariate Discrete Distributions, including Bernoulli, Binomial, Multinomial, Geometric, Negative Binomial, and Poisson Distributions, with reference to Book Chapter 4.

Lecture 2B covers Univariate Continuous Distributions, specifically the Uniform, Exponential, and Normal distributions, with reference to Book Chapters 5 and 6.
Citations: ['Lecture3A-Descriptive and Inferential statistics.pdf:p1', 'Lecture4A-Correlation and Regression.pdf:p2', 'Lecture2A-Univariate Discrete Distributions-1.pdf:p1', 'Lecture2B-Univariate Continuous Distributions.pdf:p1']


In [79]:
eval_qs = [
  "Explain Bernoulli and Binomial distributions in simple terms.",
  "What is the difference between a PMF and a PDF?",
  "Define expected value and variance.",
  "What is a confidence interval?",
  "When would you use a Poisson distribution?",
]
for q in eval_qs:
    a, s = ask(q)
    print("\nQ:", q)
    print("A:", a[:300])
    print("Sources:", s)





Q: Explain Bernoulli and Binomial distributions in simple terms.
A: The Bernoulli distribution is the simplest discrete distribution and models an experiment with only two possible outcomes: "success" with probability p, and "failure" with probability (1 ‚Äì p). We represent success as 1 and failure as 0 in a random variable X. Its probability mass function is given b
Sources: ['Lecture2A-Univariate Discrete Distributions-1.pdf:p6', 'Lecture2A-Univariate Discrete Distributions-1.pdf:p1', 'Lecture2A-Univariate Discrete Distributions-1.pdf:p36', 'Lecture2A-Univariate Discrete Distributions-1.pdf:p5']





Q: What is the difference between a PMF and a PDF?
A: The difference between a probability mass function (PMF) and a probability density function (PDF) is as follows:

- A PMF is used for discrete random variables and directly gives the probability for each specific value of the random variable. The probabilities are non-negative and sum to 1 over all 
Sources: ['Lecture2B-Univariate Continuous Distributions.pdf:p4', 'Lecture2A-Univariate Discrete Distributions-1.pdf:p3', 'Lecture2B-Univariate Continuous Distributions.pdf:p5']





Q: Define expected value and variance.
A: Expected value (or mean) and variance are defined as follows:

- Expected Value (E(X)): It is the weighted average of all possible values of a random variable, where the weights are the probabilities of each value. Mathematically, for a discrete random variable X with possible values x and probabili
Sources: ['Lecture2A-Univariate Discrete Distributions-1.pdf:p16', 'Lecture2A-Univariate Discrete Distributions-1.pdf:p6', 'Lecture3A-Descriptive and Inferential statistics.pdf:p10']





Q: What is a confidence interval?
A: I don't know based on the documents.
Sources: ['Lecture3B-Inferential Statistics - Confidence Intervals.pdf:p32', 'Lecture3B-Inferential Statistics - Confidence Intervals.pdf:p33', 'Lecture3B-Inferential Statistics - Confidence Intervals.pdf:p31', 'Lecture3B-Inferential Statistics - Confidence Intervals.pdf:p43']





Q: When would you use a Poisson distribution?
A: It is appropriate to use a Poisson distribution when you want to find the probability of a specified number of events occurring in a fixed interval of time and/or space. The events should occur independently and at a constant average rate. The Poisson distribution is especially suitable as a limitin
Sources: ['Lecture2A-Univariate Discrete Distributions-1.pdf:p43', 'Lecture2B-Univariate Continuous Distributions.pdf:p15', 'Lecture2A-Univariate Discrete Distributions-1.pdf:p42']
