### Load data

In [None]:
import PyPDF2

# Path of the PDF to ingest, relative to the notebook's working directory.
file_path = "../data/data.pdf"

# Reset `text` before attempting the load. Without this, a failed load leaves
# `text` undefined (NameError in later cells) or, when the cell is re-run in a
# live kernel, silently keeps the text from a previous successful run.
text = ""

try:
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)

        # Collect the text of each page and join once at the end instead of
        # growing a string with repeated `+=`.
        page_texts = []
        for i, page in enumerate(reader.pages):
            try:
                page_texts.append(page.extract_text() + "\n")
            except Exception:
                # Best effort: a page that cannot be extracted is reported
                # and skipped so the remaining pages are still loaded.
                print(f"‚ö†Ô∏è Extract failed on page {i}")

        text = "".join(page_texts)

    print("PDF loaded successfully!")
    # Short preview of the extracted text.
    print(text[:200])

except FileNotFoundError:
    print("‚ùå File not found:", file_path)
except Exception as e:
    # Any other failure (e.g. while parsing the PDF) is reported here;
    # `text` then stays empty.
    print("‚ùå Unexpected error:", e)


### Clean text
- Remove unnecessary blank lines

In [None]:
import re

def clean_text(text):
    """Normalize whitespace in *text* and return the cleaned string.

    Every line is stripped of leading/trailing whitespace. Blank lines at
    the start are dropped and runs of consecutive blank lines are collapsed
    to a single blank line. Finally, runs of two or more spaces are
    replaced by one space.
    """
    kept = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if stripped:
            kept.append(stripped)
            continue
        # Blank line: keep it only when it directly follows a non-blank
        # line, which skips leading blanks and collapses repeated ones.
        if kept and kept[-1]:
            kept.append("")

    return re.sub(r" {2,}", " ", "\n".join(kept))


In [None]:
# Clean the raw text extracted from the PDF and preview its first 200 characters.
cleaned_text = clean_text(text)
print(cleaned_text[:200])


### Chunking

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the cleaned text into overlapping chunks for embedding:
# chunk_size=950 and chunk_overlap=130 are passed straight to the splitter.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=950,
    chunk_overlap=130
)

chunks = splitter.split_text(cleaned_text)
print(f"Total chunks: {len(chunks)}\n")

# Preview one chunk. The second chunk is shown when it exists (as before);
# a short document that yields a single chunk falls back to the first one
# instead of raising IndexError, and an empty result prints nothing.
if len(chunks) > 1:
    print(chunks[1])
elif chunks:
    print(chunks[0])


### Embedding

In [None]:
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Embedding model used to vectorize the chunks.
# NOTE(review): constructed with no arguments — presumably picks up the
# OpenAI API key from the environment; confirm before running.
embeddings = OpenAIEmbeddings()

# Build a Chroma vector store from the text chunks, using "chroma_db" as the
# persistence directory.
# NOTE(review): re-running this cell against an existing "chroma_db" directory
# may add the same chunks again — verify against the Chroma documentation.
vectordb = Chroma.from_texts(
    texts=chunks,
    embedding=embeddings,
    persist_directory="chroma_db"
)

# Print the number of entries in the store's collection.
# `_collection` is a private attribute of the wrapper, so this may break
# across library versions.
print(vectordb._collection.count())

In [None]:
# Explicitly persist the vector store.
# NOTE(review): newer Chroma versions reportedly persist automatically and
# deprecate this call — confirm against the installed version.
vectordb.persist()

### Testing vector search

In [None]:
# Sanity check: run a similarity search for a sample query and print the
# text of every returned document.
results = vectordb.similarity_search("good students")
for r in results:
    print(r.page_content)

### Creating RAG model

- Create a retriever that returns the 3 closest chunks

In [None]:
# Wrap the vector store as a retriever; k=3 requests three chunks per query.
retriever = vectordb.as_retriever(
    search_kwargs={"k": 3}
)

- Select the LLM model

In [None]:
from langchain_openai import ChatOpenAI

# Chat model used to generate the final answer.
# The low temperature (0.2) is presumably chosen to keep answers close to the
# retrieved context — confirm intent.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.2
)



- Create the prompt template

In [None]:
from langchain_core.prompts import PromptTemplate

# Prompt template for the RAG step. It has two placeholders, {context} and
# {question}, and instructs the model to answer only from the context and to
# reply "No information" when the answer is not found there.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
Use ONLY the information provided in the context to answer the question.
Extract relevant details and summarize clearly. Do not invent any information.

If the answer cannot be found in the context, reply:
"No information"

Context:
{context}

Question:
{question}
"""
)



In [None]:
def rag_chain(question: str, show_context=True):
    """Answer *question* from retrieved chunks and print a short report.

    The module-level ``retriever`` fetches documents for the question, their
    ``page_content`` values are joined with newlines into the context, the
    module-level ``prompt`` is formatted with that context and the question,
    and the result is sent to the module-level ``llm``.

    Args:
        question: The question to answer.
        show_context: When true, the retrieved context is printed; otherwise
            the placeholder "(hidden)" is printed in its place.

    Returns:
        The ``content`` of the model's response.
    """
    # Retrieve the documents matched to the question and build the context.
    retrieved = retriever.invoke(question)
    context = "\n".join(doc.page_content for doc in retrieved)

    # Fill the prompt template and ask the model for an answer.
    answer = llm.invoke(
        prompt.format(context=context, question=question)
    ).content

    # Print the question, the (optionally hidden) context and the answer.
    context_display = context if show_context else "(hidden)"
    print("üìå Input Question:")
    print(question)
    print("\nüìö Retrieved Context:")
    print(context_display)
    print("\nüß† Model Output:")
    print(answer)

    return answer


In [None]:
# Example query: credits per semester.
ans = rag_chain("How many credits are there per semester?")

In [None]:
# Example query: grading criteria.
ans = rag_chain("What are the grading criteria?")

In [None]:
# Example query that is presumably not answerable from the document, to
# exercise the "No information" fallback requested in the prompt — confirm.
ans = rag_chain("Is my friend named Jimmy studying here?")

In [None]:
# Example query: open-ended question about being a good student.
ans = rag_chain("How to be a good student?")