# Assignment pt2
Johnny Rosas

In [41]:
# core
import os
import re
import numpy as np
import pandas as pd
import string
# vector DB
import faiss
# pdf parsing libs
from pathlib import Path
from pypdf import PdfReader
# chumking + embeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

from openai import OpenAI

In [42]:
# using src folder and data folder we get the correct path to the PDF

PDF_PATH = "2020-2021-graduate-bulletin.pdf"

# set = None to load all pages.
PAGES_TO_LOAD = (
    list(range(37, 43)) +      # Admissions Registration
    list(range(43, 49)) +      # Tuition and Fees
    list(range(49, 51)) +      # Financial Aid
    list(range(67, 82)) +      # University Policies
    list(range(83, 88)) +      # Graduate Requirements
    list(range(88, 90)) +      # Doctoral Requirements
    list(range(91, 94)) +      # Specific Programs
    list(range(94, 98))       # Course Information
)

# TESTING
#print("Resolved PDF path:", PDF_PATH)
#print("File exists:", PDF_PATH.exists())
#print("Pages requested:", len(PAGES_TO_LOAD), "pages")

In [43]:
# Load and clean pdf from noise(headers, footers, page numbers, etc.)
reader = PdfReader(str(PDF_PATH))

# Store cleaned text from each page
pages_text = []

def clean_text(text: str) -> str:
    text = text.replace("\r", "\n") # normalize newlines
    text = re.sub(r"(?m)^\s*\d+\s*$", "", text) # remove lines that are just numbers

    text = re.sub(r"(?<=\w)\s-\s(?=\w)", "", text) # fix hyphenated words
    text = re.sub(r"(?<=\w)-\n(?=\w)", "", text) # fix hyphenated words
    text = re.sub(r"(?m)^\s*•\s*$", "", text) # remove bullet points on their own line
    text = re.sub(r"(?m)^\s*[•\-–—]*\s*[IVXLCDM0-9]+\s+(?=[A-Z])", "", text) 
    text = re.sub(r"SDSU\s*\|\s*Graduate\s*Bulletin", "", text, flags=re.IGNORECASE) # remove headers

    text = re.sub(r"[ \t]+", " ", text) # collapse multiple spaces/tabs
    text = re.sub(r"\n{3,}", "\n\n", text) # collapse multiple newlines

    return text.strip()

# which pages to load
total_pages = len(reader.pages)

if PAGES_TO_LOAD is None:
    page_indices = list(range(total_pages))
else:
    # keep only valid page numbers
    page_indicies = [p for p in PAGES_TO_LOAD if 0 <= p <= total_pages]

for page_index in page_indicies:
    page = reader.pages[page_index]
    raw_text = page.extract_text()

    if raw_text:
        cleaned_text = clean_text(raw_text)
        pages_text.append({
            "pdf_index": page_index, # pdf page index
            "page": page_index + 1,
            "text": cleaned_text
        })

print("PDF indices:", page_indicies[:10])
print("Viewer pages:", [i+1 for i in page_indicies[:10]])


#print("Total cleaned pages loaded:", len(pages_text))
#print("Loaded page numbers:", [p["page"] for p in pages_text[:25]], "..." if len(pages_text) > 25 else "")

# quick preview (first loaded page)
#USED FOR TESTING
#if pages_text:
#    print(f"\n--- Preview of first loaded page (index {pages_text[0]['pdf_index']} / human page {pages_text[0]['page']}) ---\n")
#    print(pages_text[0]["text"][:3500])

PDF indices: [37, 38, 39, 40, 41, 42, 43, 44, 45, 46]
Viewer pages: [38, 39, 40, 41, 42, 43, 44, 45, 46, 47]


In [44]:
RUNNING_HEADERS = "Admission and Registration" # handle repeated running header without losing main context
GLOBAL_CONTEXT = RUNNING_HEADERS # save the main header as global context to remove from subsequent pages

def remove_running_headers(page_text: str, header: str) -> str:
    """
    Remove repeating running headers that appear on many pages
    This prevents them from being treated as real section titles
    """
    if not page_text:
        return page_text

    lines = [ln.rstrip() for ln in page_text.splitlines()]

    # Drop lines that exactly equal the running header
    cleaned = []
    for ln in lines:
        if ln.strip() == header:
            continue
        cleaned.append(ln)

    return "\n".join(cleaned).strip()

Will continue working on this part in the future to improve section chunking.

# Splitting Pages into Sections

In [45]:
def split_page_to_sections(page_text: str):
    # Allows for common lowercase connector words inside headings
    connectors = r"(?:and|or|of|to|for|in|the|a|an|on|with|by|at|from|as|into|upon|within|without|via)"

    heading_pattern = re.compile(
        r"(?m)^(?:[•\-–— ]*)"
        r"(?P<h>"
            r"([A-Z][A-Z\s]{3,})"  # ALL CAPS
            r"|"
            r"([A-Z][a-z]+(?:\s+(?:[A-Z][a-z]+|" + connectors + r")){1,12})"  # Title Case + connectors
        r")\s*$"
    )

    matches = list(heading_pattern.finditer(page_text))

    if not matches:
        body = page_text.strip()
        return [("Unknown Section", body)] if body else []
    
    sections = []
    for idx, m in enumerate(matches):
        title = m.group("h").strip()

        start = m.end()
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(page_text)

        body = page_text[start:end].strip()
        if len(body) < 150:
            continue

        # Normalize CAPS headings to Title Case
        title_norm = title.title() if title.isupper() else title

        sections.append((title_norm, body))

    return sections

# DEBUG CELL: check extracted text + what section headers are detected
print("Total cleaned pages loaded:", len(pages_text))
if pages_text:
    print("Loaded page indices:", [p["pdf_index"] for p in pages_text])
    print("Loaded view pages:", [p["page"] for p in pages_text])

    # Print a short preview of the first 2 loaded pages
    for i in range(min(2, len(pages_text))):
        print(f"\n--- Preview page i={i} | pdf_index={pages_text[i]['pdf_index']} | viewer_page={pages_text[i]['page']} ---")
        print(pages_text[i]["text"][:9000])

    # Check headings detected on the first 5 loaded pages
    print("\n--- Detected section titles per page (first 5 loaded pages) ---")
    for i in range(min(5, len(pages_text))):
        tmp_text = pages_text[i]["text"]
        tmp_text = remove_running_headers(tmp_text, GLOBAL_CONTEXT)  # match your pipeline
        secs = split_page_to_sections(tmp_text)

        titles = [t for (t, _) in secs]
        print(f"pdf_index={pages_text[i]['pdf_index']} | viewer_page={pages_text[i]['page']} | #sections={len(titles)}")
        print("   ", titles[:12], "..." if len(titles) > 12 else "")


# Sections to paragraphs

In [46]:
# Split pages -> sections -> paragraph blocks -> final chunks
# MNOTE: avoid overly tiny chunks by packing multiple paragraphs before final splitting.

chunker = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=100,
    separators=["\n\n", "\n", ". ", " "]
)

def pack_paragraphs(paragraphs, target_chars=1200):
    blocks = []
    buf = []
    buf_len = 0
    for para in paragraphs:
        if not para:
            continue
        add_len = len(para) + (2 if buf else 0)
        if buf and (buf_len + add_len) > target_chars:
            blocks.append("\n\n".join(buf).strip())
            buf = [para]
            buf_len = len(para)
        else:
            buf.append(para)
            buf_len += add_len
    if buf:
        blocks.append("\n\n".join(buf).strip())
    return [b for b in blocks if b]

chunks = []

for p in pages_text:
    page_num = p["page"]
    page_text = p["text"]

    page_text = remove_running_headers(page_text, GLOBAL_CONTEXT) # remove redundant header but keep subcontext

    page_sections = split_page_to_sections(page_text)


    for section_title, section_body in page_sections:
        # Paragraph split on blank lines
        paragraphs = [para.strip() for para in section_body.split("\n\n") if para.strip()]

        # Pack paragraphs into larger
        blocks = pack_paragraphs(paragraphs, target_chars=1200)

        for block in blocks:
            subchunks = chunker.split_text(block)

            for sub in subchunks:
                if len(sub) < 200:
                    continue
                # If section_title is empty/none, just use the global context
                label = f"{GLOBAL_CONTEXT} > {section_title}" if section_title else GLOBAL_CONTEXT

                chunks.append({
                    "page": page_num,
                    "section": label,
                    "text": sub
                })



# DEBUG: verify chunk/section quality
print("Total chunks created:", len(chunks))

# Show how many chunks per section (top 15 sections)
from collections import Counter
section_counts = Counter([c["section"] for c in chunks])
print("\nTop sections by chunk count:")
for sec, cnt in section_counts.most_common(15):
    print(f"{cnt:3d} | {sec}")

# Print a couple example chunks from the first few unique sections
print("\n--- Sample chunks by section (first 8 unique sections) ---")
seen = set()
shown = 0
for c in chunks:
    if c["section"] in seen:
        continue
    seen.add(c["section"])
    print(f"\nSECTION: {c['section']} | page={c['page']}")
    print(c["text"][:700].replace("\n", " "))
    shown += 1
    if shown >= 8:
        break


# Build a Vector Database (FAISS)

Embeded each chunk using a sentence-transformer model and store vectors in a FAISS index.

In [47]:
# Create embeddings for each chunk and build FAISS index
# Small, fast, and good baseline for RAG (I tried all-mpnet-base-v2 but was slower)
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBED_MODEL_NAME)

texts = [c["text"] for c in chunks]

# Compute embeddings (float32 for FAISS)
emb = model.encode(texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")

# Normalize -> cosine 
faiss.normalize_L2(emb)

index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)

print("Embedding model:", EMBED_MODEL_NAME)
print("Vector dimension:", emb.shape[1])
print("Vectors in index:", index.ntotal)


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1359.24it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Batches: 100%|██████████| 10/10 [00:03<00:00,  2.81it/s]

Embedding model: all-MiniLM-L6-v2
Vector dimension: 384
Vectors in index: 311





I used localized embeddings using Embedding model: all-MiniLM-L6-v2

In [48]:
# Query helper
def search(query: str, top_k: int = 5):
    q = model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(q)
    scores, idxs = index.search(q, top_k)

    results = []
    for score, i in zip(scores[0].tolist(), idxs[0].tolist()):
        if i == -1:
            continue
        rec = chunks[i].copy()
        rec["score"] = float(score)
        results.append(rec)
    return results

# Quick Test
test_query = "How many units are required for graduation?"  # should hit graduation requirements section
results = search(test_query, top_k=5)

for r in results:
    print(f"score={r['score']:.3f} | page={r['page']} | section={r['section']}")
    print(r["text"][:250].replace("\n", " "), "...")
    print()


score=0.738 | page=85 | section=Admission and Registration > Master of Engineering Degree
Thirty-six units consisting of 500-, 600-, and 700-numbered courses specified by the degree requirements and program director while earned in graduate standing and six units of ENGR 798 (project) which may be taken as two three-unit modules. At least ...

score=0.733 | page=85 | section=Admission and Registration > Master of Fine Arts in Art Degree
Sixty units of 500-, 600-, and 700-numbered courses earned in graduate standing and specified by the School of Art and Design, at least 24 of which must be completed in residence. Courses required to remove undergraduate deficiencies are in addition  ...

score=0.709 | page=85 | section=Admission and Registration > Master of City Planning Degree
Forty-eight units of approved 500-, 600-, and 700-numbered courses earned in graduate standing, at least 39 of which must be completed in residence. Courses required to remove undergraduate deficiencies are in a

---

## Part 3 Connecting LLM

In [49]:
# RAG: Retrieval + LLM Answer
USE_OPENAI = True

def build_context(hits, max_chars=4000):
    parts = []
    total = 0
    for h in hits:
        header = f"[page={h['page']} | section={h.get('section','')} | score={h['score']:.3f}]"
        block = header + "\n" + h["text"].strip()
        if total + len(block) > max_chars:
            break
        parts.append(block)
        total += len(block)
    return "\n\n---\n\n".join(parts)

def rag_answer(query, top_k=5):
    # Retrieve top_k chunks using FAISS DB
    
    hits = search(query, top_k=top_k)   # <-- uses your existing FAISS search()
    context = build_context(hits)

    if not context.strip():
        return "Not found in the provided records.", hits

    if USE_OPENAI: # Send retrieved context + query to LLM
        
        client = OpenAI()  # uses OPENAI_API_KEY environment variable

        system_msg = (
            "You are a question-answering assistant. Use ONLY the provided CONTEXT. "
            "If the answer is not in the context, say: 'Not found in the provided records.' "
            "Be concise and cite the most relevant page/section from the context."
        )

        user_msg = f"""QUESTION:
{query}

CONTEXT:
{context}
"""

        resp = client.responses.create(
            model="gpt-5.1-chat-latest",
            input=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": user_msg}
            ],
        )

        return resp.output_text, hits # Return answer + the retrieval hits used

    else:
        return "LLM backend not configured.", hits


In [50]:
# DEMO
demo_q = "What are the admission requirements for graduate applicants?"
answer, hits = rag_answer(demo_q, top_k=5)

print("QUESTION:", demo_q)
print("\nANSWER:\n", answer)
print("\nTOP HITS USED:")
for h in hits:
    print(f"score={h['score']:.3f} | page={h['page']} | section={h.get('section','')}")


QUESTION: What are the admission requirements for graduate applicants?

ANSWER:
 Graduate applicants must:

• Possess a bachelor’s degree from an accredited institution or equivalent (page 41).  
• Have a cumulative GPA of at least 3.0 (page 41).  
• Be in good standing at all universities attended (page 41).  
• Submit satisfactory GRE/GMAT scores and TOEFL/IELTS/PTE Academic if required (page 41).  

Additional notes:  
• Some programs may require additional criteria (page 38).  
• Admission cannot be deferred and applicants must enroll in the term of admission (page 38).  

Special-action admission is possible for those not meeting standard requirements, with supporting evidence and letters of recommendation (page 38).

TOP HITS USED:
score=0.650 | page=38 | section=Admission and Registration > Application and Admission Process
score=0.604 | page=38 | section=Admission and Registration > Special Action Admissions
score=0.601 | page=41 | section=Admission and Registration > Admission

---

# Relevance Evaluation (PART 2)

do the returned chunks answer the query?

Below is a light-weight heuristic score in addition to the FAISS similarity:
- tokenize query and chunk
- compute token overlap ratio (Jaccard-like)

In [51]:
def tokenize(s: str):
    s = s.lower()
    s = s.translate(str.maketrans("", "", string.punctuation))
    toks = [t for t in s.split() if len(t) > 2]
    # small stopword list (keep minimal for reproducibility)
    stop = set(["the","and","for","with","that","this","from","are","was","were","you","your","into","can","will"])
    return [t for t in toks if t not in stop]

def overlap_score(query: str, chunk_text: str) -> float:
    q = set(tokenize(query))
    c = set(tokenize(chunk_text))
    if not q:
        return 0.0
    return len(q & c) / len(q)

queries = [
    "How many units are required for graduation?",
    "what are graduate admission requirements?",
    "what is the minimum gpa for graduate admission?",
    "How much is in state tuition?",
    "tell me about the graduate programs"
]

rows = []
TOP_K = 5

for q in queries:
    res = search(q, top_k=TOP_K)
    for rank, r in enumerate(res, start=1):
        rows.append({
            "query": q,
            "rank": rank,
            "faiss_score": r["score"],
            "overlap_score": overlap_score(q, r["text"]),
            "page": r["page"],
            "section": r["section"],
            "snippet": r["text"][:180].replace("\n", " ") + "..."
        })

df = pd.DataFrame(rows)
df.sort_values(["query","rank"], inplace=True)
df


Unnamed: 0,query,rank,faiss_score,overlap_score,page,section,snippet
0,How many units are required for graduation?,1,0.737707,0.4,85,Admission and Registration > Master of Enginee...,"Thirty-six units consisting of 500-, 600-, and..."
1,How many units are required for graduation?,2,0.732853,0.4,85,Admission and Registration > Master of Fine Ar...,"Sixty units of 500-, 600-, and 700-numbered co..."
2,How many units are required for graduation?,3,0.70903,0.4,85,Admission and Registration > Master of City Pl...,"Forty-eight units of approved 500-, 600-, and ..."
3,How many units are required for graduation?,4,0.705055,0.4,85,Admission and Registration > Master of Fine Ar...,"Fifty-four units of 500-, 600-, and 700-number..."
4,How many units are required for graduation?,5,0.674943,0.4,85,Admission and Registration > Master of Science...,Thirty-seven units of 600- and 700- numbered c...
15,How much is in state tuition?,1,0.638562,0.25,45,Admission and Registration > Tuition For Nonre...,(Foreign and Out-of-State) Nonresident tuition...
16,How much is in state tuition?,2,0.605754,0.25,50,Admission and Registration > Foreign Students,"In California, all students are required to pa..."
17,How much is in state tuition?,3,0.594618,0.25,46,Admission and Registration > Installment Plan,Basic Tuition and Fees. An installment plan is...
18,How much is in state tuition?,4,0.564687,0.25,44,Admission and Registration > Teaching Credenti...,"and a Basic Tuition Fee of either $1,665.00 or..."
19,How much is in state tuition?,5,0.542715,0.25,45,Admission and Registration > Living on,campus Commuting from home Basic tuition and f...


## Observation Notes

- **CHUNKING STRAT**: I decided to first extract the more importatn pages from the bulletin pdf -> split each page into sections using headlines -> split section text into paragraphs then form paragraph clustter -> use RecursiveCharacterTextSplitter with overlap to produce chunks. Each chunk is also stored with metadata such as page, and section title.

- **Vector Database**: I embedded each chunk using a SentenceTransformer embedding model and indexed the embeddings in FAISS. Since cosine similarity ranges from 0 to 1 for normalized vectors, scores closer to 1 indicate a stronger semantic mathc, and lower scres indicate weaker similarity. As you can see in my table above, I have good answers for the query questions.

- **Query Questions**: As you can see above, I have a test query question as well as 4 other ones. I noticed that whenever I would improve the chunking boundaries the FAISS score would increase slightly. Some queries scored descent (0.7) while others ok (0.5)

- **Encountered Issue**: One issue I encountered was that there would be redundent headers at the top of each new section. I first noticed "Admission and Registration" section being redundant with the header. I saw that the header would slip inbetween chunk context which i figured caused my initial low FAISS score. However, I decided to go with a global header approach in order to remove redundency and improve section specification.

- **Conclusion**: Overall, separeting the chunks was somewhat complicated in the begginning but with the use of test cases that I commented out, helped get me through. Despite limitations like improper chunk sectioning, I feel like i'll be able to improve on this by next due date.

---
# Evaluation Part 3

running this cell takes about 4-6 minutes bc going through all 264 questions.

In [52]:
PART1_QA_PATH = "QuestionsAnswers.csv"  

qa = pd.read_csv(PART1_QA_PATH) 

def cosine_sim(a_vec, b_vec):
    return float(np.dot(a_vec, b_vec.T))

def semantic_similarity(text_a, text_b, embed_model):
    va = embed_model.encode([text_a], normalize_embeddings=True)
    vb = embed_model.encode([text_b], normalize_embeddings=True)
    return float(np.dot(va, vb.T)[0][0])

rows = []
for _, row in qa.iterrows():
    q = str(row["Questions"])
    gold = str(row["Answers"])

    pred, hits = rag_answer(q, top_k=3) # decided to use top k=3 to reduce token usage

    sem = semantic_similarity(pred, gold, model)  # model = SentenceTransformer
    rows.append({
        "question": q,
        "gold_answer": gold,
        "predict_answer": pred,
        "semantic_sim": sem,
        "top_hit_page": hits[0]["page"] if hits else None,
        "top_hit_section": hits[0].get("section","") if hits else None,
        "top_hit_score": hits[0]["score"] if hits else None
    })

eval_df = pd.DataFrame(rows)
display(eval_df)
# better analyze results
#eval_df.to_csv("evaluation_results.csv", index=False) 

print("Average semantic similarity:", eval_df["semantic_sim"].mean())


Unnamed: 0,question,gold_answer,predict_answer,semantic_sim,top_hit_page,top_hit_section,top_hit_score
0,How many 600+ level CS classes are available?,29,Not found in the provided records.,0.123818,85,Admission and Registration > Master of Science...,0.554968
1,Who are the graduate advisers for Computer Sci...,Wei Wang and Roger Whitney,Not found in the provided records.,0.151362,98,Admission and Registration > Postsecondary Edu...,0.455246
2,What ART section number if Graphic Communication?,641,Not found in the provided records.,0.154870,84,Admission and Registration > Advancement to Ca...,0.401632
3,What classes will discuss Earthquake magnitude?,GEOL 638 Advanced Notable Historic Earthquakes...,Not found in the provided records.,0.069586,85,Admission and Registration > Master of Science...,0.348424
4,What is the lowest GPA needed to get admission...,A grade point average (GPA) of at lease 3.0,Not found in the provided records.,0.059244,85,Admission and Registration > Master of Science...,0.543454
...,...,...,...,...,...,...,...
259,Who is the President of San Diego State Univer...,Adela de la Torre.,Not found in the provided records.,0.042588,92,Admission and Registration > Summary of Curric...,0.613605
260,What is the deadline for students to notify in...,Students should notify instructors by the end ...,Students must notify instructors **by the end ...,0.830467,95,Admission and Registration > Religious Observa...,0.855531
261,Is the GRE (Graduate Record Examination) requi...,"While many programs require the GRE, requireme...",No. The bulletin states that while the GRE Gen...,0.757623,39,Admission and Registration > Examination Requi...,0.672134
262,What is the maximum number of days a medical-r...,SHS will work with students to provide documen...,Not found in the provided records.,0.081198,40,Admission and Registration > Determination of ...,0.559582


Average semantic similarity: 0.41139524199146155


In [53]:
# Count pred_answers that are "Not found" vs real answers
not_found_count = (eval_df["predict_answer"] == "Not found in the provided records.").sum()
real_answer_count = len(eval_df) - not_found_count

print(f"Questions with NO answer found: {not_found_count}")
print(f"Questions with REAL answers: {real_answer_count}")
print(f"Total questions: {len(eval_df)}")
print(f"\nPercentage found: {(real_answer_count / len(eval_df) * 100):.1f}%")
print(f"Percentage not found: {(not_found_count / len(eval_df) * 100):.1f}%")

Questions with NO answer found: 131
Questions with REAL answers: 133
Total questions: 264

Percentage found: 50.4%
Percentage not found: 49.6%


# Part 3 Final Notes

First, the query is embedded using the same sentence transformer model used for indexing (all-MiniLM-L6-v2). The query embedding is L2-normalized, and I use a FAISS IndexFlatIP index to retrieve the top-k most similar chunk vectors (inner product on normalized vectors ≈ cosine similarity). Each retrieved record includes the chunk text and metadata (page number and section title), along with a similarity score.\
After retrieving top results, I format them into a single “context” block for the LLM using build_context(). Each chunk is preceded by a short citation header showing [page | section | score]. I also capped the context length (max_chars=4000) to reduce token usage and avoid rate-limit issues.\
Semantic similarity is computed by embedding both the predicted answer and the gold answer(in csv) with the same sentence transformer model and taking cosine similarity. I also recorded the top retrieved chunk metadata (page, section, retrieval score) for transparency and debugging. Finally, I computed how many questions resulted in “Not found…” versus real answers. After examining the question from the csv a little more carefully, I realized some were more specific than others and hence the results. I figured I could add more pages to scan in part 1 but for token conservation I didn't tinker to add more. Theoretically, I can answer all the question in the csv but I also realized my parsing system does not scale as well.