The goal of this notebook is to build FAISS indexes for semantic search:
- Retrieve the chunked data
- Build the FAISS indexes
- Verify that the indexes can be queried and return results

**Inputs**
- `../data/processed/slides_chunks.parquet`  
- `../data/processed/labs_chunks.parquet`

In [11]:
# libraries
from pathlib import Path
import os
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
import faiss


In [None]:
# Project layout (the notebook is expected to run from a folder one level below the root)
PROJECT_ROOT = Path("..").resolve()
RAW_FOLDER = PROJECT_ROOT.joinpath("data", "raw")
PROCESSED_FOLDER = PROJECT_ROOT.joinpath("data", "processed")

# Chunked text produced upstream (inputs of this notebook)
SLIDES_CHUNKS_PATH = PROCESSED_FOLDER.joinpath("slides_chunks.parquet")
LABS_CHUNKS_PATH = PROCESSED_FOLDER.joinpath("labs_chunks.parquet")

# Where the FAISS indexes are written (outputs of this notebook)
FAISS_INDEX_PATH = PROCESSED_FOLDER.joinpath("faiss_slides.index")
FAISS_LABS_INDEX_PATH = PROCESSED_FOLDER.joinpath("faiss_labs.index")

# Quick check that both inputs are present before any heavy work starts
print("Slides chunks exist? ", SLIDES_CHUNKS_PATH.exists(), SLIDES_CHUNKS_PATH)
print("Labs chunks exist?   ", LABS_CHUNKS_PATH.exists(), LABS_CHUNKS_PATH)



Slides chunks exist?  True C:\Users\julmo\OneDrive - University of Rochester\TKH Labs\Grades2Goals_Planner\data\processed\slides_chunks.parquet
Labs chunks exist?    True C:\Users\julmo\OneDrive - University of Rochester\TKH Labs\Grades2Goals_Planner\data\processed\labs_chunks.parquet


In [3]:
# Load chunks from the paths defined in the configuration cell, so the inputs
# are declared in exactly one place.
# Rows without text are dropped and the index is reset: the search functions
# look rows up by position (`iloc`), so positions must be contiguous.
# NOTE(review): dropna removes only missing values; empty-string texts are
# kept — confirm that is intended.
slides_df = (
    pd.read_parquet(SLIDES_CHUNKS_PATH)
    .dropna(subset=["text"])
    .reset_index(drop=True)
)
labs_df = (
    pd.read_parquet(LABS_CHUNKS_PATH)
    .dropna(subset=["text"])
    .reset_index(drop=True)
)

print("Slides rows:", len(slides_df))
print("Labs rows:  ", len(labs_df))


Slides rows: 40
Labs rows:   1410


In [4]:
# Embedding model shared by the indexed chunks and the search queries.
# NOTE(review): the Hugging Face model card for this checkpoint reportedly marks
# it as deprecated — confirm, and consider a newer model (switching models
# requires rebuilding both indexes).
EMBEDDING_MODEL_NAME = "bert-base-nli-mean-tokens"
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)


def encode_normalized(texts, show_progress_bar=True):
    """Embed one or more texts as normalized float32 vectors.

    Used for the document chunks (a list of strings) as well as for search
    queries (a one-item list).

    Args:
        texts: A list of strings, or a single string. A single string is
            wrapped into a one-item list so the result is still a batch
            (one row per text), which is what a FAISS index consumes.
        show_progress_bar: Forwarded to `embedder.encode`. Defaults to True
            (the previous behavior); pass False to silence the "Batches" bar
            for single queries.

    Returns:
        A NumPy float32 array with one row per input text.
        - normalize_embeddings=True gives unit-length vectors, so the FAISS
          inner product is approximately the cosine similarity.
        - float32 is used because FAISS expects float32 vectors.
    """
    if isinstance(texts, str):
        # A bare string would otherwise be encoded to a single 1-D vector.
        texts = [texts]
    embeddings = embedder.encode(
        texts,
        normalize_embeddings=True,
        show_progress_bar=show_progress_bar,
    )
    return np.asarray(embeddings, dtype="float32")

# Sanity check: printing the model shows its module stack and settings.
print("Embedding model loaded:", embedder)

Embedding model loaded: SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)


In [10]:
# Create 2 FAISS indices, one for slides and one for labs

def build_and_save_index(texts, index_path):
    """Embed `texts`, build an inner-product FAISS index, and write it to disk.

    Args:
        texts: List of chunk texts, in the same order as the DataFrame rows
            they come from.
        index_path: `pathlib.Path` of the index file to write.

    Returns:
        (embeddings, index): the float32 embedding matrix and the populated
        `faiss.IndexFlatIP`.

    Raises:
        ValueError: if `texts` is empty (there would be nothing to index and
            no embedding dimension to size the index with).
    """
    if len(texts) == 0:
        raise ValueError(f"No texts to index for {index_path}")

    embeddings = encode_normalized(texts)
    # Inner product on normalized vectors is approximately cosine similarity.
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    # The search functions map FAISS ids back to DataFrame rows by position,
    # so the index must hold exactly one vector per text.
    assert index.ntotal == len(texts), (
        f"index holds {index.ntotal} vectors but {len(texts)} texts were given"
    )

    faiss.write_index(index, index_path.as_posix())
    return embeddings, index


# Slides index
slides_embeddings, slides_index = build_and_save_index(
    slides_df['text'].tolist(), FAISS_INDEX_PATH
)

# Labs index
labs_embeddings, labs_index = build_and_save_index(
    labs_df['text'].tolist(), FAISS_LABS_INDEX_PATH
)


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

In [7]:
def search_slides(query_text, top_k=5):
    """
    Search ONLY the slides index and return a small, readable DataFrame of results.

    Steps:
    1) Encode the query using the same embedding model and normalization.
    2) Ask FAISS for the top_k most similar vectors from the slides index.
    3) For each match, look up the original row in slides_df to get metadata (file, page, text).
    4) Build a results table with source_type, file, page, text, and the similarity score.
    5) Sort by score descending (higher ≈ more similar).

    Args:
        query_text: The search query string.
        top_k: Maximum number of results to return; must be >= 1.

    Returns:
        DataFrame with columns source_type, file, page, text, score. It has
        fewer than top_k rows (possibly zero) when FAISS has fewer matches.

    Raises:
        ValueError: if top_k is smaller than 1.
    """
    result_columns = ["source_type", "file", "page", "text", "score"]
    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k!r}")

    # Embed query (a one-item batch)
    query_vector = encode_normalized([query_text])

    # FAISS search: one row of scores / row positions per query
    scores, indices = slides_index.search(query_vector, top_k)

    # Turn indices into a list of rows from slides_df
    rows = []
    for score, idx in zip(scores[0], indices[0]):
        if idx < 0:  # FAISS returns -1 for empty results
            continue
        row = slides_df.iloc[idx]  # the matching slide chunk
        rows.append({
            "source_type": "slide",
            "file": row['file'],  # slide filename
            "page": row['page'],  # slide page number
            "text": row['text'],  # slide text chunk that matched
            "score": score,  # similarity score (higher is better)
        })

    # Explicit columns keep the frame well-formed (and sortable) even when
    # there are no matches; without them an empty result has no "score" column.
    results_df = pd.DataFrame(rows, columns=result_columns)
    return results_df.sort_values(by="score", ascending=False).reset_index(drop=True)

In [8]:
def search_labs(query_text, top_k=5):
    """
    Search ONLY the labs index and return a small, readable DataFrame of results.

    Steps:
    1) Encode the query using the same embedding model and normalization.
    2) Ask FAISS for the top_k most similar vectors from the labs index.
    3) For each match, look up the original row in labs_df to get metadata (file, text).
    4) Build a results table with source_type, file, text, and the similarity score
       (lab results have no page column).
    5) Sort by score descending (higher ≈ more similar).

    Args:
        query_text: The search query string.
        top_k: Maximum number of results to return; must be >= 1.

    Returns:
        DataFrame with columns source_type, file, text, score. It has fewer
        than top_k rows (possibly zero) when FAISS has fewer matches.

    Raises:
        ValueError: if top_k is smaller than 1.
    """
    result_columns = ["source_type", "file", "text", "score"]
    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k!r}")

    # Embed query (a one-item batch)
    query_vector = encode_normalized([query_text])

    # FAISS search: one row of scores / row positions per query
    scores, indices = labs_index.search(query_vector, top_k)

    # Turn indices into a list of rows from labs_df
    rows = []
    for score, idx in zip(scores[0], indices[0]):
        if idx < 0:  # FAISS returns -1 for empty results
            continue
        row = labs_df.iloc[idx]  # the matching lab chunk
        rows.append({
            "source_type": "lab",
            "file": row['file'],  # lab filename
            "text": row['text'],  # lab text chunk that matched
            "score": score,  # similarity score (higher is better)
        })

    # Explicit columns keep the frame well-formed (and sortable) even when
    # there are no matches; without them an empty result has no "score" column.
    results_df = pd.DataFrame(rows, columns=result_columns)
    return results_df.sort_values(by="score", ascending=False).reset_index(drop=True)


In [9]:
# Smoke test: run the same query against both indexes and show the top hits
demo_query = "SQL joins inner left right"
display(search_slides(demo_query, top_k=5))
display(search_labs(demo_query, top_k=5))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,source_type,file,page,text,score
0,slide,Introduction to Structured Databases I (1).pdf,62,Tuesday\nOn Tuesday we will review…\n●\nWhat i...,0.549009
1,slide,Advanced SQL I.pdf,43,Tuesday\nSQL + Python\n●\nHow do we design a d...,0.531278
2,slide,Introduction to Structured Databases II.pdf,42,Wednesday\nMore SQL Practice!\n●\nSQL Leetcode...,0.493887
3,slide,Types of Visualizations Review.pdf,64,population - entire group you could possibly g...,0.479493
4,slide,SQL Review.pdf,16,Next Week…\nNext week will entail:\n●\nMonday:...,0.413241


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,source_type,file,text,score
0,lab,w9-class2.ipynb,# SQL REVIEW,0.768907
1,lab,w10-class1.ipynb,"\n df = pd.read_sql_query(f""SELECT * FROM {...",0.748725
2,lab,w7-class3.ipynb,df = pd.read_csv('data.csv')\n# df = pd.read_e...,0.737325
3,lab,w10-class1.ipynb,import sqlite3\nimport pandas as pd,0.718976
4,lab,w9-class2.ipynb,import sqlite3\nimport pandas as pd,0.718976
