The goal of this notebook is to
retrieve relevant chunks from the FAISS indexes and pass them to an LLM that outputs a 7-day study plan.

Implement semantic search (topk) and a simple 7-day plan composer that attaches citations from chunks.

LLM planner

Use the OpenAI API to:
1. take a query (the student's input)
2. search the indices (using `search_slides` and `search_labs`)
3. format the results with citations
4. call the chat model using the API key
5. generate the 7-day plan
6. save the plan to JSON/CSV

**Inputs**
- `../data/processed/slides_chunks.parquet`
- `../data/processed/labs_chunks.parquet`
- `../data/processed/faiss_slides.index`
- `../data/processed/faiss_labs.index`

**Outputs**
- Printed study plan in the notebook
- JSON file of the plan for saving 

In [33]:
from openai import OpenAI
from pathlib import Path
import pandas as pd 
import numpy as np
import os
import faiss
import json
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer


In [None]:
# Locate the processed artefacts this notebook depends on.
# The project root is taken to be the parent of the current working directory.
PROJECT_ROOT = Path("..").resolve()
PROCESSED_FOLDER = PROJECT_ROOT.joinpath("data", "processed")

# Chunk tables (parquet) and FAISS index files.
SLIDES_CHUNKS_PATH = PROCESSED_FOLDER.joinpath("slides_chunks.parquet")
LABS_CHUNKS_PATH = PROCESSED_FOLDER.joinpath("labs_chunks.parquet")
FAISS_SLIDES_PATH = PROCESSED_FOLDER.joinpath("faiss_slides.index")
FAISS_LABS_PATH = PROCESSED_FOLDER.joinpath("faiss_labs.index")

# For each artefact, report whether it exists and where it is expected.
for _label, _artefact in (
    ("Slides chunks:", SLIDES_CHUNKS_PATH),
    ("Labs chunks:  ", LABS_CHUNKS_PATH),
    ("Slides index: ", FAISS_SLIDES_PATH),
    ("Labs index:   ", FAISS_LABS_PATH),
):
    print(_label, _artefact.exists(), _artefact)



Slides chunks: True C:\Users\julmo\OneDrive - University of Rochester\TKH Labs\Grades2Goals_Planner\data\processed\slides_chunks.parquet
Labs chunks:   True C:\Users\julmo\OneDrive - University of Rochester\TKH Labs\Grades2Goals_Planner\data\processed\labs_chunks.parquet
Slides index:  True C:\Users\julmo\OneDrive - University of Rochester\TKH Labs\Grades2Goals_Planner\data\processed\faiss_slides.index
Labs index:    True C:\Users\julmo\OneDrive - University of Rochester\TKH Labs\Grades2Goals_Planner\data\processed\faiss_labs.index


In [18]:
# Load the chunk tables from the shared path constants (defined with the other
# project paths above) instead of repeating the relative path strings here.
slides_df = pd.read_parquet(SLIDES_CHUNKS_PATH)
labs_df   = pd.read_parquet(LABS_CHUNKS_PATH)

# Drop rows whose text is missing, then renumber the rows 0..n-1.
# The search helpers map FAISS result ids back to rows with .iloc, so row
# positions must line up with the order in which the texts are embedded.
slides_df = slides_df.dropna(subset=['text']).reset_index(drop=True)
labs_df   = labs_df.dropna(subset=['text']).reset_index(drop=True)

print("Slides rows:", len(slides_df))
print("Labs rows:  ", len(labs_df))

Slides rows: 40
Labs rows:   1410


In [19]:
# Read the previously saved slides and labs FAISS indexes from disk.
# NOTE(review): the index-building cell further down recreates both indexes
# and overwrites these files, so this load looks redundant — confirm it is needed.
slides_index_file = FAISS_SLIDES_PATH.as_posix()
labs_index_file = FAISS_LABS_PATH.as_posix()

slides_index = faiss.read_index(slides_index_file)
labs_index = faiss.read_index(labs_index_file)



In [20]:
# Embedding model
# Sentence-embedding model used by encode_normalized for both the corpus
# chunks and the student queries, so all vectors live in the same space.
# NOTE(review): the printed model summary shows max_seq_length=128, so longer
# chunks are presumably truncated before embedding — confirm chunk sizes.
embedder = SentenceTransformer("bert-base-nli-mean-tokens")


def encode_normalized(texts, show_progress_bar=None):
    """Embed one or more texts as L2-normalized float32 row vectors.

    Used both for the corpus chunks (a list of texts) and for a student query.

    Args:
        texts: A single string or an iterable of strings. A bare string is
            wrapped into a one-element batch so the result is always 2-D,
            which is the shape the FAISS add/search calls in this notebook use.
        show_progress_bar: Forwarded to ``embedder.encode``. ``None`` (default)
            shows the bar only when more than one text is encoded, so
            single-query searches do not print a "Batches" line each time.

    Returns:
        np.ndarray of dtype float32 with one row per input text.
        ``normalize_embeddings=True`` asks the model for unit-length vectors,
        so an inner-product index behaves like cosine similarity.
    """
    # Accept a lone query string as well as a batch of texts.
    batch = [texts] if isinstance(texts, str) else list(texts)

    if show_progress_bar is None:
        show_progress_bar = len(batch) > 1

    embeddings = embedder.encode(
        batch,
        normalize_embeddings=True,
        show_progress_bar=show_progress_bar,
    )
    return np.asarray(embeddings, dtype="float32")

# Print the loaded model's module summary as a quick sanity check.
print("Embedding model loaded:", embedder)

Embedding model loaded: SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)


In [21]:
# Build two inner-product FAISS indexes (slides, labs) and persist each one.

def _embed_and_index(chunks_df, index_path):
    """Embed the ``text`` column, load the vectors into a new IndexFlatIP, save it.

    Returns the embeddings array and the populated index.
    """
    vectors = encode_normalized(chunks_df['text'].tolist())
    flat_index = faiss.IndexFlatIP(vectors.shape[1])  # dimension = embedding width
    flat_index.add(vectors)
    faiss.write_index(flat_index, index_path.as_posix())
    return vectors, flat_index


# Slides first, then labs (same order as the progress bars appear).
slides_embeddings, slides_index = _embed_and_index(slides_df, FAISS_SLIDES_PATH)
labs_embeddings, labs_index = _embed_and_index(labs_df, FAISS_LABS_PATH)

print("Slides index size:", slides_index.ntotal)
print("Labs index size:", labs_index.ntotal)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Slides index size: 40
Labs index size: 1410


In [22]:
def search_slides(query_text, top_k=5):
    """
    Search ONLY the slides index and return a small, readable DataFrame of results.

    Steps:
    1) Encode the query with the same embedding model and normalization as the corpus.
    2) Ask FAISS for the top_k most similar vectors from the slides index.
    3) For each match, look up the original row in slides_df (file, page, text).
    4) Build a results table with source_type, file, page, text and score.
    5) Sort by score descending (higher ≈ more similar).

    Args:
        query_text (str): The student's query.
        top_k (int): Maximum number of matches to return.

    Returns:
        pd.DataFrame: Columns source_type, file, page, text, score. The frame is
        empty (but still has these columns) when top_k <= 0 or nothing matched.
    """
    result_columns = ["source_type", "file", "page", "text", "score"]

    # A non-positive k is not a meaningful query; treat it as "no results wanted".
    if top_k <= 0:
        return pd.DataFrame(columns=result_columns)

    # Embed the query as a one-row batch.
    query_vector = encode_normalized([query_text])

    # FAISS search: one row of scores and one row of ids for our single query.
    distances, indices = slides_index.search(query_vector, top_k)

    # Turn the returned ids into rows from slides_df.
    rows = []
    for score, idx in zip(distances[0], indices[0]):
        if idx < 0:  # FAISS returns -1 for empty results
            continue
        row = slides_df.iloc[idx]  # the matching slide chunk
        rows.append({
            "source_type": "slide",
            "file": row['file'],   # slide filename
            "page": row['page'],   # slide page number
            "text": row['text'],   # slide text chunk that matched
            "score": score,        # similarity score (higher is better)
        })

    # Passing the columns explicitly keeps "score" present even with zero rows,
    # so the sort below cannot fail on an empty result set.
    results_df = pd.DataFrame(rows, columns=result_columns)
    return results_df.sort_values(by="score", ascending=False).reset_index(drop=True)




In [23]:
def search_labs(query_text, top_k=5):
    """
    Search ONLY the labs index and return a small, readable DataFrame of results.

    Steps:
    1) Encode the query with the same embedding model and normalization as the corpus.
    2) Ask FAISS for the top_k most similar vectors from the labs index.
    3) For each match, look up the original row in labs_df (file, text).
    4) Build a results table with source_type, file, text and score
       (lab results carry no page number).
    5) Sort by score descending (higher ≈ more similar).

    Args:
        query_text (str): The student's query.
        top_k (int): Maximum number of matches to return.

    Returns:
        pd.DataFrame: Columns source_type, file, text, score. The frame is
        empty (but still has these columns) when top_k <= 0 or nothing matched.
    """
    result_columns = ["source_type", "file", "text", "score"]

    # A non-positive k is not a meaningful query; treat it as "no results wanted".
    if top_k <= 0:
        return pd.DataFrame(columns=result_columns)

    # Embed the query as a one-row batch.
    query_vector = encode_normalized([query_text])

    # FAISS search: one row of scores and one row of ids for our single query.
    distances, indices = labs_index.search(query_vector, top_k)

    # Turn the returned ids into rows from labs_df.
    rows = []
    for score, idx in zip(distances[0], indices[0]):
        if idx < 0:  # FAISS returns -1 for empty results
            continue
        row = labs_df.iloc[idx]  # the matching lab chunk
        rows.append({
            "source_type": "lab",
            "file": row['file'],   # lab filename
            "text": row['text'],   # lab text chunk that matched
            "score": score,        # similarity score (higher is better)
        })

    # Passing the columns explicitly keeps "score" present even with zero rows,
    # so the sort below cannot fail on an empty result set.
    results_df = pd.DataFrame(rows, columns=result_columns)
    return results_df.sort_values(by="score", ascending=False).reset_index(drop=True)


In [None]:
# Read variables from the local .env file into the process environment
# (override=True is passed so values from .env are applied over existing ones).
load_dotenv(override=True)

# One shared OpenAI client for the whole notebook; the key is taken from the
# OPENAI_API_KEY environment variable and is never hard-coded here.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def _format_page(page):
    """Render a slide page number for a citation without a trailing '.0'.

    Lab results have no 'page' column, so concatenating them with slide results
    turns integer page numbers into floats; 3 would otherwise print as '3.0'.
    """
    if isinstance(page, float) and page.is_integer():
        return str(int(page))
    return str(page)


def generate_7_day_plan(
    feedback_text,
    top_k_slides: int = 5,
    top_k_labs: int = 5,
    model_name: str = "gpt-4o-mini",
    temperature: float = 0.2,
    max_tokens: int = 1000,
):
    """
    End-to-end:
    1) Search slides and labs separately.
    2) Concatenate the results (labs first, since they are practical).
    3) Build a single context block with numbered citations ([1], [2], ...).
    4) Call OpenAI to write a 7-day plan with spaced review.
    5) Return the generated text.

    Args:
        feedback_text (str): Student's feedback, e.g., "I lost points on SQL joins and confusion matrix."
        top_k_slides (int): Number of slide chunks to include (default 5).
        top_k_labs   (int): Number of lab chunks to include (default 5).
        model_name   (str): OpenAI chat model (default "gpt-4o-mini").
        temperature (float): Sampling temperature passed to the chat call.
        max_tokens   (int): Completion token limit passed to the chat call.
            NOTE(review): a full 7-day plan may be cut off at 1000 — confirm.

    Returns:
        str: The study plan text generated by the LLM ("" if the model
        returned no text content).
    """
    # Search slides and labs
    slides_results = search_slides(feedback_text, top_k=top_k_slides)
    labs_results   = search_labs(feedback_text, top_k=top_k_labs)

    # Combine results: labs first, then slides, renumbered 0..n-1
    combined_results = pd.concat([labs_results, slides_results], ignore_index=True)

    # Build the context block; each entry is prefixed with its citation number.
    context_blocks = []
    for position, (_, row) in enumerate(combined_results.iterrows(), start=1):
        citation = f"[{position}]"
        if row['source_type'] == 'slide':
            page = _format_page(row['page'])
            context_blocks.append(f"{citation} (Slide: {row['file']} Page: {page}) {row['text']}")
        else:
            context_blocks.append(f"{citation} (Lab: {row['file']}) {row['text']}")

    context = "\n\n".join(context_blocks)  # double newline for readability

    system_prompt = (
        "You are an academic coach for a data science course. "
        "You must create a concrete 7-day micro-task plan using ONLY the provided context. "
        "Ensure tasks alternate between review (reading/notes), application (coding exercises), and reflection. "
        "Each day should include 2–4 actionable tasks, with estimated time, and a citation line that points back to the source. "
        "Use spaced review on Day 1, Day 3, and Day 6. "
        "If context is insufficient for any part, state that clearly."
    )

    user_prompt = (
        f"Student feedback: {feedback_text}\n\n"
        f"Context from course materials (slides and labs):\n"
        f"{context}\n"
        "Now write the 7-day plan in this structure:\n"
        "Day 1 — Understand\n"
        "- Task 1 (est. 15–25 min) — description [CITATION]\n"
        "- Task 2 (est. 10–20 min) — description [CITATION]\n"
        "Day 2 — Apply\n"
        "- Task 1 ...\n"
        "...\n"
        "Day 7 — Checkpoint\n"
        "- Mini-quiz or small coding task ...\n"
        "\n"
        "Rules:\n"
        "- Use only facts available in the context above.\n"
        # The context entries are labelled [1], [2], ...; ask for exactly that label.
        "- Each task line should end with a [CITATION]: the bracketed number of the context entry it is based on, e.g. [1].\n"
        "- If something is unclear or missing, say so.\n"
    )

    # Call OpenAI chat completion
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=temperature,  # low temperature for focused output
        max_tokens=max_tokens,
    )

    # The documented return type is str, so normalise a missing content to "".
    return response.choices[0].message.content or ""

In [42]:
# Smoke-test the planner on one sample piece of student feedback and print the plan.
example_feedback = "I lost points on SQL joins and I keep mixing up inner vs left vs right joins."
plan_text = generate_7_day_plan(
    example_feedback,
    top_k_slides=4,
    top_k_labs=6,
    model_name="gpt-4o-mini",
)
print(plan_text)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{"plan":{"days":[{"day":1,"tasks":[{"title":"Review SQL Joins","est_mins":25,"citation_url":""},{"title":"Study Inner Join vs Left Join vs Right Join","est_mins":20,"citation_url":""}]},{"day":2,"tasks":[{"title":"Practice SQL Joins with Sample Data","est_mins":30,"citation_url":""},{"title":"Complete SQL Join Exercises","est_mins":30,"citation_url":""}]},{"day":3,"tasks":[{"title":"Review SQL Keywords and Clauses","est_mins":25,"citation_url":"[5]"},{"title":"Practice SELECT statements with WHERE clause","est_mins":20,"citation_url":""}]},{"day":4,"tasks":[{"title":"Work on SQL Join Scenarios","est_mins":30,"citation_url":""},{"title":"Analyze SQL Join Results","est_mins":30,"citation_url":""}]},{"day":5,"tasks":[{"title":"Review SQL Data Manipulation Language (DML)","est_mins":25,"citation_url":"[5]"},{"title":"Practice INSERT, UPDATE, DELETE commands","est_mins":30,"citation_url":""}]},{"day":6,"tasks":[{"title":"Review SQL Join Types and Use Cases","est_mins":25,"citation_url":""},

In [None]:
# Detect Topic
# Lower-case keyword phrases used to tag a student query with course topics.
Topic_keywords = {
    "sql": ["join", "left join", "right join", "inner join", "outer join", "select", "from", "where", "group by", "order by", "having", "union", "intersect", "except", "subquery", "cte", "window function"],
    "loops": ["for loop", "while loop", "do while loop", "nested loop", "break", "continue", "infinite loop", "loop control"],
}


def detect_topic(query):
    """Return the topics whose keywords appear in ``query``.

    Matching is a case-insensitive substring test, so short keywords such as
    "from" or "cte" can also match inside ordinary words.

    Args:
        query (str): The student's free-text query or feedback.

    Returns:
        list[str]: Matching topic names, each listed once, in the order the
        topics are declared in ``Topic_keywords``. Empty if nothing matches.
    """
    text = query.lower()  # lower-case once instead of once per keyword
    return [
        topic
        for topic, keywords in Topic_keywords.items()
        if any(keyword in text for keyword in keywords)
    ]