In [1]:
pip install sentence-transformers faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [3]:
import pandas as pd

df = pd.read_csv("courses_detailed.csv")

print(df.columns)
print(len(df), "rows")
df.head()


Index(['program_name', 'req_num', 'course_code', 'title', 'credits', 'offered',
       'description', 'grading', 'repeatable', 'prerequisites', 'extra_blocks',
       'detail_url'],
      dtype='object')
3166 rows


Unnamed: 0,program_name,req_num,course_code,title,credits,offered,description,grading,repeatable,prerequisites,extra_blocks,detail_url
0,Advanced Mathematical Methods (for Students in...,1,MATH-UA 140,Linear Algebra,(4 Credits),"Fall, Spring, and Summer terms",Systems of linear equations. Gaussian eliminat...,CAS Graded,No,,,https://bulletins.nyu.edu/search/?P=MATH-UA%20140
1,Advanced Mathematical Methods (for Students in...,1,MATH-UA 148,Honors Linear Algebra,(4 Credits),,This honors section of Linear Algebra is a pro...,CAS Graded,No,,,https://bulletins.nyu.edu/search/?P=MATH-UA%20148
2,Advanced Mathematical Methods (for Students in...,2,MATH-UA 352,Numerical Analysis,(4 Credits),Spring,Formerly numbered MATH-UA 252; the content has...,CAS Graded,No,,,https://bulletins.nyu.edu/search/?P=MATH-UA%20352
3,Advanced Mathematical Methods (for Students in...,2,MATH-UA 358,Honors Numerical Analysis,(4 Credits),,Formerly numbered MATH-UA 258; the content has...,CAS Graded,No,,,https://bulletins.nyu.edu/search/?P=MATH-UA%20358
4,Advanced Mathematical Methods (for Students in...,2,MATH-GA 2010,Numerical Methods I,(3 Credits),Fall,This course is part of a two-course series mea...,GSAS Graded,No,,,https://bulletins.nyu.edu/search/?P=MATH-GA%20...


In [4]:
docs = []
metadata = []

for _, row in df.iterrows():
    doc = f"""
Program: {row['program_name']}
Requirement group: {row['req_num']}

Course: {row['course_code']} - {row['title']}
Credits: {row['credits']}
Typically offered: {row['offered']}
Prerequisites: {row['prerequisites']}
Repeatable: {row['repeatable']}
Grading: {row['grading']}

Description:
{row['description']}

Extra:
{row['extra_blocks']}
""".strip()

    docs.append(doc)
    metadata.append(row.to_dict())

print("Built", len(docs), "course docs.")


Built 3166 course docs.


In [5]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

embeddings = embedder.encode(docs, convert_to_numpy=True).astype("float32")

dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

print("Index contains", index.ntotal, "vectors.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Index contains 3166 vectors.


In [6]:
def retrieve_courses(query: str, k: int = 5):
    q_emb = embedder.encode([query], convert_to_numpy=True).astype("float32")
    distances, indices = index.search(q_emb, k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        if idx == -1:
            continue
        results.append({
            "score": float(dist),
            "text": docs[idx],
            "meta": metadata[idx],   # original row as dict
        })
    return results


In [7]:
results = retrieve_courses(
    "introductory data science course with no prerequisites and 4 credits",
    k=5
)

for r in results:
    m = r["meta"]
    print(f"Score: {r['score']:.4f}")
    print(f"{m['course_code']} - {m['title']}  ({m['credits']})")
    print("Program:", m["program_name"])
    print("Prereq:", m["prerequisites"])
    print("Offered:", m["offered"])
    print("Detail URL:", m["detail_url"])
    print("-" * 80)


Score: 0.4533
DS-UA 301 - Advanced Topics in Data Science  ((4 Credits))
Program: Data Science (Minor)
Prereq: DS-UA 112 and ( MATH-UA 185 or MATH-UA 334 or MA-UY 2224 as co-requisites ) and ( CSCI-UA 473 as a co-requisite ) and restricted to Majors/Minors.
Offered: Fall and Spring
Detail URL: https://bulletins.nyu.edu/search/?P=DS-UA%20301
--------------------------------------------------------------------------------
Score: 0.4665
DS-UA 301 - Advanced Topics in Data Science  ((4 Credits))
Program: Data Science (BA)
Prereq: DS-UA 112 and ( MATH-UA 185 or MATH-UA 334 or MA-UY 2224 as co-requisites ) and ( CSCI-UA 473 as a co-requisite ) and restricted to Majors/Minors.
Offered: Fall and Spring
Detail URL: https://bulletins.nyu.edu/search/?P=DS-UA%20301
--------------------------------------------------------------------------------
Score: 0.4756
DS-UA 301 - Advanced Topics in Data Science  ((4 Credits))
Program: Computer and Data Science (BA)
Prereq: DS-UA 112 and ( MATH-UA 185 or MAT

In [8]:
def build_prompt(query: str, retrieved):
    context_parts = []
    for i, item in enumerate(retrieved, start=1):
        context_parts.append(
            f"[Course {i}]\n{item['text']}"
        )
    context = "\n\n".join(context_parts)

    prompt = f"""
You are an assistant that helps a student plan their NYU courses.
Use ONLY the information in the context below.

Context:
{context}

Question:
{query}

Instructions:
- Base your answer only on the context above.
- When you mention courses, include the course code and title.
- If something is not clearly supported by the context, say you are not sure.
"""
    return prompt.strip()


In [10]:
def answer_query(query: str, k: int = 5) -> str:
    retrieved = retrieve_courses(query, k=k)
    prompt = build_prompt(query, retrieved)
    answer = call_llm(prompt)
    return answer


In [17]:
from google.colab import userdata
from openai import OpenAI

api_key = userdata.get("OPENAI_API_KEY")  # or whatever name you used
assert api_key, "No API key found in Colab userdata under 'OPENAI_API_KEY'"

client = OpenAI(api_key=api_key)


In [21]:
def build_prompt(query: str, retrieved):
    context_parts = []
    for i, item in enumerate(retrieved, start=1):
        context_parts.append(f"[Course {i}]\n{item['text']}")
    context = "\n\n".join(context_parts)

    prompt = f"""
You are an assistant helping a student plan their NYU courses.

Use ONLY the information in the context below.

Context:
{context}

Question:
{query}

Instructions:
- Answer using the context above.
- When you mention courses, include course code and title.
- If the answer is not clearly supported by the context, say you are not sure.
""".strip()
    return prompt

def call_llm(prompt: str, model: str = "gpt-4.1-mini") -> str:
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a concise, helpful course-planning assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    return response.choices[0].message.content

def answer_query(query: str, k: int = 5, model: str = "gpt-4.1-mini") -> str:
    retrieved = retrieve_courses(query, k=k)
    prompt = build_prompt(query, retrieved)
    return call_llm(prompt, model=model)


In [22]:
plan_query = """
I am an incoming first-year student planning to major in Data Science.
Assume I start in Fall 2025, want to take about 15–16 credits per semester,
and finish in 4 years.

Using the course information in the context, propose a tentative 8-semester
plan with course codes and titles. Try to:

- Put introductory/1000-level courses in the first year,
- Respect listed prerequisites when possible,
- Mix required/core DS courses with reasonable electives,
- Leave placeholders like 'Core humanities elective' where the catalog does not give enough detail.

Output the plan as a table with columns: Year, Semester, Course Code, Course Title, Credits, Notes.
"""

print(answer_query(plan_query, k=40))


| Year | Semester  | Course Code | Course Title                      | Credits   | Notes                                                    |
|-------|-----------|-------------|---------------------------------|-----------|----------------------------------------------------------|
| 1     | Fall      | DS-UA 111   | Principles of Data Science I     | 4 Credits | Introductory data science course; no prerequisites       |
| 1     | Fall      | CSCI-UA 2   | Introduction to Computer Programming (No Prior Experience) | 4 Credits | For students with no prior programming; prerequisite for CSCI-UA 101 |
| 1     | Fall      | Core math elective (e.g. MATH-UA 120 if allowed) | 4 Credits | Not specified in DS BA program but logical for math foundation (Discrete Mathematics) |
| 1     | Fall      | Core humanities elective        | 3-4 Credits | Placeholder for required humanities elective             |
| 1     | Spring    | CSCI-UA 101 | Intro to Computer Science        | 4 Credits | Prerequisi

In [23]:
plan_query = """
I am an incoming first-year student at NYU planning to major in Data Science.
Assume I start in Fall 2025, want to take about 15–16 credits per semester,
and finish in 4 years.

I want to spend one semester studying abroad in my 3rd year (for example,
Fall of junior year). During that semester, I will mainly take electives
or flexible requirements, not heavy prerequisite chains.

Using the course information in the context, propose a tentative 8-semester
plan with course codes and titles. Try to:

- Put introductory/1000-level courses in the first year.
- Respect listed prerequisites when possible.
- Ensure key prerequisite chains are completed BEFORE the study-abroad semester.
- Use the study-abroad semester mostly for electives or core requirements
  that do not break prerequisites.
- Leave placeholders like 'Core humanities elective' or 'Study abroad elective'
  where the catalog does not give enough detail.

Output the plan as a table with columns:
Year, Semester, Course Code, Course Title, Credits, Notes.
"""

print(answer_query(plan_query, k=40))


Based on the information provided and your plan to major in Data Science starting Fall 2025 with a study-abroad in Fall of your junior year (Year 3 Fall), here is a tentative 8-semester plan focusing on 15-16 credits per semester and respecting prerequisites and course levels. Since the context does not include explicit Data Science courses or core curriculum for the major, I have relied on applicable courses with clear prerequisites/principles and included placeholders where needed.

| Year | Semester | Course Code        | Course Title                                   | Credits    | Notes                                                                                         |
|-------|----------|--------------------|-----------------------------------------------|------------|-----------------------------------------------------------------------------------------------|
| 1     | Fall     | MATH-UA 121*       | Core Math Course (assumed prerequisite for PHYS-UA 11) | Not in contex