In [None]:
pip install chromadb

In [None]:
pip install sentence_transformers beautifulsoup4 requests

In [None]:
import requests
from urllib.parse import urljoin

from bs4 import BeautifulSoup  # BeautifulSoup does the HTML parsing for the scrape

term = "202610"  # 2025-2026 course catalog
base_url = "https://courses.rice.edu"
# Catalog listing page on courses.rice.edu for the chosen term.
url_catalog = base_url + "/admweb/!SWKSCAT.cat?p_action=CATALIST&p_term=" + term

# Upper bound (seconds) on each HTTP request so a stalled connection cannot hang the cell forever.
REQUEST_TIMEOUT_SECONDS = 30


def _find_label_tag(page, label):
    """Return the first <b> tag in `page` whose text contains `label`, or None."""
    return page.find(lambda tag: tag.name == 'b' and label in tag.get_text())


def _text_after_label(page, label, default):
    """Return the stripped text node that directly follows the <b> tag for `label`.

    `default` is returned when the label is absent, or when the node after it
    is missing or is not plain text (e.g. another tag). Calling `.strip()` on
    such a node would raise instead of yielding a value.
    """
    label_tag = _find_label_tag(page, label)
    if label_tag is None:
        return default
    sibling = label_tag.next_sibling
    # Text nodes are str subclasses in BeautifulSoup; tags and None are not.
    if not isinstance(sibling, str):
        return default
    return sibling.strip()


def _scrape_course_details(page):
    """Extract the per-course fields from a parsed course detail page.

    Returns a dict with the keys 'department', 'grade mode', 'course type',
    'restrictions', 'prerequisites' and 'description', in that order. Each
    value falls back to a "No ... found." placeholder when its label is missing.
    """
    details = {
        'department': _text_after_label(page, "Department:", "No department found."),
        'grade mode': _text_after_label(page, "Grade Mode:", "No grade mode found."),
        'course type': _text_after_label(page, "Course Type:", "No course type found."),
    }

    # Restrictions are spread over the <div> siblings that follow the label,
    # so they are collected and joined one per line.
    restrictions_tag = _find_label_tag(page, "Restrictions:")
    if restrictions_tag:
        restriction_divs = restrictions_tag.find_next_siblings("div")
        details['restrictions'] = "\n".join(div.text.strip() for div in restriction_divs)
    else:
        details['restrictions'] = "No restrictions found."

    details['prerequisites'] = _text_after_label(
        page, "Prerequisite(s):", "No prerequisites found."
    )
    details['description'] = _text_after_label(
        page, "Description:", "No description found."
    )
    return details


courses = []

# One session for the whole scrape so the connection to the host is reused
# across the many per-course requests.
with requests.Session() as session:
    # Get the HTML code of the catalog page; fail loudly on an HTTP error
    # instead of silently parsing an error page into zero courses.
    resp = session.get(url_catalog, timeout=REQUEST_TIMEOUT_SECONDS)
    resp.raise_for_status()
    html_source = resp.text

    soup = BeautifulSoup(html_source, "html.parser")  # our scraper

    for row in soup.select("tr"):  # iterate through each table row
        cells = row.find_all("td")  # all table data cells in the current row
        # Skip rows that don't have enough cells (header or blank rows).
        if len(cells) <= 4:
            continue
        # Only rows whose first cell links to a course page are course rows.
        link_tag = cells[0].find("a")
        if link_tag is None:
            continue

        course_info = {
            "course": cells[0].text.strip(),
            "title": cells[1].text.strip(),
            "distribution_group": cells[2].text.strip(),
            "diversity_credit": cells[3].text.strip() != "",
            "credit_hours": cells[4].text.strip(),
        }

        # Resolve the link against the site root; urljoin copes with hrefs
        # that are absolute or lack a leading slash.
        full_url = urljoin(base_url, link_tag['href'])

        print(f"  Scraping sub-page for {course_info['course']}...")
        # Fetch and parse the course-specific page.
        course_resp = session.get(full_url, timeout=REQUEST_TIMEOUT_SECONDS)
        course_resp.raise_for_status()
        course_soup = BeautifulSoup(course_resp.text, "html.parser")

        # Merge the detail-page fields into the catalog-row fields.
        course_info.update(_scrape_course_details(course_soup))
        courses.append(course_info)

# Show a count and a small sample rather than dumping every scraped record.
print(f"Scraped {len(courses)} courses.")
for course in courses[:3]:
    print(course)

In [None]:
# Disabled snapshot step. The triple-quoted string below is a plain string
# literal, so none of the code inside it runs. If enabled, it would write the
# in-memory `courses` list to courses.json with 4-space indentation.
# NOTE(review): the following cell reads courses.json, so that file must already
# exist on disk for a fresh-kernel run — confirm it is provided, or re-enable this step.
'''
import json

filename = "courses.json"


with open(filename, 'w') as json_file:
    json.dump(courses, json_file, indent=4)
'''

In [20]:
import json

# Load the saved course records from courses.json. This rebinds `courses`,
# replacing whatever value the name held earlier in the kernel session.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding.
with open('courses.json', 'r', encoding='utf-8') as f:
    courses = json.load(f)


In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

# One text document per course: title, course code and description are
# combined into the string that gets embedded.
documents = [
    f"Course: {course['title']} : {course['course']}. Description: {course['description']}."
    for course in courses
]

# The full course record is kept alongside each document as its metadata.
metadatas = list(courses)

# Ids are the course's position in `courses`, as a string ("0", "1", ...).
# They are derived from the index rather than a counter named `id`, which
# would shadow the builtin id() for the rest of the kernel session.
ids = [str(position) for position in range(len(courses))]

# Encode every document in one call; the progress bar shows batch progress.
embeddings = model.encode(documents, show_progress_bar=True)

In [None]:
import chromadb

# On-disk Chroma database; the collection is created on first run and reused afterwards.
db_path = './rice_courses_db'
client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection("rice_courses")

# Set a batch size safely under the 5461 limit. 5000 is a safe, round number.
batch_size = 5000
total_items = len(ids)
num_batches = (total_items + batch_size - 1) // batch_size  # ceiling division

print(f"Total items: {total_items}. Batch size: {batch_size}. Sending in {num_batches} batches...")

for batch_number, i in enumerate(range(0, total_items, batch_size), start=1):
    # The last batch may be shorter than batch_size.
    end_index = min(i + batch_size, total_items)

    print(f"Adding batch {batch_number}/{num_batches} (items {i} to {end_index})...")

    # upsert instead of add: the collection persists on disk between runs, so
    # re-running this cell must overwrite records with the same ids rather
    # than collide with (or silently keep) the previously stored ones.
    collection.upsert(
        embeddings=embeddings[i:end_index],
        documents=documents[i:end_index],
        metadatas=metadatas[i:end_index],
        ids=ids[i:end_index],
    )

In [None]:
# Metadata filter: the record must be in Distribution Group II AND carry diversity credit.
where_filter = {
    "$and": [
        {"distribution_group": {"$eq": "Distribution Group II"}},
        {"diversity_credit": {"$eq": True}}
    ]
}

# The query text is an empty string, so the metadata filter is what narrows the result.
# n_results is set to 1 here; when omitted, the query returns the 10 closest results.
results = collection.query(
    query_texts=[""],
    n_results=1,
    where=where_filter,
)

# Print the matched document text, its metadata and its distance, in that order.
for field in ("documents", "metadatas", "distances"):
    print(results[field])