In [93]:
!pip install pandas numpy scikit-learn



In [94]:
import sys
print(sys.executable)

c:\Users\anahi\miniconda3\envs\dooleyhelps\python.exe


In [95]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import os, json, math
from sklearn.linear_model import Ridge
import joblib

# --- Figure out project root and key dirs ---
cwd = os.getcwd()
if os.path.basename(cwd) == "Model":
    PROJECT_ROOT = os.path.dirname(cwd)
else:
    PROJECT_ROOT = cwd  # if you're already at the root

MODEL_DIR = os.path.join(PROJECT_ROOT, "Model")
MODEL_PATH = os.path.join(MODEL_DIR, "ridge_recommender.pkl")
OUT_DIR   = os.path.join(PROJECT_ROOT, "out")

print("CWD:", cwd)
print("PROJECT_ROOT:", PROJECT_ROOT)
print("MODEL_DIR:", MODEL_DIR)
print("OUT_DIR:", OUT_DIR)

# Make sure Python can import from Model/
if MODEL_DIR not in sys.path:
    sys.path.append(MODEL_DIR)

from track_graduation import track_grad

# --- Paths to your data files ---
SYNTHETIC_COURSES_PATH = os.path.join(MODEL_DIR, "synthetic_courses.json")
SYNTHETIC_PREF_PATH    = os.path.join(MODEL_DIR, "synthetic_pref.json")
COURSES_QUAL_PATH      = os.path.join(OUT_DIR, "courses_qualified.json")

print("SYNTHETIC_COURSES_PATH:", SYNTHETIC_COURSES_PATH)
print("SYNTHETIC_PREF_PATH:", SYNTHETIC_PREF_PATH)
print("COURSES_QUAL_PATH:", COURSES_QUAL_PATH)


CWD: c:\Users\anahi\OneDrive\Documents\EMORY\Fall 2025\CS 370\Model
PROJECT_ROOT: c:\Users\anahi\OneDrive\Documents\EMORY\Fall 2025\CS 370
MODEL_DIR: c:\Users\anahi\OneDrive\Documents\EMORY\Fall 2025\CS 370\Model
OUT_DIR: c:\Users\anahi\OneDrive\Documents\EMORY\Fall 2025\CS 370\out
SYNTHETIC_COURSES_PATH: c:\Users\anahi\OneDrive\Documents\EMORY\Fall 2025\CS 370\Model\synthetic_courses.json
SYNTHETIC_PREF_PATH: c:\Users\anahi\OneDrive\Documents\EMORY\Fall 2025\CS 370\Model\synthetic_pref.json
COURSES_QUAL_PATH: c:\Users\anahi\OneDrive\Documents\EMORY\Fall 2025\CS 370\out\courses_qualified.json


In [96]:
with open(SYNTHETIC_COURSES_PATH, "r") as f:
    synthetic_courses = {rec["shared_id"]: rec for rec in json.load(f)}

with open(SYNTHETIC_PREF_PATH, "r") as f:
    synthetic_prefs = {rec["shared_id"]: rec for rec in json.load(f)}

with open(COURSES_QUAL_PATH, "r") as f:
    candidate_courses = json.load(f)

print("students:", len(synthetic_courses), "candidate courses:", len(candidate_courses))


students: 40 candidate courses: 1787


In [97]:
# Map from survey "priorityOrder" labels â†’ internal aspect keys
ASPECT_KEY_MAP = {
    "GER_REQUIREMENTS":   "ger_reqs",
    "PROFESSOR_RATING":   "ratings",
    "MAJOR_REQUIREMENTS": "major_ger",
    "INTERESTS":          "interests",
    "TIME_PREFERENCE":    "time",
}

def weights(rank_order, decay=0.5):
    """
    rank_order: list of aspect keys in priority order (best â†’ worst)
    decay: geometric decay factor between ranks.
    """
    raw = {}
    for rank, aspect in enumerate(rank_order):
        raw[aspect] = decay ** rank

    total = sum(raw.values())
    return {k: v / total for k, v in raw.items()}

def compute_aspect_weights(priority_order, decay=0.5):
    rank_order = [
        ASPECT_KEY_MAP[p]
        for p in priority_order
        if p in ASPECT_KEY_MAP
    ]
    return weights(rank_order, decay=decay)


In [98]:
# Test example
# sample_priority = ["GER_REQUIREMENTS","PROFESSOR_RATING","MAJOR_REQUIREMENTS","INTERESTS","TIME_PREFERENCE"]
# aspect_weights = compute_aspect_weights(sample_priority, decay=0.5)
# aspect_weights


In [99]:
def hhmm_to_min(t):
    if not t or ":" not in t:
        return None
    h, m = t.split(":")
    return int(h) * 60 + int(m)

def score_time(course_doc, time_pref):
    """
    time_pref: ["HH:MM", "HH:MM"] (24h earliest, latest)
    Uses meeting.start_min/end_min from the qualified courses collection.
    """
    mtg = course_doc.get("meeting") or {}
    start = mtg.get("start_min")
    end   = mtg.get("end_min")

    if start is None or end is None:
        return 0.5

    if not time_pref or len(time_pref) < 2:
        return 0.5

    pref_start = hhmm_to_min(time_pref[0])
    pref_end   = hhmm_to_min(time_pref[1])

    if pref_start is None or pref_end is None or pref_start >= pref_end:
        return 0.5

    # Overlap reward
    overlap_start = max(start, pref_start)
    overlap_end   = min(end, pref_end)
    overlap = max(0, overlap_end - overlap_start)
    duration = max(0, end - start)

    if duration > 0 and overlap > 0:
        return 0.7 + 0.3 * (overlap / duration)

    # Otherwise, distance penalty
    if start < pref_start:
        distance = pref_start - start
    else:
        distance = start - pref_end

    SCALE = 120  # 2 hours
    penalty = min(1.0, distance / SCALE)
    return max(0.0, 1.0 - penalty)


In [100]:
def ger_set_builder(course_doc, ger_due, ger_left):
    def extract_tags(lst): 
        tags = set()
        for item in lst or []:
            if isinstance(item, dict):
                tags.update(item.keys())
            elif isinstance(item, str):
                tags.add(item)
        return tags
    
    due_tags = extract_tags(ger_due)
    left_tags = extract_tags(ger_left)
    return due_tags, left_tags, due_tags | left_tags

def score_ger_reqs(course_doc, ger_due, ger_left):
    due_tags, left_tags, _ = ger_set_builder(course_doc, ger_due, ger_left)
    course_tags = set(course_doc.get("ger") or [])
    if not course_tags:
        return 0.0
    if course_tags & due_tags:
        return 1.0
    if course_tags & left_tags:
        return 0.6
    return 0.0


In [101]:
def score_major_ger(course_doc, major_must_set, major_elec_set):
    code = (course_doc.get("code") or "").upper()
    if code in major_must_set:
        return 1.0
    if code in major_elec_set:
        return 1.0
    return 0.05


In [102]:
def score_interests(course_doc, interests):
    if not interests:
        return 0.5

    title = (course_doc.get("title") or "").lower()
    code  = (course_doc.get("code") or "").lower()

    hits = 0
    for raw in interests:
        key = str(raw).lower()
        if key in title or key in code:
            hits += 1

    if hits == 0:
        return 0.2
    return min(1.0, 0.4 + 0.6 * (hits / len(interests)))

def safe_float(x, default=None):
    try:
        if x is None:
            return default
        return float(x)
    except Exception:
        return default

def score_ratings(course_doc):
    rmp = course_doc.get("rmp") or {}
    rating_raw = rmp.get("rating")
    num_raw    = rmp.get("num_ratings")

    # Default to 2.9 if missing, per your spec
    rating = safe_float(rating_raw, 2.9)
    num    = safe_float(num_raw, 0.0)

    # Map rating in [1,5] â†’ [0,1]
    base = max(0.0, min(1.0, (rating - 1.0) / 4.0))

    # Popularity bonus based on log(#ratings); if num==0 â†’ 0
    if num <= 0:
        pop = 0.0
    else:
        pop = math.log1p(num) / math.log1p(50)   # saturates ~50 ratings

    return min(1.0, 0.8 * base + 0.2 * pop)




In [103]:
def build_major_sets(major_must, major_elec_groups):
    must_set = set(major_must)
    elec_set = set()
    for group in major_elec_groups:
        for c in group.get("courses", []):
            elec_set.add(c)
    return must_set, elec_set

def build_student_context(shared_id):
    pref = synthetic_prefs[shared_id]
    hist = synthetic_courses[shared_id]

    degree = pref["degreeType"]  # "BA" or "BS"
    major_code = "CSBA" if degree == "BA" else "CSBS"

    year  = pref["year"]                          # "Freshman", ...
    term  = pref["expectedGraduation"]["semester"]  # "Fall"/"Spring"

    major_must, major_elec, ger_due, ger_left = track_grad(
        major_code,
        hist["incoming_test_courses"],
        hist["incoming_transfer_courses"],
        hist["emory_courses"],
        year,
        term,
        countic=True,
    )

    major_must_set, major_elec_set = build_major_sets(major_must, major_elec)

    ctx = {
        "shared_id": shared_id,
        "pref": pref,
        "interests": pref.get("interests", []),
        "timePreference": pref.get("timePreference", []),
        "preferredCredits": pref.get("preferredCredits"),
        "major_must_set": major_must_set,
        "major_elec_set": major_elec_set,
        "ger_due": ger_due,
        "ger_left": ger_left,
    }
    return ctx


In [104]:
def score_course(course_doc, student_ctx, aspect_weights=None, ridge_model=None):
    """
    If ridge_model is provided, use ML prediction as total_score.
    Otherwise, fall back to the old weighted-sum using aspect_weights.
    Returns (total_score, components_dict)
    """
    comps = compute_components(course_doc, student_ctx)

    # Fallback: old weighted-sum logic
    if ridge_model is None:
        if aspect_weights is None:
            # Equal weights if none provided
            aspect_weights = {k: 1.0 / len(comps) for k in comps.keys()}
        total = 0.0
        for aspect, w in aspect_weights.items():
            total += w * comps.get(aspect, 0.0)
        return total, comps

    # ML path: use Ridge prediction
    x = extract_features(course_doc, student_ctx).reshape(1, -1)
    total = float(ridge_model.predict(x)[0])
    return total, comps


In [105]:
FEATURE_ORDER = ["time", "major_ger", "ger_reqs", "ratings", "interests"]

def compute_components(course_doc, student_ctx):
    """Return the per-aspect scores you already use, as a dict."""
    comps = {
        "time":       score_time(course_doc, student_ctx["timePreference"]),
        "major_ger":  score_major_ger(
            course_doc,
            student_ctx["major_must_set"],
            student_ctx["major_elec_set"]
        ),
        "ger_reqs":   score_ger_reqs(
            course_doc,
            student_ctx["ger_due"],
            student_ctx["ger_left"]
        ),
        "ratings":    score_ratings(course_doc),
        "interests":  score_interests(course_doc, student_ctx["interests"]),
    }
    return comps

def extract_features(course_doc, student_ctx):
    """
    Turn (student, course) into a numeric feature vector for Ridge.
    Right now: just the 5 aspect scores in a fixed order.
    """
    comps = compute_components(course_doc, student_ctx)
    return np.array([comps[name] for name in FEATURE_ORDER], dtype=float)


In [106]:
def build_training_data(shared_ids=None):
    """
    Build X, y from synthetic users and candidate_courses.
    y = 1 if the student has ever taken that course (by code), else 0.
    """
    if shared_ids is None:
        shared_ids = list(synthetic_courses.keys())

    X_list = []
    y_list = []

    for sid in shared_ids:
        student_ctx = build_student_context(sid)
        hist = synthetic_courses[sid]
        taken_codes = set(hist.get("emory_courses", []))

        for c in candidate_courses:
            code = (c.get("code") or "").upper()
            x = extract_features(c, student_ctx)
            y = 1 if code in taken_codes else 0

            X_list.append(x)
            y_list.append(y)

    X = np.vstack(X_list)
    y = np.array(y_list, dtype=float)
    return X, y


In [107]:
def train_ridge_model(alpha=1.0, save_path="models/ridge_recommender.pkl"):
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    X, y = build_training_data()

    model = Ridge(alpha=alpha)
    model.fit(X, y)

    print("Training complete:", X.shape)
    joblib.dump(model, save_path)
    print("Saved â†’", save_path)
    return model



In [108]:
def load_ridge_model(path="models/ridge_recommender.pkl"):
    if os.path.exists(path):
        return joblib.load(path)
    return None

RIDGE_MODEL = load_ridge_model()
print("Loaded Ridge:", RIDGE_MODEL)


Loaded Ridge: Ridge()


In [109]:
def normalize_course_code(code: str) -> str:
    """
    Normalize course codes for comparison.
    Mirrors your general pattern: uppercase, strip spaces, drop trailing 'Z' variants.
    """
    if not code:
        return ""
    c = str(code).strip().upper()
    # Optional: mimic getridofZ logic from track_graduation.py
    if c.endswith("ZL"):
        c = c[:-2] + "L"
    elif c.endswith("Z"):
        c = c[:-1]
    return c


def user_completed_set(student_ctx) -> set:
    """
    Build a set of normalized course codes the student has completed.
    Works for both synthetic data and real Mongo data if your context
    includes these keys.
    """
    completed = []

    hist = student_ctx.get("history") or {}
    # synthetic_courses style
    completed += hist.get("incoming_test_courses", [])
    completed += hist.get("incoming_transfer_courses", [])
    completed += hist.get("emory_courses", [])

    # If your real context already has a list like 'all_courses', you can add it here:
    completed += student_ctx.get("all_courses", [])

    return {normalize_course_code(c) for c in completed}


def user_satisfies_prereqs(course_doc: dict, student_ctx) -> bool:
    """
    Return True if the user satisfies this course's prerequisites.
    
    Assumes course_doc['prerequisites'] is an AND-of-OR list of lists, e.g.:
        [["MATH111"], ["CS170","CS171"]]
    meaning:
        (MATH111) AND (CS170 OR CS171)

    If prerequisites is missing, None, empty, or only empty groups â†’ treated as no prereq.
    """
    completed = user_completed_set(student_ctx)

    prereq_groups = course_doc.get("prerequisites")

    # No prereq info â†’ assume open to everyone
    if prereq_groups is None:
        return True

    # Normalize weird formats just in case
    if not isinstance(prereq_groups, list):
        return True  # fail-open rather than exploding

    if prereq_groups == []:
        return True

    # Each group is an OR-clause; all groups together are AND.
    for group in prereq_groups:
        # Empty group = trivially satisfied (often used for co-req / weird encodings)
        if not group:
            continue

        group_ok = False
        for raw_code in group:
            code_norm = normalize_course_code(raw_code)
            if code_norm and code_norm in completed:
                group_ok = True
                break

        # If NONE of the codes in this group are completed, prereqs not met.
        if not group_ok:
            return False

    return True


In [110]:
def get_ranked_courses(shared_id, use_ml=True):
    student_ctx = build_student_context(shared_id)
    priority_order = student_ctx["pref"]["priorityOrder"]
    aspect_weights = compute_aspect_weights(priority_order, decay=0.5)

    rows = []
    for c in candidate_courses:
        # ðŸ”´ NEW: skip if prereqs not satisfied
        if not user_satisfies_prereqs(c, student_ctx):
            continue

        total, comps = score_course(
            c,
            student_ctx,
            aspect_weights=None if (use_ml and RIDGE_MODEL is not None) else aspect_weights,
            ridge_model=RIDGE_MODEL if use_ml else None,
        )

        meeting = c.get("meeting") or {}

        rows.append({
            "code": c.get("code"),
            "title": c.get("title"),
            "professor": c.get("professor"),

            # GER tags from DetailedCourses / CoursesEnriched
            "ger": c.get("ger") or [],

            # raw Atlas string
            "time_raw": c.get("time"),
            # alias if other code uses it
            "time": c.get("time"),

            # parsed meeting info
            "days": meeting.get("days"),
            "start_min": meeting.get("start_min"),
            "end_min": meeting.get("end_min"),

            # ratings / scores
            "rmp_rating": (c.get("rmp") or {}).get("rating"),
            "credits": credits,
            "total_score": total,
            "time_score": comps["time"],
            "major_ger_score": comps["major_ger"],
            "ger_reqs_score": comps["ger_reqs"],
            "ratings_score": comps["ratings"],
            "interests_score": comps["interests"],
        })



    df = pd.DataFrame(rows)
    df = df.sort_values("total_score", ascending=False).reset_index(drop=True)
    return df


In [111]:
ridge_model = train_ridge_model(alpha=1.0)
RIDGE_MODEL = load_ridge_model()

df = get_ranked_courses("000001", use_ml=True)
df.head()


Training complete: (71480, 5)
Saved â†’ models/ridge_recommender.pkl


Unnamed: 0,code,title,professor,ger,time_raw,time,days,start_min,end_min,rmp_rating,credits,total_score,time_score,major_ger_score,ger_reqs_score,ratings_score,interests_score
0,AAS490R,Senior Seminar: African American Human Rights,Carol Anderson,[],,,"[T, Th]",780.0,855.0,0.0,"Thanks to CWI, CNRI, BeOpen.com, Zope Corp...",0.034286,1.0,0.05,0.0,0.0,0.2
1,MATH112,Calculus II,Ariana Brown,[],,,"[T, Th]",960.0,1035.0,0.0,"Thanks to CWI, CNRI, BeOpen.com, Zope Corp...",0.034286,1.0,0.05,0.0,0.0,0.2
2,ACT477,Machine Learning for Fundamental Analysis,Matthew Lyle,[],,,"[T, Th]",870.0,945.0,1.0,"Thanks to CWI, CNRI, BeOpen.com, Zope Corp...",0.034286,1.0,0.05,0.0,0.0,0.2
3,BIOL450,Computational Neuroscience,Gordon Berman,[],,,"[T, Th]",780.0,855.0,0.0,"Thanks to CWI, CNRI, BeOpen.com, Zope Corp...",0.034286,1.0,0.05,0.0,0.0,0.2
4,CHEM343,Chemical Biology,Christine Dunham,[],,,"[T, Th]",600.0,675.0,0.0,"Thanks to CWI, CNRI, BeOpen.com, Zope Corp...",0.034286,1.0,0.05,0.0,0.0,0.2


In [112]:
ranked_000001 = get_ranked_courses("000001")
ranked_000001.head(10)


Unnamed: 0,code,title,professor,ger,time_raw,time,days,start_min,end_min,rmp_rating,credits,total_score,time_score,major_ger_score,ger_reqs_score,ratings_score,interests_score
0,AAS490R,Senior Seminar: African American Human Rights,Carol Anderson,[],,,"[T, Th]",780.0,855.0,0.0,"Thanks to CWI, CNRI, BeOpen.com, Zope Corp...",0.034286,1.0,0.05,0.0,0.0,0.2
1,MATH112,Calculus II,Ariana Brown,[],,,"[T, Th]",960.0,1035.0,0.0,"Thanks to CWI, CNRI, BeOpen.com, Zope Corp...",0.034286,1.0,0.05,0.0,0.0,0.2
2,ACT477,Machine Learning for Fundamental Analysis,Matthew Lyle,[],,,"[T, Th]",870.0,945.0,1.0,"Thanks to CWI, CNRI, BeOpen.com, Zope Corp...",0.034286,1.0,0.05,0.0,0.0,0.2
3,BIOL450,Computational Neuroscience,Gordon Berman,[],,,"[T, Th]",780.0,855.0,0.0,"Thanks to CWI, CNRI, BeOpen.com, Zope Corp...",0.034286,1.0,0.05,0.0,0.0,0.2
4,CHEM343,Chemical Biology,Christine Dunham,[],,,"[T, Th]",600.0,675.0,0.0,"Thanks to CWI, CNRI, BeOpen.com, Zope Corp...",0.034286,1.0,0.05,0.0,0.0,0.2
5,ENGRD226W,Public Science Communication,Robert Birdwell,[],,,"[M, W]",780.0,855.0,0.0,"Thanks to CWI, CNRI, BeOpen.com, Zope Corp...",0.034286,1.0,0.05,0.0,0.0,0.2
6,PHIL385,Special Topics in Philosophy: Critical Black T...,Calvin Warren,[],,,"[T, Th]",690.0,765.0,0.0,"Thanks to CWI, CNRI, BeOpen.com, Zope Corp...",0.034286,1.0,0.05,0.0,0.0,0.2
7,CS110,Computer Science Fundamentals,Kristin Williams,[],,,"[M, W]",870.0,945.0,0.0,"Thanks to CWI, CNRI, BeOpen.com, Zope Corp...",0.034286,1.0,0.05,0.0,0.0,0.2
8,FIN483,Applied Investment Mgt,Jeffrey Busse,[],,,"[T, Th]",960.0,1035.0,0.0,"Thanks to CWI, CNRI, BeOpen.com, Zope Corp...",0.034286,1.0,0.05,0.0,0.0,0.2
9,ENG275W,Intro.to Asian Am.Lit.Culture,Erica Kanesaka,[],,,"[T, Th]",600.0,675.0,0.0,"Thanks to CWI, CNRI, BeOpen.com, Zope Corp...",0.034286,1.0,0.05,0.0,0.0,0.2


In [113]:
def blocks_conflict(start1, end1, start2, end2):
    """
    True if two time blocks overlap (strict overlap: start < other_end and other_start < end)
    """
    if start1 is None or end1 is None or start2 is None or end2 is None:
        # If we don't know the time, treat it as "no conflict" (or change to True if you prefer)
        return False
    return (start1 < end2) and (start2 < end1)


In [None]:
def get_schedule(shared_id, n_courses=5, excluded_codes=None):
    """
    Build a workable, non-overlapping schedule for a student.

    - Starts from the ranked course list
    - Skips any course whose code is in excluded_codes
    - Tries to ensure GERs due this year/term are covered if possible
    - Ensures no time conflicts on any shared day
    - Tries to keep total credits <= preferredCredits (if provided)
    """
    import pandas as pd

    excluded_codes = set(excluded_codes or [])

    # Ranked course candidates
    ranked = get_ranked_courses(shared_id)

    # Filter out excluded courses
    if excluded_codes:
        ranked = ranked[~ranked["code"].isin(excluded_codes)]

    # Student context for GER + credit info
    student_ctx = build_student_context(shared_id)
    pref_cred = (
        student_ctx.get("preferredCredits")
        or student_ctx["pref"].get("preferredCredits")
        or 16
    )

    # --- helpers ---------------------------------------------------------

    def safe_credits_from_value(val, default=3.0):
        """
        Robustly convert a credits value to float.
        Falls back to `default` if it can't be parsed.
        """
        if val is None:
            return float(default)
        try:
            return float(val)
        except (TypeError, ValueError):
            return float(default)

    def extract_tags(lst):
        """
        Turn ger_due / ger_left lists into a set of tag strings.
        Handles both [{'ECS': 1}, ...] and ['ECS', 'HLTH'] formats.
        """
        tags = set()
        for item in lst or []:
            if isinstance(item, dict):
                for k, v in item.items():
                    if v:  # only keep if count > 0
                        tags.add(k)
            elif isinstance(item, str):
                tags.add(item)
        return tags

    # GER tags that should be addressed by this year/term
    due_ger_tags = extract_tags(student_ctx.get("ger_due"))

    # Time / conflict handling
    schedule_blocks = {}  # day -> list of (start_min, end_min)
    selected_rows = []
    selected_codes = set()
    total_credits = 0.0

    def has_conflict(row):
        days = row.get("days") or []
        start = row.get("start_min")
        end = row.get("end_min")
        if start is None or end is None or not days:
            # If we don't know the time, treat it as "no conflict"
            return False

        for d in days:
            for (s, e) in schedule_blocks.get(d, []):
                # intervals overlap if NOT fully before or after
                if not (end <= s or start >= e):
                    return True
        return False

    def add_row_if_possible(row):
        """
        Try to add this course to the schedule.
        Returns True if added, False otherwise.
        """
        nonlocal total_credits
        code = row["code"]

        if code in selected_codes or code in excluded_codes:
            return False

        credits = safe_credits_from_value(row.get("credits", 3.0))

        # credit cap (soft-ish; can relax by changing the margin)
        if total_credits + credits > pref_cred + 1e-6:
            return False

        # time conflict
        if has_conflict(row):
            return False

        # Accept this course
        selected_rows.append(row)
        selected_codes.add(code)

        days = row.get("days") or []
        start = row.get("start_min")
        end = row.get("end_min")
        if start is not None and end is not None:
            for d in days:
                schedule_blocks.setdefault(d, []).append((start, end))

        total_credits += credits
        return True

    # --- STEP 1: try to satisfy GERs that are due soon -------------------

    covered_due_tags = set()

    if "ger" in ranked.columns and due_ger_tags:
        for tag in sorted(due_ger_tags):
            # Scan ranked list in order to find the best course for this tag
            for _, row in ranked.iterrows():
                course_gers = row.get("ger") or []
                if tag not in course_gers:
                    continue
                if len(selected_rows) >= n_courses:
                    break
                if add_row_if_possible(row):
                    covered_due_tags.add(tag)
                    break  # move on to next GER tag

    # --- STEP 2: fill remaining slots greedily by score ------------------

    for _, row in ranked.iterrows():
        if len(selected_rows) >= n_courses:
            break
        add_row_if_possible(row)

    if not selected_rows:
        # Nothing could be scheduled; return an empty DataFrame
        return ranked.head(0)

    # Build final schedule DataFrame
    schedule = pd.DataFrame(selected_rows).copy()

    # Normalize credits to floats for the front end
    schedule["credits"] = schedule["credits"].apply(
        lambda v: safe_credits_from_value(v, default=3.0)
    )

    # Nicely numbered slots
    schedule["slot"] = range(1, len(schedule) + 1)

    # These are the columns your UI is likely to want
    keep_cols = [
        "slot",
        "code",
        "title",
        "days",
        "time_raw",
        "credits",
        "total_score",
    ]
    # Keep only the columns that actually exist
    keep_cols = [c for c in keep_cols if c in schedule.columns]

    return schedule[keep_cols]







In [115]:
schedule_1 = get_schedule("000001", n_courses=5)
schedule_1

Unnamed: 0,slot,code,title,days,time_raw,credits,total_score
0,1,AAS490R,Senior Seminar: African American Human Rights,"[T, Th]",,3.0,0.034286
1,2,MATH112,Calculus II,"[T, Th]",,3.0,0.034286
2,3,ACT477,Machine Learning for Fundamental Analysis,"[T, Th]",,3.0,0.034286
4,4,CHEM343,Chemical Biology,"[T, Th]",,3.0,0.034286
5,5,ENGRD226W,Public Science Communication,"[M, W]",,3.0,0.034286
