In [18]:
!pip install pandas numpy scikit-learn



In [19]:
import sys
# Print the path of the Python interpreter running this kernel, to confirm
# which environment the notebook is using.
print(sys.executable)

c:\Users\anahi\miniconda3\envs\dooleyhelps\python.exe


In [20]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import os, json, math
import pandas as pd
# --- Figure out project root and key dirs ---
# The notebook may be started either from the project root or from its
# "Model" sub-folder; both cases resolve to the same PROJECT_ROOT.
cwd = os.getcwd()
if os.path.basename(cwd) == "Model":
    PROJECT_ROOT = os.path.dirname(cwd)
else:
    PROJECT_ROOT = cwd  # any other working directory is taken to be the root

MODEL_DIR = os.path.join(PROJECT_ROOT, "Model")
OUT_DIR   = os.path.join(PROJECT_ROOT, "out")

# Echo the resolved directories so a wrong working directory is easy to spot.
print("CWD:", cwd)
print("PROJECT_ROOT:", PROJECT_ROOT)
print("MODEL_DIR:", MODEL_DIR)
print("OUT_DIR:", OUT_DIR)

# Make sure Python can import from Model/.
# The membership check keeps re-runs of this cell from adding duplicates.
# NOTE: `sys` is imported in an earlier cell, not in this one.
if MODEL_DIR not in sys.path:
    sys.path.append(MODEL_DIR)

# Project-local module; presumably lives in Model/ (hence the sys.path tweak above).
from track_graduation import track_grad

# --- Paths to your data files ---
# The two synthetic student files are read from Model/, the candidate course
# list from out/.
SYNTHETIC_COURSES_PATH = os.path.join(MODEL_DIR, "synthetic_courses.json")
SYNTHETIC_PREF_PATH    = os.path.join(MODEL_DIR, "synthetic_pref.json")
COURSES_QUAL_PATH      = os.path.join(OUT_DIR, "courses_qualified.json")

print("SYNTHETIC_COURSES_PATH:", SYNTHETIC_COURSES_PATH)
print("SYNTHETIC_PREF_PATH:", SYNTHETIC_PREF_PATH)
print("COURSES_QUAL_PATH:", COURSES_QUAL_PATH)


CWD: c:\Users\anahi\OneDrive\Documents\EMORY\Fall 2025\CS 370\Model
PROJECT_ROOT: c:\Users\anahi\OneDrive\Documents\EMORY\Fall 2025\CS 370
MODEL_DIR: c:\Users\anahi\OneDrive\Documents\EMORY\Fall 2025\CS 370\Model
OUT_DIR: c:\Users\anahi\OneDrive\Documents\EMORY\Fall 2025\CS 370\out
SYNTHETIC_COURSES_PATH: c:\Users\anahi\OneDrive\Documents\EMORY\Fall 2025\CS 370\Model\synthetic_courses.json
SYNTHETIC_PREF_PATH: c:\Users\anahi\OneDrive\Documents\EMORY\Fall 2025\CS 370\Model\synthetic_pref.json
COURSES_QUAL_PATH: c:\Users\anahi\OneDrive\Documents\EMORY\Fall 2025\CS 370\out\courses_qualified.json


In [21]:
def _load_json(path):
    """Read and parse one JSON file, decoding it as UTF-8."""
    # Without an explicit encoding, open() falls back to the locale's preferred
    # encoding, which differs between platforms; JSON files are UTF-8.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


# Per-student records, indexed by their "shared_id" field.
synthetic_courses = {rec["shared_id"]: rec for rec in _load_json(SYNTHETIC_COURSES_PATH)}
synthetic_prefs = {rec["shared_id"]: rec for rec in _load_json(SYNTHETIC_PREF_PATH)}

# Candidate course documents, kept in file order.
candidate_courses = _load_json(COURSES_QUAL_PATH)

print("students:", len(synthetic_courses), "candidate courses:", len(candidate_courses))


students: 40 candidate courses: 1787


In [22]:
# Map from survey "priorityOrder" labels → internal aspect keys.
# The values are the component names produced by score_course(); labels that
# are not listed here are dropped by compute_aspect_weights().
ASPECT_KEY_MAP = {
    "GER_REQUIREMENTS":   "ger_reqs",
    "PROFESSOR_RATING":   "ratings",
    "MAJOR_REQUIREMENTS": "major_ger",
    "INTERESTS":          "interests",
    "TIME_PREFERENCE":    "time",
}

def weights(rank_order, decay=0.5):
    """
    Turn a priority ranking into normalized geometric weights.

    rank_order: list of aspect keys in priority order (best → worst).
    decay: geometric decay factor between ranks (rank r gets decay ** r
           before normalization).

    Returns a dict {aspect: weight} whose values sum to 1. Returns {} when
    rank_order is empty/None or the raw weights sum to zero, instead of
    raising ZeroDivisionError.
    """
    raw = {}
    for rank, aspect in enumerate(rank_order or []):
        # If an aspect is listed twice, keep its first (highest-priority) rank
        # rather than letting the later, smaller weight overwrite it.
        raw.setdefault(aspect, decay ** rank)

    total = sum(raw.values())
    if not total:
        return {}
    return {k: v / total for k, v in raw.items()}

def compute_aspect_weights(priority_order, decay=0.5):
    """
    Convert survey priority labels into normalized aspect weights.

    priority_order: survey labels, most important first. Labels that are not
                    keys of ASPECT_KEY_MAP are skipped.
    decay: forwarded unchanged to weights().
    """
    known_aspects = []
    for label in priority_order:
        if label in ASPECT_KEY_MAP:
            known_aspects.append(ASPECT_KEY_MAP[label])
    return weights(known_aspects, decay=decay)


In [23]:
# Test example
# sample_priority = ["GER_REQUIREMENTS","PROFESSOR_RATING","MAJOR_REQUIREMENTS","INTERESTS","TIME_PREFERENCE"]
# aspect_weights = compute_aspect_weights(sample_priority, decay=0.5)
# aspect_weights


In [24]:
def hhmm_to_min(t):
    """
    Convert an "HH:MM" string to minutes since midnight.

    Returns None when the value cannot be parsed: None, empty or non-string
    input, no colon, or non-numeric hour/minute parts. Malformed input yields
    None rather than an exception so callers can fall back to a neutral value.
    Anything after the minutes (e.g. the seconds in "HH:MM:SS") is ignored.
    """
    if not isinstance(t, str) or ":" not in t:
        return None
    parts = t.strip().split(":")
    try:
        hours, minutes = int(parts[0]), int(parts[1])
    except ValueError:
        return None
    return hours * 60 + minutes

def score_time(course_doc, time_pref):
    """
    Score how well a course's meeting time fits the student's preferred window.

    course_doc: course document; reads meeting.start_min / meeting.end_min.
    time_pref: ["HH:MM", "HH:MM"] (24h earliest, latest).

    Returns a float in [0, 1]:
      - 0.5 (neutral) when the meeting time or the preference is missing/invalid
      - 0.7 .. 1.0 when the meeting overlaps the window, growing with the
        fraction of the meeting that lies inside it
      - otherwise 1.0 minus a penalty proportional to how far the start time
        lies outside the window, reaching 0.0 at two hours
    """
    NEUTRAL = 0.5
    SCALE = 120  # minutes outside the window at which the score drops to 0

    mtg = course_doc.get("meeting") or {}
    start = mtg.get("start_min")
    end = mtg.get("end_min")

    if start is None or end is None:
        return NEUTRAL

    if not time_pref or len(time_pref) < 2:
        return NEUTRAL

    pref_start = hhmm_to_min(time_pref[0])
    pref_end = hhmm_to_min(time_pref[1])

    if pref_start is None or pref_end is None or pref_start >= pref_end:
        return NEUTRAL

    # Overlap reward
    overlap = max(0, min(end, pref_end) - max(start, pref_start))
    duration = max(0, end - start)

    if duration > 0 and overlap > 0:
        return 0.7 + 0.3 * (overlap / duration)

    # Distance penalty: how far the start time lies outside the window.
    # Clamped at 0 so a zero-length or inverted meeting whose start falls
    # inside the window cannot produce a negative distance (and a score > 1).
    distance = max(0, pref_start - start, start - pref_end)
    penalty = min(1.0, distance / SCALE)
    return max(0.0, 1.0 - penalty)


In [25]:
def ger_set_builder(course_doc, ger_due, ger_left):
    """
    Collect GER tags from the "due" and "left" requirement lists.

    Each list entry may be a dict (all of its keys are taken as tags) or a
    string (taken as a tag itself); entries of any other type are ignored.
    A list that is None/empty yields an empty set. `course_doc` is accepted
    but not used.

    Returns (due_tags, left_tags, due_tags | left_tags).
    """
    def _tags_of(entries):
        collected = set()
        for entry in (entries or []):
            if isinstance(entry, dict):
                # Iterating a dict yields its keys
                collected.update(entry)
            elif isinstance(entry, str):
                collected.add(entry)
        return collected

    due = _tags_of(ger_due)
    left = _tags_of(ger_left)
    return due, left, due | left

def score_ger_reqs(course_doc, ger_due, ger_left):
    """
    Score a course by the general-education requirements it satisfies.

    Reads the course's "ger" list and compares it with the tags extracted
    from ger_due / ger_left by ger_set_builder():
      1.0 if it shares a tag with the "due" set,
      0.6 if it only shares a tag with the "left" set,
      0.0 otherwise (including courses without GER tags).
    """
    due, left, _combined = ger_set_builder(course_doc, ger_due, ger_left)
    offered = set(course_doc.get("ger") or [])

    # An empty `offered` set is disjoint from everything, so it falls through to 0.0
    if not offered.isdisjoint(due):
        return 1.0
    if not offered.isdisjoint(left):
        return 0.6
    return 0.0


In [26]:
def score_major_ger(course_doc, major_must_set, major_elec_set):
    """
    Score a course by whether it counts toward the student's major.

    The course "code" is upper-cased before lookup. Returns 1.0 when the code
    is in either the required set or the elective set, and 0.05 otherwise.
    """
    course_code = (course_doc.get("code") or "").upper()
    counts_for_major = course_code in major_must_set or course_code in major_elec_set
    return 1.0 if counts_for_major else 0.05


In [27]:
def score_interests(course_doc, interests):
    """
    Score a course by how many of the student's interests appear in it.

    Each interest is matched case-insensitively as a substring of the course
    title or code. Blank/None interests are discarded first: an empty string
    is a substring of every title and would otherwise count as a hit for
    every course.

    Returns:
      0.5 when there are no usable interests (neutral),
      0.2 when none of them match,
      otherwise 0.4 + 0.6 * (matched / usable interests), capped at 1.0.
    """
    keywords = []
    for raw in interests or []:
        if raw is None:
            continue
        key = str(raw).strip().lower()
        if key:
            keywords.append(key)

    if not keywords:
        return 0.5

    title = (course_doc.get("title") or "").lower()
    code = (course_doc.get("code") or "").lower()

    hits = sum(1 for key in keywords if key in title or key in code)

    if hits == 0:
        return 0.2
    return min(1.0, 0.4 + 0.6 * (hits / len(keywords)))


import math

def safe_float(x, default=None):
    """
    Convert x to a finite float, returning `default` when that is not possible.

    `default` is returned when x is None, cannot be converted (TypeError,
    ValueError, or OverflowError for integers too large for a float), or
    converts to NaN / ±infinity. Non-finite values are rejected because they
    do not behave like numbers in later comparisons and min()/max() clamps.
    """
    if x is None:
        return default
    try:
        value = float(x)
    except (TypeError, ValueError, OverflowError):
        return default
    return value if math.isfinite(value) else default

def score_ratings(course_doc):
    """
    Robust RMP scoring, returning a value in [0, 1].

    - reads rmp.rating and rmp.num_ratings; either may be a string or missing
    - default rating = 2.9, default num_ratings = 0
    - 80% of the score is the rating mapped from [1, 5] to [0, 1]
    - 20% is a popularity bonus that grows with log(#ratings) and saturates
      at 50 ratings
    """
    DEFAULT_RATING = 2.9
    DEFAULT_NUM = 0.0
    SATURATION = 50  # number of ratings at which the popularity bonus maxes out

    rmp = course_doc.get("rmp") or {}
    rating = safe_float(rmp.get("rating"), DEFAULT_RATING)
    num = safe_float(rmp.get("num_ratings"), DEFAULT_NUM)

    # A NaN rating would pass through the clamp below as a perfect 1.0,
    # so treat non-finite inputs as missing.
    if not math.isfinite(rating):
        rating = DEFAULT_RATING
    if not math.isfinite(num):
        num = DEFAULT_NUM

    # Map rating in [1,5] → [0,1]
    base = max(0.0, min(1.0, (rating - 1.0) / 4.0))

    # Popularity bonus based on log(#ratings); if num==0 → 0.
    # Capped at 1.0 so that many ratings cannot contribute more than the
    # intended 20% share and outweigh a poor rating.
    if num <= 0:
        pop = 0.0
    else:
        pop = min(1.0, math.log1p(num) / math.log1p(SATURATION))

    return min(1.0, 0.8 * base + 0.2 * pop)



In [28]:
def build_major_sets(major_must, major_elec_groups):
    """
    Flatten major requirements into two lookup sets.

    major_must: iterable of required course codes.
    major_elec_groups: iterable of dicts; each group's "courses" list
                       (if present) contributes to the elective set.

    Returns (must_set, elec_set).
    """
    required = set(major_must)
    electives = {
        course
        for group in major_elec_groups
        for course in group.get("courses", [])
    }
    return required, electives

def build_student_context(shared_id):
    """
    Assemble everything the scoring functions need for one student.

    Looks the student up in `synthetic_prefs` and `synthetic_courses`, asks
    track_grad() for the outstanding major and GER requirements, and returns
    a dict with the raw preferences, the major requirement sets and the GER
    lists. Raises KeyError if the id or a required preference field is missing.
    """
    pref = synthetic_prefs[shared_id]
    hist = synthetic_courses[shared_id]

    # Degree type "BA" maps to the CSBA track; anything else maps to CSBS
    if pref["degreeType"] == "BA":
        major_code = "CSBA"
    else:
        major_code = "CSBS"

    class_year = pref["year"]
    grad_term = pref["expectedGraduation"]["semester"]

    requirements = track_grad(
        major_code,
        hist["incoming_test_courses"],
        hist["incoming_transfer_courses"],
        hist["emory_courses"],
        class_year,
        grad_term,
        countic=True,
    )
    major_must, major_elec, ger_due, ger_left = requirements

    must_set, elec_set = build_major_sets(major_must, major_elec)

    return {
        "shared_id": shared_id,
        "pref": pref,
        "interests": pref.get("interests", []),
        "timePreference": pref.get("timePreference", []),
        "preferredCredits": pref.get("preferredCredits"),
        "major_must_set": must_set,
        "major_elec_set": elec_set,
        "ger_due": ger_due,
        "ger_left": ger_left,
    }


In [29]:
def score_course(course_doc, student_ctx, aspect_weights):
    """
    Compute the weighted total score of one course for one student.

    Returns (total, components) where `components` holds the five per-aspect
    scores ("time", "major_ger", "ger_reqs", "ratings", "interests") and
    `total` is the sum of weight * component over `aspect_weights`. Aspects
    in `aspect_weights` with no matching component contribute 0.
    """
    components = {}
    components["time"] = score_time(course_doc, student_ctx["timePreference"])
    components["major_ger"] = score_major_ger(
        course_doc,
        student_ctx["major_must_set"],
        student_ctx["major_elec_set"],
    )
    components["ger_reqs"] = score_ger_reqs(
        course_doc,
        student_ctx["ger_due"],
        student_ctx["ger_left"],
    )
    components["ratings"] = score_ratings(course_doc)
    components["interests"] = score_interests(course_doc, student_ctx["interests"])

    total = 0.0
    for aspect in aspect_weights:
        total += aspect_weights[aspect] * components.get(aspect, 0.0)
    return total, components


In [30]:
def get_ranked_courses(shared_id):
    """
    Returns a DataFrame of *all* candidate courses for this student,
    sorted by total_score descending.

    One row per entry of `candidate_courses`, holding the course's identifying
    and meeting fields, its credits (from "credits", then "credit", else 3.0),
    the weighted total score and the five per-aspect scores. Courses with equal
    total_score keep their order from `candidate_courses`. When there are no
    candidates, an empty DataFrame with the same columns is returned.
    """
    columns = [
        "shared_id", "code", "title", "days", "time_raw", "start_min", "end_min",
        "rmp_rating", "credits", "total_score", "time_score", "major_ger_score",
        "ger_reqs_score", "ratings_score", "interests_score",
    ]

    student_ctx = build_student_context(shared_id)
    priority_order = student_ctx["pref"]["priorityOrder"]
    aspect_weights = compute_aspect_weights(priority_order, decay=0.5)

    rows = []
    for c in candidate_courses:
        total, comps = score_course(c, student_ctx, aspect_weights)
        meeting = c.get("meeting") or {}
        credits = c.get("credits") or c.get("credit") or 3.0

        rows.append({
            "shared_id": shared_id,
            "code": c.get("code"),
            "title": c.get("title"),
            "days": meeting.get("days") or [],
            "time_raw": meeting.get("raw"),
            "start_min": meeting.get("start_min"),
            "end_min": meeting.get("end_min"),
            "rmp_rating": (c.get("rmp") or {}).get("rating"),
            "credits": credits,
            "total_score": total,
            "time_score": comps["time"],
            "major_ger_score": comps["major_ger"],
            "ger_reqs_score": comps["ger_reqs"],
            "ratings_score": comps["ratings"],
            "interests_score": comps["interests"],
        })

    # Passing the columns explicitly keeps the schema when `rows` is empty;
    # otherwise the sort below would fail on a missing "total_score" column.
    df = pd.DataFrame(rows, columns=columns)
    # mergesort is stable, so tied scores are ordered deterministically.
    df = df.sort_values("total_score", ascending=False, kind="mergesort")
    return df.reset_index(drop=True)


In [31]:
# Example: rank every candidate course for student "000001" and show the ten
# highest-scoring rows.
ranked_000001 = get_ranked_courses("000001")
ranked_000001.head(10)


Unnamed: 0,shared_id,code,title,days,time_raw,start_min,end_min,rmp_rating,credits,total_score,time_score,major_ger_score,ger_reqs_score,ratings_score,interests_score
0,1,CS329,Computational Linguistics,"[M, W]",MW 2:30pm-3:45pm,870.0,945.0,4.5,3.0,0.354839,1.0,1.0,0.0,0.7,0.2
1,1,CS377,Database Systems,"[M, W]",MW 10am-11:15am,600.0,675.0,4.4,3.0,0.349677,1.0,1.0,0.0,0.68,0.2
2,1,CS171,Introduction to Computer Science II,"[M, W]",MW 11:30am-12:45pm,690.0,765.0,4.4,3.0,0.349677,1.0,1.0,0.0,0.68,0.2
3,1,CS463,Quantum Computing and Information,"[M, W]",MW 1pm-2:15pm,780.0,855.0,4.2,3.0,0.339355,1.0,1.0,0.0,0.64,0.2
4,1,CS370,Computer Science Practicum,"[T, Th]",TTh 4pm-5:15pm,960.0,1035.0,3.6,3.0,0.308387,1.0,1.0,0.0,0.52,0.2
5,1,CS370,Computer Science Practicum,"[T, Th]",TTh 5:30pm-6:45pm,1050.0,1125.0,3.6,3.0,0.284194,0.25,1.0,0.0,0.52,0.2
6,1,CS255,Computer Architecture/Machine Level Programming,"[T, Th]",TTh 10am-11:15am,600.0,675.0,,3.0,0.272258,1.0,1.0,0.0,0.38,0.2
7,1,MATH221,Linear Algebra,"[T, Th]",TTh 8:30am-9:45am,510.0,585.0,,4.0,0.272258,1.0,1.0,0.0,0.38,0.2
8,1,MATH221,Linear Algebra,"[T, Th]",TTh 1pm-2:15pm,780.0,855.0,,4.0,0.272258,1.0,1.0,0.0,0.38,0.2
9,1,CS452,Operating Systems,"[T, Th]",TTh 2:30pm-3:45pm,870.0,945.0,,3.0,0.272258,1.0,1.0,0.0,0.38,0.2


In [32]:
def blocks_conflict(start1, end1, start2, end2):
    """
    Report whether two time blocks overlap.

    Overlap is strict: blocks that merely touch (one ends exactly when the
    other starts) do not conflict. If any endpoint is None the time is
    unknown and the blocks are treated as not conflicting.
    """
    endpoints = (start1, end1, start2, end2)
    if any(point is None for point in endpoints):
        return False
    return start1 < end2 and start2 < end1


In [33]:
def _conflicts_with_schedule(days, start, end, schedule_blocks):
    """True if the block start–end overlaps an already chosen block on any of `days`."""
    return any(
        blocks_conflict(start, end, s0, e0)
        for d in days
        for (s0, e0) in schedule_blocks.get(d, [])
    )


def get_schedule(shared_id, n_courses=5, excluded_codes=None):
    """
    Build a workable, non-overlapping schedule for a student.

    - Starts from the ranked course list
    - Skips any course whose code is in excluded_codes
    - Picks at most one section per course code
    - Ensures no time conflicts on any shared day
    - Keeps total credits <= preferredCredits (16 when not provided)

    Returns a DataFrame with the columns slot, code, title, days, time_raw,
    credits, total_score, ordered by total_score descending. If no course can
    be selected, the top `n_courses` ranked rows are returned as a best-effort
    fallback, in the same column layout.
    """
    columns = ["slot", "code", "title", "days", "time_raw", "credits", "total_score"]
    excluded_codes = set(excluded_codes or [])

    ranked = get_ranked_courses(shared_id)

    # Filter out excluded courses
    if excluded_codes:
        ranked = ranked[~ranked["code"].isin(excluded_codes)]

    # Figure out credit target
    student_ctx = build_student_context(shared_id)
    pref_cred = student_ctx.get("preferredCredits") or 16

    selected_rows = []
    selected_codes = set()
    total_credits = 0.0

    # Occupied blocks per day for conflict checking:
    # schedule_blocks["M"] = [(start_min, end_min), ...]
    schedule_blocks = {}

    for _, row in ranked.iterrows():
        if len(selected_rows) >= n_courses:
            break

        code = row["code"]
        days = row["days"] or []
        start = row["start_min"]
        end = row["end_min"]
        credits = float(row.get("credits", 3.0))

        # The ranked list has one row per section; never schedule two
        # sections of the same course.
        if code is not None and code in selected_codes:
            continue

        # Credit cap (small epsilon tolerates float rounding)
        if total_credits + credits > pref_cred + 1e-6:
            continue

        if _conflicts_with_schedule(days, start, end, schedule_blocks):
            continue

        # No conflict → accept this course
        selected_rows.append(row)
        selected_codes.add(code)
        total_credits += credits
        for d in days:
            schedule_blocks.setdefault(d, []).append((start, end))

    if selected_rows:
        schedule = pd.DataFrame(selected_rows)
        schedule = schedule.sort_values("total_score", ascending=False, kind="mergesort")
        schedule = schedule.reset_index(drop=True)
    else:
        # Best-effort fallback: nothing satisfied the constraints, so hand back
        # the top-ranked rows (possibly none) in the regular schedule layout.
        schedule = ranked.head(n_courses).reset_index(drop=True)

    # "slot" column for front-end placement
    schedule["slot"] = range(1, len(schedule) + 1)

    # Keep the most useful columns for the UI
    return schedule[columns]


In [34]:
# Example: build a schedule of up to five courses for student "000001".
schedule_1 = get_schedule("000001", n_courses=5)
schedule_1

Unnamed: 0,slot,code,title,days,time_raw,credits,total_score
0,1,CS329,Computational Linguistics,"[M, W]",MW 2:30pm-3:45pm,3.0,0.354839
1,2,CS377,Database Systems,"[M, W]",MW 10am-11:15am,3.0,0.349677
2,3,CS171,Introduction to Computer Science II,"[M, W]",MW 11:30am-12:45pm,3.0,0.349677
3,4,CS463,Quantum Computing and Information,"[M, W]",MW 1pm-2:15pm,3.0,0.339355
4,5,CS370,Computer Science Practicum,"[T, Th]",TTh 4pm-5:15pm,3.0,0.308387
