In [1]:
from EduQuest.client import LocalOllamaClient
from EduQuest.similarity import CosineSimilarityCalculator
from EduQuest.recommend import EmbeddingRecommender

import pandas as pd
import re, json

In [2]:
# A single local Ollama client is configured with the three model names used
# in this notebook (text generation, recommendation, embeddings).
ollama_client = LocalOllamaClient(
    embedding_model="nomic-embed-text",
    rec_model="qwen2.5:7b-instruct",
    generator_model="mistral",
)

similarity_calculator = CosineSimilarityCalculator()

# Two separate recommender instances sharing the same client and similarity
# calculator: one is used below for the LLM-driven flow (`recommend`), the
# other for the FAISS similarity flow (`recommend_deterministic`).
Traditional_recModel, Similarity_recModel = (
    EmbeddingRecommender(ollama_client, similarity_calculator),
    EmbeddingRecommender(ollama_client, similarity_calculator),
)

In [3]:
# Load the cleaned course catalogue from the local pickle and preview it.
# NOTE: only unpickle files produced by a trusted source.
ori_df = pd.read_pickle("cleaned_nusmods.pkl")
ori_df.head()

Unnamed: 0,course,title,description,level,prefix,ori_course_code
0,ABM5001,Leadership in Biomedicine,Leadership is fundamental to the success of in...,5000,ABM,ABM5001
1,ABM5002,Advanced Biostatistics for Research,This course is served as a concept-based intro...,5000,ABM,ABM5002
2,ABM5003,Biomedical Innovation & Enterprise,This course will furnish students with a thoro...,5000,ABM,ABM5003
3,ABM5004,Capstone Project,This course encompasses research projects rele...,5000,ABM,ABM5004
4,ABM5101,Applied Immunology,Advanced immunological applications play impor...,5000,ABM,ABM5101


In [4]:
# Random peek at the catalogue. A fixed seed keeps the preview identical
# across "Restart Kernel -> Run All" executions.
ori_df.sample(8, random_state=42)

Unnamed: 0,course,title,description,level,prefix,ori_course_code
6633,PP5105,Cost Benefit Analysis in Public Policy,Cost Benefit Analysis (CBA) is one of the most...,5000,PP,PP5105
7155,RVX1004,"Cultural Expressions of Science, Technology an...",Since the 20th century we have witnessed the i...,1000,RVX,RVX1004
6908,PS3215,Equality and Justice,This course asks questions about the relations...,3000,PS,PS3215
2216,EC5101,Microeconomic Theory,The purpose of this course is to provide stude...,5000,EC,EC5101
4676,"LL4500 (Cross-listed as LL5500, LL6500, LLJ5500)",Recalibrating Multilateral Trading Regime,This course aims to provide the students with ...,4000,LL,LL4500
1071,CDE2001,Innovation and Design for Communities,This course is designed to facilitate communit...,2000,CDE,CDE2001
7174,SC2211,Medical Sociology,This course will examine the relationship betw...,2000,SC,SC2211
6904,PS2254,American Government and Politics,The American system has been viewed as a model...,2000,PS,PS2254


In [5]:
# Sanity check: look up a single well-known course by its original code.
ori_df.loc[ori_df["ori_course_code"].eq("CS2040")]

Unnamed: 0,course,title,description,level,prefix,ori_course_code
1709,CS2040,Data Structures and Algorithms,This course introduces students to the design ...,2000,CS,CS2040


**Run this cell only if the description embeddings have not been generated yet**

In [7]:
from tqdm import tqdm

# Assume emb_df is already loaded with columns ['course', 'title', 'description', 'level', ...]
new_embeddings = []

async def reembed_all():
    for desc in tqdm(ori_df["description"].tolist()):
        emb = await ollama_client.generate_embedding(desc)
        new_embeddings.append(emb)

await reembed_all()

ori_df['embedding'] = new_embeddings
ori_df = ori_df[["course", "title", "description", "embedding", "level", "prefix", "ori_course_code"]]

ori_df.to_pickle('embeddings_ollama.pkl')
print("✅ Saved new embeddings to embeddings_ollama.pkl")

ori_df.head()

100%|██████████| 9123/9123 [03:08<00:00, 48.35it/s]


✅ Saved new embeddings to embeddings_ollama.pkl


Unnamed: 0,course,title,description,embedding,level,prefix,ori_course_code
0,ABM5001,Leadership in Biomedicine,Leadership is fundamental to the success of in...,"[0.34031474590301514, 0.9926074147224426, -3.2...",5000,ABM,ABM5001
1,ABM5002,Advanced Biostatistics for Research,This course is served as a concept-based intro...,"[1.1152286529541016, 0.348115473985672, -3.109...",5000,ABM,ABM5002
2,ABM5003,Biomedical Innovation & Enterprise,This course will furnish students with a thoro...,"[-0.37872791290283203, 1.9817829132080078, -3....",5000,ABM,ABM5003
3,ABM5004,Capstone Project,This course encompasses research projects rele...,"[0.18893012404441833, 0.6445629000663757, -2.7...",5000,ABM,ABM5004
4,ABM5101,Applied Immunology,Advanced immunological applications play impor...,"[-0.0720033049583435, 0.4849713444709778, -3.2...",5000,ABM,ABM5101


**START HERE**

In [8]:
# First draft of free-text student queries.
# NOTE: q1 is reassigned in cell In [12] and q2-q5 in cell In [13] before any
# of them is passed to a recommender, so on a top-to-bottom run the values
# defined here are never used.
q1 = "I’m interested in artificial intelligence and how it can be applied in business decision-making. I want to learn both technical and practical aspects of machine learning"
q2 = "I want to understand how humans interact with technology — things like user behavior, data ethics, and human-computer interaction."
q3 = "I’m planning to start my own company in the future and want to take courses that help me understand entrepreneurship, product development, and marketing analytics."
q4 = "I’m fascinated by financial modeling and quantitative trading. I’d like courses that combine programming, statistics, and economics."
q5 = "I want to explore data-driven solutions for climate change and sustainability — especially courses related to environmental policy, analytics, or renewable energy."

In [9]:
# Query used further down for the LLM-based flow (Traditional_recModel.recommend).
# Its text is identical to q2 in the larger query list defined in cell In [13].
q0 = "I want to learn machine learning and deep learning for real-world applications."

In [10]:
# Distinct course levels and subject prefixes present in the catalogue,
# in order of first appearance.
all_levels = ori_df["level"].unique().tolist()
allprefixes = ori_df["prefix"].unique().tolist()

# Levels treated as undergraduate.
undergrad = [1000, 2000, 3000, 4000]

print(all_levels)
print(allprefixes)

[5000, 1000, 2000, 3000, 4000, 6000, 9000, 8000]
['ABM', 'AC', 'ACC', 'ACE', 'ADS', 'AH', 'AI', 'AII', 'AIS', 'ALS', 'AN', 'AR', 'ARD', 'ASP', 'AUD', 'AX', 'AY', 'BAA', 'BBP', 'BDC', 'BHD', 'BI', 'BIH', 'BIS', 'BL', 'BLD', 'BMA', 'BMC', 'BMD', 'BME', 'BMF', 'BMG', 'BMH', 'BMI', 'BMK', 'BMO', 'BMP', 'BMS', 'BMT', 'BMU', 'BMX', 'BN', 'BPM', 'BPS', 'BRP', 'BS', 'BSE', 'BSN', 'BSP', 'BSS', 'BST', 'BT', 'BWS', 'BX', 'BZD', 'CAH', 'CAS', 'CCS', 'CDE', 'CDM', 'CE', 'CEG', 'CFA', 'CFG', 'CG', 'CH', 'CHC', 'CIT', 'CL', 'CLC', 'CM', 'CMH', 'CN', 'COS', 'CP', 'CS', 'CSA', 'CZ', 'DAO', 'DBA', 'DE', 'DEP', 'DI', 'DL', 'DOS', 'DSA', 'DSC', 'DSE', 'DSN', 'DSS', 'DTK', 'DTS', 'DY', 'EBA', 'EC', 'ECA', 'EE', 'EEK', 'EG', 'EHB', 'EL', 'ELC', 'EM', 'EN', 'ENC', 'ENV', 'ES', 'ESE', 'ESP', 'ETP', 'EU', 'EX', 'FAS', 'FDP', 'FE', 'FIN', 'FSC', 'FSP', 'FST', 'FT', 'GE', 'GEA', 'GEC', 'GEH', 'GEI', 'GEK', 'GEM', 'GEN', 'GES', 'GESS', 'GET', 'GEX', 'GL', 'GMS', 'GS', 'GSA', 'GSN', 'GSS', 'HE', 'HI', 'HLE', 'HM'

**Similarity approach: build a FAISS index to calculate similarity and retrieve the top-k most similar courses**

In [11]:
# Step 1: Load courses into the recommendation model
# Step 1: hand the course catalogue to the similarity recommender.
# This must run before the index is built.
Similarity_recModel.load_courses(ori_df)

# Step 2: build the FAISS index (async call). The recorded output shows it
# first checks for existing embeddings and then reports a successful build.
# NOTE(review): presumably an existing `embedding` column in ori_df is reused — confirm.
await Similarity_recModel.build_faiss_index()

🔍 Checking for existing embeddings...
✅ FAISS index built successfully.


In [12]:
# Rebinds q1 (it was first defined in cell In [8]); this is the value passed
# to Similarity_recModel.recommend_deterministic below.
q1 = "I’d like to work in AI product management and understand both tech and business sides."


In [13]:
# Bank of sample student queries (q2-q23) covering a range of interest areas.
# Note that q2-q5 rebind the names first assigned in cell In [8].
q2 = "I want to learn machine learning and deep learning for real-world applications."
q3 = "I’m interested in data mining, big data systems, and analytics engineering."
q4 = "I plan to start a startup and want entrepreneurship and innovation modules."
q5 = "I want to learn business analytics and data-driven decision making."
q6 = "Interested in marketing analytics and consumer insights."
q7 = "I’m fascinated by quantitative finance and financial modeling."
q8 = "I want to learn econometrics and causal inference for policy/markets."
q9 = "I want to study sustainability, climate policy, and environmental analytics."
q10 = "Looking for urban planning, smart cities, and geospatial analytics."
q11 = "I’m interested in bioinformatics, computational biology, and genomics analytics."
q12 = "I want healthcare analytics and public health policy modules."
q13 = "I want HCI, UX design, and human factors courses."
q14 = "I’m curious about cognitive psychology and behavioural science."
q15 = "I want to improve academic writing, communication, and presentation skills."
q16 = "I’m looking for tech ethics, data privacy, and AI governance courses."
q17 = "I want to learn robotics, control systems, and perception."
q18 = "I’m into operations research, optimization, and logistics."
q19 = "I care about social policy, governance, and public administration."
q20 = "I want to explore media studies, society, and technology."
q21 = "I’d like energy systems, renewable technologies, and sustainability engineering."
q22 = "I’m interested in materials science, nanotech, and semiconductors."
q23 = "I’m considering a research track and want advanced seminars or capstone-style modules."
In [14]:
# Retrieve the courses closest to q1 from the FAISS-backed recommender.
# NOTE(review): top_k=11 is requested while the variable is named "top10" and
# the recorded output has 10 rows — confirm whether recommend_deterministic
# drops one hit or whether 11 is a typo.
# The Sim_results value returned here is overwritten by a later cell that
# rebuilds it from Sim_top10_courses["course"].
Sim_results, Sim_top10_courses = await Similarity_recModel.recommend_deterministic(q1, top_k=11)

In [15]:
# Display the retrieved courses together with their similarity scores.
Sim_top10_courses

Unnamed: 0,course,title,description,embedding,level,prefix,ori_course_code,similarity
0,DBA4813,AI Strategies in Business,Artificial Intelligence (AI) is becoming the n...,"[-0.7067713141441345, 1.785797119140625, -3.08...",4000,DBA,DBA4813,0.810678
1,BME5202,Opportunities in AI,This course is designed to explore the transfo...,"[-0.7825117707252502, 1.5607876777648926, -2.8...",5000,BME,BME5202,0.797669
2,MSI5002,Management of Technological Innovation with AI,Explores the transformative role of artificial...,"[-0.8056091070175171, 1.6745011806488037, -2.7...",5000,MSI,MSI5002,0.780681
3,BME5402,AI in HRM,Artificial intelligence (AI) and machine learn...,"[-0.7983016967773438, 0.8531933426856995, -3.0...",5000,BME,BME5402,0.766276
4,IS4246,Smart Systems and AI Governance,This course is designed for students who want ...,"[-1.0542210340499878, 1.47096848487854, -3.000...",4000,IS,IS4246,0.763267
5,IS2108,Full-stack Software Engineering for AI Solutio...,Modern business Artificial Intelligence (AI) s...,"[-0.6993556618690491, 1.0005601644515991, -2.6...",2000,IS,IS2108,0.761519
6,IS4108,AI Solutioning Capstone Project,Students are required to work (in small groups...,"[-0.7475270628929138, 1.2489631175994873, -2.7...",4000,IS,IS4108,0.760017
7,YSC2257,AI Projects and Case Studies,Learn about AI and machine learning from hands...,"[0.1369640827178955, 1.289811611175537, -2.427...",2000,YSC,YSC2257,0.7575
8,AI5201,Executive Mastery of Artificial Intelligence: ...,This course equips leaders with the essential ...,"[-0.693904459476471, 1.6468610763549805, -2.83...",5000,AI,AI5201,0.755185
9,EE5065,Tenets of AI in Robotics,Artificial intelligence (AI) is set to disrupt...,"[-0.2248518168926239, 1.0833220481872559, -2.8...",5000,EE,EE5065,0.753826


In [None]:
# Keep only the leading token of each `course` value, i.e. the text before the
# first space (drops suffixes such as "(Cross-listed as ...)").
Sim_results = [entry.split(" ")[0] for entry in Sim_top10_courses["course"]]

# One-column frame of bare course codes, used later to compare both approaches.
Sim_df = pd.DataFrame(Sim_results, columns=["course_code"])
Sim_df

In [None]:
# Plain-text list of the course codes returned by the similarity approach.
print(Sim_results)

**LLM approach: use the LLM to recommend courses directly from the user query**

In [None]:
# Load the catalogue with precomputed embeddings written by the embedding cell.
# NOTE: only unpickle files produced by a trusted source.
emb_df = pd.read_pickle("embeddings_ollama.pkl")

In [None]:
# Give the LLM-based recommender the embedded course catalogue.
Traditional_recModel.load_courses(emb_df)

In [None]:
# Ask the LLM-based recommender for courses matching q0; it returns the raw
# text answer and a DataFrame of courses.
# NOTE(review): the meaning of `type=1` is defined in EduQuest.recommend and is
# not visible here — confirm which recommendation mode it selects.
text_output, course_df = await Traditional_recModel.recommend(q0, type=1)
print(text_output)

In [None]:
# Display the DataFrame returned alongside the LLM text answer.
course_df

In [None]:
# Break the raw LLM answer into individual lines for parsing in the next cell.
tra_data = text_output.split('\n')
print(tra_data)

In [None]:
# Extract the course code from every 4th line of the LLM answer.
# NOTE(review): this assumes each recommended course occupies exactly four
# lines and starts with "<code>: <title>" — confirm against the prompt format.
course_name = []
for course_info in tra_data[::4]:
    # Drop any leading numbering/bullets/markup before the first letter.
    course_info = re.sub(r"^[^A-Za-z]*", "", course_info)
    code, separator, _title = course_info.partition(":")
    if not separator:
        # Blank or malformed line (e.g. the empty string left by a trailing
        # newline): skip it instead of failing on tuple unpacking.
        continue
    # Strip stray whitespace so the code matches `ori_course_code` exactly.
    course_name.append(code.strip())

print(course_name)

In [None]:
# One-column frame of the course codes parsed from the LLM answer.
tra_df = pd.DataFrame(course_name, columns=["course_name"])
tra_df

In [None]:
# Check that every code produced by the LLM exists in the catalogue
# (a False here means at least one parsed code is not a known course).
all(tra_df["course_name"].isin(ori_df["ori_course_code"]))  # should be True

In [None]:
# List the parsed codes that are NOT present in the catalogue (empty when the
# check above is True).
tra_df[~tra_df["course_name"].isin(ori_df["ori_course_code"])]

In [None]:
# Courses recommended by both approaches: inner join of the LLM codes
# (tra_df.course_name) with the similarity codes (Sim_df.course_code).
same_courses = tra_df.merge(
    Sim_df,
    how="inner",
    left_on="course_name",
    right_on="course_code",
)
print(same_courses)
print(f"Same courses: {len(same_courses)}")