In [None]:
import pandas as pd
import numpy as np

from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv('/content/drive/MyDrive/student_course_data.csv')
columns = ["StudentID"] + [f"Course{i}" for i in range(101, 121)]
df = pd.DataFrame(df, columns=columns)
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,StudentID,Course101,Course102,Course103,Course104,Course105,Course106,Course107,Course108,Course109,...,Course111,Course112,Course113,Course114,Course115,Course116,Course117,Course118,Course119,Course120
0,1,0,0,0,0,1,1,0,1,1,...,0,1,0,1,1,0,1,0,0,1
1,2,1,1,1,1,0,1,1,1,1,...,0,1,0,1,0,0,1,0,0,0
2,3,1,0,0,1,1,0,1,1,1,...,1,1,1,0,1,0,0,1,1,1
3,4,1,1,1,1,1,1,0,1,1,...,0,1,0,0,1,1,1,0,0,1
4,5,1,0,1,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0


# Jaccard Similarity kNN

User input & to-be-matched historical student course registration data contain:

*   courses previously taken (binary)
*   major
*   difficulty level preference

In [None]:
df = pd.read_csv('/content/drive/MyDrive/student_course_data.csv')
columns = ["StudentID"] + [f"Course{i}" for i in range(101, 121)]
df = pd.DataFrame(df, columns=columns)

# Sample student X
student_X = np.array([41] + [1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + ['Math', 'easy', 'Data Science'])
# Simulation data generation
majors = ['Math', 'CS', 'ECE']
df['major'] = np.random.choice(majors, size=len(df))
difficulty = ['easy', 'medium', 'hard']
df['difficulty'] = np.random.choice(difficulty, size=len(df))
career = ['Data Science', 'Software Engineering', 'AI/ML']
df['career'] = np.random.choice(career, size=len(df))

from scipy.spatial.distance import jaccard

NUM_COURSES = 20
NUM_NONBINARY_FEATURES = len(student_X) - 1 - NUM_COURSES
NUM_NEIGHBORS = 8
# Compute Jaccard similarity for all students
# (Jaccard similarity definition: https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.jaccard.html)
similarities = []
for _, row in df.iterrows():
    student_courses = row[1:-NUM_NONBINARY_FEATURES].values
    similarity = 1 - jaccard(student_X[1:-NUM_NONBINARY_FEATURES], student_courses)
    # major similarity bonus
    if student_X[-1] == row.iloc[-1]:
        similarity *= 1.1
    # difficulty preference similarity bonus
    if student_X[-2] == row.iloc[-2]:
        similarity *= 1.5
    # career bonus
    if student_X[-3] == row.iloc[-3]:
        similarity *= 1.8
    similarities.append((row["StudentID"], similarity))

# Sort by similarity and get top-NUM_NEIGHBORS similar students
similar_students = sorted(similarities, key=lambda x: x[1], reverse=True)[:NUM_NEIGHBORS]
top_student_ids = [s[0] for s in similar_students]
# print(f"Top {NUM_NEIGHBORS} similar students to Student X:")
# print(similar_students)

# Aggregate course enrollments among top-NUM_NEIGHBORS students
top_students_data = df[df["StudentID"].isin(top_student_ids)].iloc[:, 1:-NUM_NONBINARY_FEATURES].sum()
# Exclude courses already taken by student X
for i in range(len(top_students_data)):
    if student_X[i + 1] == 1:
        top_students_data[i] = 0

recommended_courses = top_students_data.sort_values(ascending=False).head()
print("\nTop recommended courses for Student X / Course enrollments among top N similar students:")
print(recommended_courses)


Top recommended courses for Student X / Course enrollments among top N similar students:
Course117    7
Course101    6
Course104    6
Course105    6
Course108    6
dtype: int64


# Non-Negative Matrix Factorization (results are very similar to kNN; harder to debug/customize)

In [None]:
from sklearn.decomposition import NMF

# Need to run the data again
df = pd.read_csv('/content/drive/MyDrive/student_course_data.csv')
columns = ["StudentID"] + [f"Course{i}" for i in range(101, 121)]
df = pd.DataFrame(df, columns=columns)
# 41 is student_X's student ID
student_X = np.array([41] + [1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# Extract student-course matrix (excluding StudentID)
student_course_matrix = df.iloc[:, 1:].values

# Apply Non-Negative Matrix Factorization (NMF)
nmf = NMF(n_components=3, init='random', random_state=42)
W = nmf.fit_transform(student_course_matrix)
H = nmf.components_

# Predict course enrollment probabilities
predicted_matrix = np.dot(W, H)
predicted_df = pd.DataFrame(predicted_matrix, columns=columns[1:])

# Get predicted scores for student X
student_X_index = 0  # Assuming student X is at index 0
predicted_scores = predicted_df.iloc[student_X_index]

# Format output as a DataFrame
recommended_df = pd.DataFrame({
    "Course Recommended": recommended_courses.index,
    "Predicted Relevance Score": recommended_courses.values
})

for i in recommended_courses.index:
  if student_X[int(i[-3:]) - 100] == 1:
    recommended_df = recommended_df.drop(recommended_df[recommended_df["Course Recommended"] == i].index)

# Output recommended courses and their rankings
print("Top 10 recommended courses for Student X:")
print(recommended_df)

Top 10 recommended courses for Student X:
  Course Recommended  Predicted Relevance Score
0          Course117                          7
2          Course104                          6
3          Course105                          6
4          Course108                          6


## Collaborative Filtering with Embeddings (Week of Feb 5)

In [8]:
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

def get_text_embedding(text):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased').to('cuda')

    encoding = tokenizer.batch_encode_plus([text],
      padding=True,              # Pad to the maximum sequence length
      truncation=True,           # Truncate to the maximum sequence length if necessary
      return_tensors='pt',      # Return PyTorch tensors
      add_special_tokens=True    # Add special tokens CLS and SEP
    )

    input_ids = encoding['input_ids'].to('cuda')  # Token IDs
    attention_mask = encoding['attention_mask'].to('cuda')  # Attention mask

    with torch.no_grad():
      outputs = model(input_ids, attention_mask=attention_mask)
      word_embeddings = outputs.last_hidden_state
    return word_embeddings.mean(dim=1)[0].cpu()

econ = get_text_embedding("Biomedical Engineering")
english = get_text_embedding("Medical Engineering")
cosine_similarity([econ], [english])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

array([[0.9270652]], dtype=float32)

In [9]:
# Define the number of students
num_students = 200
num_courses = 100
max_courses_per_student = 30
num_concentrations_per_major = 4
max_knowledge_area_of_interest = 4

In [10]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

# Define categories
majors = [
    "Economics", "Robotics", "Statistics", "Art", "Computer Science",
    "Business Analytics", "Design", "Information Systems",
    "Public Policy", "Mechanical Engineering", "Electrical Engineering",
    "Chemical Engineering", "Architecture", "Drama"
]

levels_of_study = ["Undergraduate", "Graduate", "PhD"]

# Define concentrations per major
concentrations = {major: [f"Concentration {i+1}" for i in range(random.randint(1, num_concentrations_per_major))] for major in majors}

# Define career directions, hobbies, and knowledge areas
career_directions = ["Research/Academia", "Industry", "Entrepreneurship", "Consulting", "Public Service"]
hobbies = ["Reading", "Gaming", "Sports", "Music", "Art", "Coding", "Writing", "Photography", "Traveling", "Cooking"]
knowledge_areas = ["AI", "Finance", "Health Tech", "Cybersecurity", "Sustainability", "Robotics", "Data Science", "Economics", "Education", "Philosophy"]

# Create a large dictionary
ALL_EMBEDDINGS = {}
for major in majors:
  ALL_EMBEDDINGS[major] = get_text_embedding(major)
for career in career_directions:
  ALL_EMBEDDINGS[career] = get_text_embedding(career)
for hobby in hobbies:
  ALL_EMBEDDINGS[hobby] = get_text_embedding(hobby)
for knowledge in knowledge_areas:
  ALL_EMBEDDINGS[knowledge] = get_text_embedding(knowledge)

In [23]:
# Generate student data
students = []
for _ in range(num_students):
    student = {}
    student["Major"] = major = random.choice(majors)
    student["Level of Study"] = random.choice(levels_of_study)
    student["Concentration"] = random.choice(concentrations[major])
    student["Career Directions"] = random.sample(career_directions, k = random.randint(1, 4))
    student["Hobbies"] = random.sample(hobbies, k = random.randint(1, 4))
    student["Knowledge Areas"] = random.sample(knowledge_areas, k = random.randint(1, 4))
    student["Preference for Course Difficulty"] = random.randint(0, 1)

    # CBF and CF combining features
    student['CF Weight'] = random.uniform(0, 1)
    student['CBF Weight'] = 1 - student['CF Weight']
      # 0.6 is set arbitrarily to generate dummy data
    student['CBF Course Competence Weight'] = random.uniform(0, 0.6 * student['CBF Weight'])
    student['CBF Professor Rating Weight'] = random.uniform(0, 0.6 * student['CBF Weight'])
    student['CBF Knowledge Area Weight'] = student['CBF Weight'] - student['CBF Course Competence Weight'] - student['CBF Professor Rating Weight']

    # Ensure P + R + W = 1
    P, R = np.random.dirichlet(np.ones(2), size=1)[0]
    student["Weight for courses taken by others"] = P
    student["Weight for non-course factors"] = R
    student["Weight for rigor commitment"] = 1 - (P + R) # = W

    # Generate class scores (NaN for courses not taken)
    taken_courses = random.sample(range(num_courses), k=random.randint(5, max_courses_per_student))
    scores = {f"Course_{i+1}": (np.random.uniform(50, 100) if i in taken_courses else np.nan) for i in range(num_courses)}

    # Harder courses (divisible by 5) should have lower scores (is set arbitrarily to generate dummy data)
    for i in taken_courses:
        if (i + 1) % 5 == 0:
            scores[f"Course_{i+1}"] = np.random.uniform(40, 80)
            if student["Preference for Course Difficulty"] == 1:
                scores[f"Course_{i+1}"] -= 5  # Penalize easy-preference students

    student.update(scores)
    students.append(student)

# Convert to DataFrame
students_df = pd.DataFrame(students)

# Generate a test data instance with only 5 classes taken
test_student = random.choice(students).copy()
taken_courses = random.sample(range(num_courses), k=5)
test_student.update({f"Course_{i+1}": np.random.uniform(50, 100) if i in taken_courses else np.nan for i in range(num_courses)})
test_student_df = pd.DataFrame([test_student])

# Save data to CSV
students_df.to_csv("students_data.csv", index=False)
test_student_df.to_csv("test_student_data.csv", index=False)

# display(students_df.head())
display(test_student_df)


Unnamed: 0,Major,Level of Study,Concentration,Career Directions,Hobbies,Knowledge Areas,Preference for Course Difficulty,CF Weight,CBF Weight,CBF Course Competence Weight,...,Course_91,Course_92,Course_93,Course_94,Course_95,Course_96,Course_97,Course_98,Course_99,Course_100
0,Business Analytics,Undergraduate,Concentration 2,"[Industry, Entrepreneurship]","[Music, Traveling, Gaming]",[Robotics],0,0.006366,0.993634,0.481737,...,,,,,,,,,,


In [24]:
from re import L
# Collaborative Filtering Implementation

from gensim.models import Word2Vec
import gensim
from nltk.tokenize import sent_tokenize, word_tokenize

# Compute similarity using Jaccard for courses and Cosine for text-based features
def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0

def compute_similarity(test_student, students_df, NUM_OF_NEIGHTBORS):
    similarities = []
    test_courses = set([course for course in test_student.index if "Course_" in course and not np.isnan(test_student[course])])

    for i, student in students_df.iterrows():
        # For numerical valued features, use jaccard-sim
        student_courses = set([course for course in student.index if "Course_" in course and not np.isnan(student[course])])
        courses_taken_jaccard_sim = jaccard_similarity(test_courses, student_courses)
        # For text-based features, use cosine-sim
        textual_feature_sim = 0
        categorical_features = ["Major", "Career Directions", "Hobbies", "Knowledge Areas"]
        for j in categorical_features:
          if type(students_df[j][i]) != list:
              data_text_embeddings = ALL_EMBEDDINGS[students_df[j][i]]
          else:
              data_text_embeddings = np.mean(np.array([ALL_EMBEDDINGS[x] for x in students_df[j][i]]),axis=0)
              # data_text_embeddings = get_text_embedding(" ".join(students_df[j][i]))
          if type(test_student[j]) != list:
              test_text_embeddings = ALL_EMBEDDINGS[test_student[j]]
          else:
              test_text_embeddings = np.mean(np.array([ALL_EMBEDDINGS[x] for x in test_student[j]]),axis=0)
          textual_feature_sim += cosine_similarity([data_text_embeddings], [test_text_embeddings])[0][0]

        # Course difficulty features
        inverse_pref_course_difficulty_sim = 1 if students_df["Preference for Course Difficulty"][i] == test_student["Preference for Course Difficulty"] else 0

        # Overall similarity
        overall_sim = 0
        overall_sim += test_student['Weight for courses taken by others'] * courses_taken_jaccard_sim
        overall_sim += test_student['Weight for non-course factors'] * textual_feature_sim
        overall_sim += test_student['Weight for rigor commitment'] * inverse_pref_course_difficulty_sim

        similarities.append((i, overall_sim))

    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:NUM_OF_NEIGHTBORS]  # Top NUM_OF_NEIGHTBORS similar students

# Find similar students and recommend courses
NUM_OF_NEIGHTBORS = 5
similar_students = compute_similarity(test_student_df.iloc[0], students_df, NUM_OF_NEIGHTBORS)
recommended_courses = set()

for student_idx, _ in similar_students:
    # Get the student’s row as a Series
    student_row = students_df.iloc[student_idx]
    # Extract courses the student has taken (not NaN)
    student_courses = set()
    for course in students_df.columns:
        if "Course_" in course and course in student_row:
            course_value = student_row[course]
            if not pd.isna(course_value):
                student_courses.add(course)

test_student_courses = set(test_student_df.columns)

CF_student_courses = list(student_courses)
test_student_courses = list(test_student_courses)
for i in CF_student_courses:
  if i in test_student_courses:
    CF_student_courses.remove(i)

print(f"Final recommended courses: {CF_student_courses}")

Final recommended courses: ['Course_78', 'Course_35', 'Course_6', 'Course_11', 'Course_58']


## Combining CF and CBF

In [25]:
recommended_courses_course_competence_sim = ['Course_12', 'Course_36', 'Course_40', 'Course_56', 'Course_72', 'Course_95', 'Course_14', 'Course_23', 'Course_65', 'Course_81', 'Course_91', 'Course_33', 'Course_9']

recommended_courses_prof_rating_sim = ['Course_5', 'Course_26', 'Course_47', 'Course_50', 'Course_84', 'Course_88', 'Course_39', 'Course_78', 'Course_92', 'Course_14', 'Course_60', 'Course_29', 'Course_74']

recommended_courses_knowledge_area_sim = ['Course_85', 'Course_3', 'Course_56', 'Course_77', 'Course_14', 'Course_11', 'Course_42', 'Course_30', 'Course_99', 'Course_66', 'Course_53', 'Course_7', 'Course_21']

relevance_score = {}
# CBF
# course competence similarity matrix
for course in recommended_courses_course_competence_sim:
  relevance_score[course] = test_student['CBF Course Competence Weight']
# professor rating similarity matrix
for course in recommended_courses_prof_rating_sim:
  if course in relevance_score:
    relevance_score[course] += test_student['CBF Professor Rating Weight']
  else:
    relevance_score[course] = test_student['CBF Professor Rating Weight']
# knowledge area similarity matrix
for course in recommended_courses_knowledge_area_sim:
  if course in relevance_score:
    relevance_score[course] += test_student['CBF Knowledge Area Weight']
  else:
    relevance_score[course] = test_student['CBF Knowledge Area Weight']
# Note, test_student['CBF Knowledge Area Weight'] + test_student['CBF Course Competence Weight'] + test_student['CBF Professor Rating Weight'] = test_student['CBF Weight']

# CF
for course in CF_student_courses:
  if course in relevance_score:
    relevance_score[course] += test_student['CF Weight']
  else:
    relevance_score[course] = test_student['CF Weight']

In [44]:
# display(relevance_score)
NUM_COURSES_RECOMMENDING = 10
sorted_relevance_score = sorted(relevance_score.items(), key=lambda item: item[1], reverse = True)[:NUM_COURSES_RECOMMENDING]
for i in sorted_relevance_score:
  print(f"{i[0]}, Relevance Score: {i[1]}")

Course_14, Relevance Score: 0.9936344691076607
Course_56, Relevance Score: 0.9387348173478346
Course_12, Relevance Score: 0.48173725358498276
Course_36, Relevance Score: 0.48173725358498276
Course_40, Relevance Score: 0.48173725358498276
Course_72, Relevance Score: 0.48173725358498276
Course_95, Relevance Score: 0.48173725358498276
Course_23, Relevance Score: 0.48173725358498276
Course_65, Relevance Score: 0.48173725358498276
Course_81, Relevance Score: 0.48173725358498276
