In [None]:
import pandas as pd
import numpy as np

from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv('/content/drive/MyDrive/student_course_data.csv')
columns = ["StudentID"] + [f"Course{i}" for i in range(101, 121)]
df = pd.DataFrame(df, columns=columns)
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,StudentID,Course101,Course102,Course103,Course104,Course105,Course106,Course107,Course108,Course109,...,Course111,Course112,Course113,Course114,Course115,Course116,Course117,Course118,Course119,Course120
0,1,0,0,0,0,1,1,0,1,1,...,0,1,0,1,1,0,1,0,0,1
1,2,1,1,1,1,0,1,1,1,1,...,0,1,0,1,0,0,1,0,0,0
2,3,1,0,0,1,1,0,1,1,1,...,1,1,1,0,1,0,0,1,1,1
3,4,1,1,1,1,1,1,0,1,1,...,0,1,0,0,1,1,1,0,0,1
4,5,1,0,1,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0


# Jaccard Similarity kNN

User input & to-be-matched historical student course registration data contain:

*   courses previously taken (binary)
*   major
*   difficulty level preference

In [None]:
df = pd.read_csv('/content/drive/MyDrive/student_course_data.csv')
columns = ["StudentID"] + [f"Course{i}" for i in range(101, 121)]
df = pd.DataFrame(df, columns=columns)

# Sample student X
student_X = np.array([41] + [1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + ['Math', 'easy', 'Data Science'])
# Simulation data generation
majors = ['Math', 'CS', 'ECE']
df['major'] = np.random.choice(majors, size=len(df))
difficulty = ['easy', 'medium', 'hard']
df['difficulty'] = np.random.choice(difficulty, size=len(df))
career = ['Data Science', 'Software Engineering', 'AI/ML']
df['career'] = np.random.choice(career, size=len(df))

from scipy.spatial.distance import jaccard

NUM_COURSES = 20
NUM_NONBINARY_FEATURES = len(student_X) - 1 - NUM_COURSES
NUM_NEIGHBORS = 8
# Compute Jaccard similarity for all students
# (Jaccard similarity definition: https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.jaccard.html)
similarities = []
for _, row in df.iterrows():
    student_courses = row[1:-NUM_NONBINARY_FEATURES].values
    similarity = 1 - jaccard(student_X[1:-NUM_NONBINARY_FEATURES], student_courses)
    # major similarity bonus
    if student_X[-1] == row.iloc[-1]:
        similarity *= 1.1
    # difficulty preference similarity bonus
    if student_X[-2] == row.iloc[-2]:
        similarity *= 1.5
    # career bonus
    if student_X[-3] == row.iloc[-3]:
        similarity *= 1.8
    similarities.append((row["StudentID"], similarity))

# Sort by similarity and get top-NUM_NEIGHBORS similar students
similar_students = sorted(similarities, key=lambda x: x[1], reverse=True)[:NUM_NEIGHBORS]
top_student_ids = [s[0] for s in similar_students]
# print(f"Top {NUM_NEIGHBORS} similar students to Student X:")
# print(similar_students)

# Aggregate course enrollments among top-NUM_NEIGHBORS students
top_students_data = df[df["StudentID"].isin(top_student_ids)].iloc[:, 1:-NUM_NONBINARY_FEATURES].sum()
# Exclude courses already taken by student X
for i in range(len(top_students_data)):
    if student_X[i + 1] == 1:
        top_students_data[i] = 0

recommended_courses = top_students_data.sort_values(ascending=False).head()
print("\nTop recommended courses for Student X / Course enrollments among top N similar students:")
print(recommended_courses)


Top recommended courses for Student X / Course enrollments among top N similar students:
Course117    7
Course101    6
Course104    6
Course105    6
Course108    6
dtype: int64


# Non-Negative Matrix Factorization (results are very similar to kNN; harder to debug/customize)

In [None]:
from sklearn.decomposition import NMF

# Need to run the data again
df = pd.read_csv('/content/drive/MyDrive/student_course_data.csv')
columns = ["StudentID"] + [f"Course{i}" for i in range(101, 121)]
df = pd.DataFrame(df, columns=columns)
# 41 is student_X's student ID
student_X = np.array([41] + [1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# Extract student-course matrix (excluding StudentID)
student_course_matrix = df.iloc[:, 1:].values

# Apply Non-Negative Matrix Factorization (NMF)
nmf = NMF(n_components=3, init='random', random_state=42)
W = nmf.fit_transform(student_course_matrix)
H = nmf.components_

# Predict course enrollment probabilities
predicted_matrix = np.dot(W, H)
predicted_df = pd.DataFrame(predicted_matrix, columns=columns[1:])

# Get predicted scores for student X
student_X_index = 0  # Assuming student X is at index 0
predicted_scores = predicted_df.iloc[student_X_index]

# Format output as a DataFrame
recommended_df = pd.DataFrame({
    "Course Recommended": recommended_courses.index,
    "Predicted Relevance Score": recommended_courses.values
})

for i in recommended_courses.index:
  if student_X[int(i[-3:]) - 100] == 1:
    recommended_df = recommended_df.drop(recommended_df[recommended_df["Course Recommended"] == i].index)

# Output recommended courses and their rankings
print("Top 10 recommended courses for Student X:")
print(recommended_df)

Top 10 recommended courses for Student X:
  Course Recommended  Predicted Relevance Score
0          Course117                          7
2          Course104                          6
3          Course105                          6
4          Course108                          6


# Collaborative Filtering with Embeddings (Week of Feb 5)

## All Student Data Generation (with embedding)

In [None]:
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

def get_text_embedding(text):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased').to('cuda')

    encoding = tokenizer.batch_encode_plus([text],
      padding=True,              # Pad to the maximum sequence length
      truncation=True,           # Truncate to the maximum sequence length if necessary
      return_tensors='pt',      # Return PyTorch tensors
      add_special_tokens=True    # Add special tokens CLS and SEP
    )

    input_ids = encoding['input_ids'].to('cuda')  # Token IDs
    attention_mask = encoding['attention_mask'].to('cuda')  # Attention mask

    with torch.no_grad():
      outputs = model(input_ids, attention_mask=attention_mask)
      word_embeddings = outputs.last_hidden_state
    return word_embeddings.mean(dim=1)[0].cpu()

# econ = get_text_embedding("Biomedical Engineering")
# english = get_text_embedding("Medical Engineering")
# cosine_similarity([econ], [english])

In [None]:
# Define the number of students
num_students = 200
num_courses = 100
max_courses_per_student = 40
min_courses_per_student = 30
num_concentrations_per_major = 4
max_knowledge_area_of_interest = 4

In [None]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics.pairwise import cosine_similarity

import torch
print(torch.cuda.is_available()) # Answer should be YES, else do not proceed
device = torch.device("cuda:0")

# Define categories
majors = [
    "Economics", "Robotics", "Statistics", "Art", "Computer Science",
    "Business Analytics", "Design", "Information Systems",
    "Public Policy", "Mechanical Engineering", "Electrical Engineering",
    "Chemical Engineering", "Architecture", "Drama"
]

levels_of_study = ["Undergraduate", "Graduate", "PhD"]

# Define concentrations per major
concentrations = {major: [f"Concentration {i+1}" for i in range(random.randint(1, num_concentrations_per_major))] for major in majors}

# Define career directions, hobbies, and knowledge areas
career_directions = ["Research/Academia", "Industry", "Entrepreneurship", "Consulting", "Public Service"]
hobbies = ["Reading", "Gaming", "Sports", "Music", "Art", "Coding", "Writing", "Photography", "Traveling", "Cooking"]
knowledge_areas = ["AI", "Finance", "Health Tech", "Cybersecurity", "Sustainability", "Robotics", "Data Science", "Economics", "Education", "Philosophy"]

# Create a large dictionary
ALL_EMBEDDINGS = {}
for major in majors:
  ALL_EMBEDDINGS[major] = get_text_embedding(major)
for career in career_directions:
  ALL_EMBEDDINGS[career] = get_text_embedding(career)
for hobby in hobbies:
  ALL_EMBEDDINGS[hobby] = get_text_embedding(hobby)
for knowledge in knowledge_areas:
  ALL_EMBEDDINGS[knowledge] = get_text_embedding(knowledge)

True


In [None]:
# Generate student data
students = []
for _ in range(num_students):
    student = {}
    student["Major"] = major = random.choice(majors)
    student["Level of Study"] = random.choice(levels_of_study)
    student["Concentration"] = random.choice(concentrations[major])
    student["Career Directions"] = random.choice(career_directions)
    student["Hobbies"] = random.sample(hobbies, k = random.randint(1, 4))
    student["Knowledge Areas"] = random.sample(knowledge_areas, k = random.randint(1, 4))
    student["Preference for Course Difficulty"] = random.randint(0, 1)

    # CBF and CF combining features
    student['CF Weight'] = random.uniform(0, 1)
    student['CBF Weight'] = 1 - student['CF Weight']
      # 0.6 is set arbitrarily to generate dummy data
    student['CBF Course Competence Weight'] = random.uniform(0, 0.6 * student['CBF Weight'])
    student['CBF Professor Rating Weight'] = random.uniform(0, 0.6 * student['CBF Weight'])
    student['CBF Knowledge Area Weight'] = student['CBF Weight'] - student['CBF Course Competence Weight'] - student['CBF Professor Rating Weight']

    # Ensure P + R + W = 1
    P, R = np.random.dirichlet(np.ones(2), size=1)[0]
    student["Weight for courses taken by others"] = P
    student["Weight for non-course factors"] = R
    student["Weight for rigor commitment"] = 1 - (P + R) # = W

    # Generate class scores (NaN for courses not taken)
    taken_courses = random.sample(range(num_courses), k=random.randint(min_courses_per_student, max_courses_per_student))
    scores = {f"Course_{i+1}": (np.random.uniform(50, 100) if i in taken_courses else np.nan) for i in range(num_courses)}
    # Harder courses (divisible by 5) should have lower scores (is set arbitrarily to generate dummy data)
    for i in taken_courses:
        if (i + 1) % 5 == 0:
            scores[f"Course_{i+1}"] = np.random.uniform(40, 80)
            if student["Preference for Course Difficulty"] == 1:
                scores[f"Course_{i+1}"] -= 5  # Penalize easy-preference students

    student.update(scores)
    students.append(student)

# Convert to DataFrame
students_df = pd.DataFrame(students)

students_df.head(100)

Unnamed: 0,Major,Level of Study,Concentration,Career Directions,Hobbies,Knowledge Areas,Preference for Course Difficulty,CF Weight,CBF Weight,CBF Course Competence Weight,...,Course_91,Course_92,Course_93,Course_94,Course_95,Course_96,Course_97,Course_98,Course_99,Course_100
0,Chemical Engineering,PhD,Concentration 1,Industry,"[Reading, Traveling]","[Philosophy, Education, Health Tech, Robotics]",1,0.400193,0.599807,0.183620,...,85.417670,90.082949,77.679025,,,,71.396566,,,
1,Art,PhD,Concentration 1,Public Service,"[Gaming, Coding, Music]",[AI],0,0.911953,0.088047,0.009661,...,62.253736,,90.220411,,79.273254,77.161268,89.789623,80.65089,85.24539,
2,Statistics,Graduate,Concentration 2,Industry,[Reading],"[AI, Robotics, Finance, Health Tech]",0,0.317201,0.682799,0.209464,...,,,,,,86.459490,,,,
3,Computer Science,Undergraduate,Concentration 2,Entrepreneurship,"[Cooking, Sports, Reading]",[Finance],0,0.769483,0.230517,0.014353,...,,,57.524423,62.881768,78.313898,,57.181478,,,73.544662
4,Computer Science,Undergraduate,Concentration 1,Research/Academia,[Writing],"[AI, Robotics, Health Tech]",1,0.299647,0.700353,0.310842,...,,,,61.233228,41.959225,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Statistics,Undergraduate,Concentration 1,Entrepreneurship,"[Reading, Coding, Traveling]","[Data Science, Finance]",0,0.636674,0.363326,0.198732,...,,52.483923,58.555989,79.923737,,76.110045,,,,54.534902
96,Electrical Engineering,Undergraduate,Concentration 2,Entrepreneurship,"[Gaming, Sports, Traveling, Coding]","[Economics, Sustainability, Robotics, Finance]",0,0.241484,0.758516,0.349322,...,,,52.333039,,,55.750966,,,,
97,Robotics,Undergraduate,Concentration 1,Entrepreneurship,"[Cooking, Gaming, Art, Music]","[Robotics, Education, Data Science, Finance]",0,0.576872,0.423128,0.081036,...,,,62.089504,,,,90.427882,,,47.597083
98,Art,PhD,Concentration 1,Consulting,"[Gaming, Writing, Art]","[Data Science, Cybersecurity, AI, Finance]",0,0.179847,0.820153,0.214302,...,,,70.548917,,,,,,,


## Test Student Data Generation

In [None]:
# Generate a test data instance with only 5 classes taken
test_student = random.choice(students).copy()
taken_courses = random.sample(range(num_courses), k=5)
test_student.update({f"Course_{i+1}": np.random.uniform(50, 100) if i in taken_courses else np.nan for i in range(num_courses)})
test_student_df = pd.DataFrame([test_student])

# Save data to CSV
students_df.to_csv("students_data.csv", index=False)
test_student_df.to_csv("test_student_data.csv", index=False)

# display(students_df.head())
display(test_student_df)

Unnamed: 0,Major,Level of Study,Concentration,Career Directions,Hobbies,Knowledge Areas,Preference for Course Difficulty,CF Weight,CBF Weight,CBF Course Competence Weight,...,Course_91,Course_92,Course_93,Course_94,Course_95,Course_96,Course_97,Course_98,Course_99,Course_100
0,Drama,Undergraduate,Concentration 1,Industry,"[Writing, Art, Photography, Music]",[Philosophy],0,0.796483,0.203517,0.048472,...,,,80.68882,,,,,,,


## Compute Collaborative Filtering Similarity

In [None]:
def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0

def compute_similarity(test_student, students_df, NUM_OF_NEIGHTBORS):
    similarities = []
    test_courses = set([course for course in test_student.index if "Course_" in course and not np.isnan(test_student[course])])

    for i, student in students_df.iterrows():
        # For numerical valued features, use jaccard-sim
        student_courses = set([course for course in student.index if "Course_" in course and not np.isnan(student[course])])
        courses_taken_jaccard_sim = jaccard_similarity(test_courses, student_courses)
        # For text-based features, use cosine-sim
        textual_feature_sim = 0
        categorical_features = ["Major", "Career Directions", "Hobbies", "Knowledge Areas"]
        for j in categorical_features:
          if type(students_df[j][i]) != list:
              data_text_embeddings = ALL_EMBEDDINGS[students_df[j][i]]
          else:
              data_text_embeddings = np.mean(np.array([ALL_EMBEDDINGS[x] for x in students_df[j][i]]),axis=0)
              # data_text_embeddings = get_text_embedding(" ".join(students_df[j][i]))
          if type(test_student[j]) != list:
              test_text_embeddings = ALL_EMBEDDINGS[test_student[j]]
          else:
              test_text_embeddings = np.mean(np.array([ALL_EMBEDDINGS[x] for x in test_student[j]]),axis=0)
          textual_feature_sim += cosine_similarity([data_text_embeddings], [test_text_embeddings])[0][0]

        # Course difficulty features
        inverse_pref_course_difficulty_sim = 1 if students_df["Preference for Course Difficulty"][i] == test_student["Preference for Course Difficulty"] else 0

        # Overall similarity
        overall_sim = 0
        overall_sim += test_student['Weight for courses taken by others'] * courses_taken_jaccard_sim
        overall_sim += test_student['Weight for non-course factors'] * textual_feature_sim
        overall_sim += test_student['Weight for rigor commitment'] * inverse_pref_course_difficulty_sim

        similarities.append((i, overall_sim))

    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:NUM_OF_NEIGHTBORS]  # Top NUM_OF_NEIGHTBORS similar students

# Find similar students and recommend courses
NUM_OF_NEIGHTBORS = 20
similar_students = compute_similarity(test_student_df.iloc[0], students_df, NUM_OF_NEIGHTBORS)
recommended_courses = set()

for student_idx, _ in similar_students:
    # Get the student’s row as a Series
    student_row = students_df.iloc[student_idx]
    # Extract courses the student has taken (not NaN)
    student_courses = set()
    for course in students_df.columns:
        if "Course_" in course and course in student_row:
            course_value = student_row[course]
            if not pd.isna(course_value):
                student_courses.add(course)

test_student_courses = set(test_student_df.columns)

recommended_courses_from_CF = list(student_courses)
test_student_courses = list(test_student_courses)
for i in recommended_courses_from_CF:
  if i in test_student_courses:
    recommended_courses_from_CF.remove(i)

print(f"Final recommended courses: {recommended_courses_from_CF}")

Final recommended courses: ['Course_85', 'Course_1', 'Course_52', 'Course_89', 'Course_60', 'Course_37', 'Course_72', 'Course_84', 'Course_61', 'Course_46', 'Course_80', 'Course_39', 'Course_97', 'Course_75', 'Course_9', 'Course_94', 'Course_7', 'Course_93', 'Course_10']


# CBF

## Data Setup
Course information:

*   Professors involved in the course
*   Course competencies/skills
*   Course knowledge area
*   Theoretical and practical contents of the course



In [None]:
class CourseDataGenerator:
    def __init__(self, num_courses=100, num_professors=100, num_competences=12, num_knowledge_areas=8):
        self.num_courses = num_courses
        self.num_professors = num_professors
        self.num_competences = num_competences
        self.num_knowledge_areas = num_knowledge_areas

        self.schools = [
            "College of Fine Arts",
            "Tepper School of Business",
            "School of Computer Science",
            "College of Engineering",
            "Heinz College of Information Systems and Public Policy",
            "Dietrich College of Humanities and Social Sciences"
        ]

        self.departments = [
            "Economics", "Robotics", "Statistics", "Art", "Computer Science",
            "Business Analytics", "Design", "Information Systems",
            "Public Policy", "Mechanical Engineering", "Electrical Engineering",
            "Chemical Engineering", "Architecture", "Drama"
        ]

        self.course_levels = ["Undergraduate", "Graduate", "PhD"]

    def generate_professors(self):
        professors = []
        for i in range(self.num_professors):
            professor = {
                'professor_id': i,
                'name': f'Professor_{i}',
                'department': random.choice(self.departments),
                'school': random.choice(self.schools)
            }
            professors.append(professor)
        return pd.DataFrame(professors)

    def generate_courses(self, professors_df):
      courses = []
      for i in range(self.num_courses):
          department = random.choice(self.departments)
          school = random.choice(self.schools)

          # Create professor vector
          professor_vector = np.zeros(self.num_professors, dtype=int)
          num_course_professors = random.randint(1, 3)
          course_professors = random.sample(range(self.num_professors), num_course_professors)
          professor_vector[course_professors] = 1

          # Create competence vector
          competence_vector = np.zeros(self.num_competences, dtype=int)
          num_competences = random.randint(1, 5)
          course_competences = random.sample(range(self.num_competences), num_competences)
          competence_vector[course_competences] = 1

          course = {
              'course_id': f'CMU-{1000 + i}',
              'title': f'Course_{i}',
              'department': department,
              'school': school,
              'professors_vector': professor_vector,
              'competences_vector': competence_vector,
              'knowledge_area': random.randint(1, self.num_knowledge_areas),
              'credits': random.choice([3, 6, 9, 12]),
              'level': random.choice(self.course_levels),
              'keywords': [f'keyword_{j}' for j in range(random.randint(3, 7))]
          }
          courses.append(course)

      return pd.DataFrame(courses)

    def generate_dataset(self):
        professors_df = self.generate_professors()
        courses_df = self.generate_courses(professors_df)

        return {
            'professors': professors_df,
            'courses': courses_df
        }

In [None]:
generator = CourseDataGenerator()
dataset = generator.generate_dataset()
professors = dataset['professors']
courses = dataset['courses']
display("Professors Sample:")
display(professors.head())
print('\n')
display("Courses Sample:")
display(courses.head())


NameError: name 'CourseDataGenerator' is not defined

## Calculating Similarity

### Professor Similarity

In [None]:
class ProfessorSimilarityCalculator:
    @staticmethod
    def jaccard_similarity(prof_vector1: np.ndarray, prof_vector2: np.ndarray) -> float:
        intersection = np.sum(np.logical_and(prof_vector1, prof_vector2))
        union = np.sum(np.logical_or(prof_vector1, prof_vector2))
        return intersection / union if union > 0 else 0.0

    @staticmethod
    def log_likelihood_similarity(prof_vector1: np.ndarray, prof_vector2: np.ndarray) -> float:
        n11 = np.sum(np.logical_and(prof_vector1, prof_vector2))
        n10 = np.sum(np.logical_and(prof_vector1, np.logical_not(prof_vector2)))
        n01 = np.sum(np.logical_and(np.logical_not(prof_vector1), prof_vector2))
        n00 = np.sum(np.logical_and(np.logical_not(prof_vector1), np.logical_not(prof_vector2)))

        n = n11 + n10 + n01 + n00
        epsilon = 1e-10

        p11 = (n11 + epsilon) / n
        p10 = (n10 + epsilon) / n
        p01 = (n01 + epsilon) / n

        if p11 == 0 or p10 == 0 or p01 == 0:
            return 0.0

        return n11 * np.log(p11) + n10 * np.log(p10) + n01 * np.log(p01)

    @staticmethod
    def normalize_log_likelihood(ll_score: float) -> float:
        min_ll, max_ll = -10, 0
        normalized = (ll_score - min_ll) / (max_ll - min_ll)
        return max(0, min(normalized, 1))

    def compute_similarity(self, course1: dict, course2: dict, weights: dict = None) -> float:
        if weights is None:
            weights = {'jaccard': 0.5, 'log_likelihood': 0.5}

        jaccard_sim = self.jaccard_similarity(course1['professors_vector'], course2['professors_vector'])
        ll_sim = self.log_likelihood_similarity(course1['professors_vector'], course2['professors_vector'])
        normalized_ll_sim = self.normalize_log_likelihood(ll_sim)

        return weights['jaccard'] * jaccard_sim + weights['log_likelihood'] * normalized_ll_sim

In [None]:
calculator = ProfessorSimilarityCalculator()
num_courses = len(courses)
professor_similarity_matrix = np.zeros((num_courses, num_courses))


In [None]:
from tqdm import tqdm
for i in tqdm(range(num_courses)):
    for j in range(i+1, num_courses):
        similarity = calculator.compute_similarity(
            {'professors_vector': courses.iloc[i]['professors_vector']},
            {'professors_vector': courses.iloc[j]['professors_vector']}
        )
        professor_similarity_matrix[i, j] = similarity
        professor_similarity_matrix[j, i] = similarity  # Matrix is symmetric

100%|██████████| 100/100 [00:00<00:00, 121.57it/s]


In [None]:
similarity_df = pd.DataFrame(
    professor_similarity_matrix,
    index=courses['course_id'],
    columns=courses['course_id']
)

In [None]:
print("Professor Similarity Matrix (sample):")
display(similarity_df.iloc[:5, :5])


Professor Similarity Matrix (sample):


course_id,CMU-1000,CMU-1001,CMU-1002,CMU-1003,CMU-1004
course_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CMU-1000,0.0,0.0,0.0,0.0,0.0
CMU-1001,0.0,0.0,0.0,0.0,0.0
CMU-1002,0.0,0.0,0.0,0.0,0.0
CMU-1003,0.0,0.0,0.0,0.0,0.0
CMU-1004,0.0,0.0,0.0,0.0,0.0


In [None]:
most_similar_pairs = []
for i in range(num_courses):
    for j in range(i+1, num_courses):
        most_similar_pairs.append((
            courses.iloc[i]['course_id'],
            courses.iloc[j]['course_id'],
            similarity_matrix[i, j]
        ))

most_similar_pairs.sort(key=lambda x: x[2], reverse=True)

print("\nTop 5 Most Similar Course Pairs:")
for pair in most_similar_pairs[:5]:
    print(f"Courses {pair[0]} and {pair[1]}: Similarity = {pair[2]:.4f}")

NameError: name 'similarity_matrix' is not defined

### Competence Similarity

In [None]:
class CompetenceSimilarityCalculator:
    @staticmethod
    def jaccard_similarity(comp_vector1: np.ndarray, comp_vector2: np.ndarray) -> float:
        intersection = np.sum(np.logical_and(comp_vector1, comp_vector2))
        union = np.sum(np.logical_or(comp_vector1, comp_vector2))
        return intersection / union if union > 0 else 0.0

    @staticmethod
    def log_likelihood_similarity(comp_vector1: np.ndarray, comp_vector2: np.ndarray) -> float:
        n11 = np.sum(np.logical_and(comp_vector1, comp_vector2))
        n10 = np.sum(np.logical_and(comp_vector1, np.logical_not(comp_vector2)))
        n01 = np.sum(np.logical_and(np.logical_not(comp_vector1), comp_vector2))
        n00 = np.sum(np.logical_and(np.logical_not(comp_vector1), np.logical_not(comp_vector2)))

        n = n11 + n10 + n01 + n00
        epsilon = 1e-10

        p11 = (n11 + epsilon) / n
        p10 = (n10 + epsilon) / n
        p01 = (n01 + epsilon) / n

        if p11 == 0 or p10 == 0 or p01 == 0:
            return 0.0

        return n11 * np.log(p11) + n10 * np.log(p10) + n01 * np.log(p01)

    @staticmethod
    def normalize_log_likelihood(ll_score: float) -> float:
        min_ll, max_ll = -10, 0
        normalized = (ll_score - min_ll) / (max_ll - min_ll)
        return max(0, min(normalized, 1))

    def compute_similarity(self, course1: dict, course2: dict, weights: dict = None) -> float:
        if weights is None:
            weights = {'jaccard': 0.5, 'log_likelihood': 0.5}

        jaccard_sim = self.jaccard_similarity(course1['competences_vector'], course2['competences_vector'])
        ll_sim = self.log_likelihood_similarity(course1['competences_vector'], course2['competences_vector'])
        normalized_ll_sim = self.normalize_log_likelihood(ll_sim)

        return weights['jaccard'] * jaccard_sim + weights['log_likelihood'] * normalized_ll_sim

In [None]:
comp_calculator = CompetenceSimilarityCalculator()
num_courses = len(courses)
competence_similarity_matrix = np.zeros((num_courses, num_courses))


In [None]:
for i in tqdm(range(num_courses)):
    for j in range(i+1, num_courses):
        similarity = comp_calculator.compute_similarity(
            {'competences_vector': courses.iloc[i]['competences_vector']},
            {'competences_vector': courses.iloc[j]['competences_vector']}
        )
        competence_similarity_matrix[i, j] = similarity
        competence_similarity_matrix[j, i] = similarity  # Matrix is symmetric

100%|██████████| 100/100 [00:00<00:00, 116.09it/s]


In [None]:
comp_similarity_df = pd.DataFrame(
    competence_similarity_matrix,
    index=courses['course_id'],
    columns=courses['course_id']
)

In [None]:
print("Competence Similarity Matrix (sample):")
display(comp_similarity_df.iloc[:5, :5])


Competence Similarity Matrix (sample):


course_id,CMU-1000,CMU-1001,CMU-1002,CMU-1003,CMU-1004
course_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CMU-1000,0.0,0.071429,0.166667,0.117403,0.197333
CMU-1001,0.071429,0.0,0.125,0.071429,0.101957
CMU-1002,0.166667,0.125,0.0,0.071429,0.11512
CMU-1003,0.117403,0.071429,0.071429,0.0,0.197333
CMU-1004,0.197333,0.101957,0.11512,0.197333,0.0


In [None]:
comp_most_similar_pairs = []
for i in range(num_courses):
    for j in range(i+1, num_courses):
        comp_most_similar_pairs.append((
            courses.iloc[i]['course_id'],
            courses.iloc[j]['course_id'],
            similarity_matrix[i, j]
        ))

comp_most_similar_pairs.sort(key=lambda x: x[2], reverse=True)

print("\nTop 5 Most Similar Course Pairs:")
for pair in comp_most_similar_pairs[:5]:
    print(f"Courses {pair[0]} and {pair[1]}: Similarity = {pair[2]:.4f}")

NameError: name 'similarity_matrix' is not defined

### Knowledge Similarity

In [None]:
class KnowledgeAreaSimilarityCalculator:
    @staticmethod
    def compute_similarity(course1: dict, course2: dict) -> float:
        return 1.0 if course1['knowledge_area'] == course2['knowledge_area'] else 0.0

knowledge_area_similarity_matrix = np.zeros((num_courses, num_courses))

for i in range(num_courses):
    for j in range(i, num_courses):
        similarity = KnowledgeAreaSimilarityCalculator.compute_similarity(
            courses.iloc[i], courses.iloc[j]
        )
        knowledge_area_similarity_matrix[i, j] = similarity
        knowledge_area_similarity_matrix[j, i] = similarity  # Matrix is symmetric

knowledge_area_similarity_df = pd.DataFrame(
    knowledge_area_similarity_matrix,
    index=courses['course_id'],
    columns=courses['course_id']
)

print("Knowledge Area Similarity Matrix (sample):")
display(knowledge_area_similarity_df.iloc[:5, :5])

Knowledge Area Similarity Matrix (sample):


course_id,CMU-1000,CMU-1001,CMU-1002,CMU-1003,CMU-1004
course_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CMU-1000,1.0,0.0,0.0,0.0,1.0
CMU-1001,0.0,1.0,0.0,0.0,0.0
CMU-1002,0.0,0.0,1.0,1.0,0.0
CMU-1003,0.0,0.0,1.0,1.0,0.0
CMU-1004,1.0,0.0,0.0,0.0,1.0


In [None]:
same_knowledge_area_pairs = []
for i in range(num_courses):
    for j in range(i+1, num_courses):
        if knowledge_area_similarity_matrix[i, j] == 1:
            same_knowledge_area_pairs.append((
                courses.iloc[i]['course_id'],
                courses.iloc[j]['course_id'],
                courses.iloc[i]['knowledge_area']
            ))

print("\nSample of Courses with the Same Knowledge Area:")
for pair in same_knowledge_area_pairs[:5]:
    print(f"Courses {pair[0]} and {pair[1]}: Knowledge Area {pair[2]}")



Sample of Courses with the Same Knowledge Area:
Courses CMU-1000 and CMU-1004: Knowledge Area 7
Courses CMU-1000 and CMU-1006: Knowledge Area 7
Courses CMU-1000 and CMU-1010: Knowledge Area 7
Courses CMU-1000 and CMU-1014: Knowledge Area 7
Courses CMU-1000 and CMU-1017: Knowledge Area 7


In [None]:
# Calculate the percentage of course pairs with the same knowledge area
total_pairs = num_courses * (num_courses - 1) / 2
same_knowledge_area_percentage = len(same_knowledge_area_pairs) / total_pairs * 100

print(f"\nPercentage of course pairs with the same knowledge area: {same_knowledge_area_percentage:.2f}%")


Percentage of course pairs with the same knowledge area: 12.16%


### Keyword Similarity

In [None]:
from typing import List

class KeywordSimilarityCalculator:
    @staticmethod
    def jaccard_similarity(keywords1: List[str], keywords2: List[str]) -> float:
        set1 = set(keywords1)
        set2 = set(keywords2)
        intersection = len(set1.intersection(set2))
        union = len(set1.union(set2))
        return intersection / union if union > 0 else 0.0

    @staticmethod
    def compute_similarity(course1: dict, course2: dict) -> float:
        return KeywordSimilarityCalculator.jaccard_similarity(course1['keywords'], course2['keywords'])

# Calculate similarity matrix for keywords
num_courses = len(courses)
keyword_similarity_matrix = np.zeros((num_courses, num_courses))

for i in range(num_courses):
    for j in range(i, num_courses):
        similarity = KeywordSimilarityCalculator.compute_similarity(
            courses.iloc[i], courses.iloc[j]
        )
        keyword_similarity_matrix[i, j] = similarity
        keyword_similarity_matrix[j, i] = similarity  # Matrix is symmetric

# Create a DataFrame for the keyword similarity matrix
keyword_similarity_df = pd.DataFrame(
    keyword_similarity_matrix,
    index=courses['course_id'],
    columns=courses['course_id']
)

print("Keyword Similarity Matrix (sample):")
display(keyword_similarity_df.iloc[:5, :5])

Keyword Similarity Matrix (sample):


course_id,CMU-1000,CMU-1001,CMU-1002,CMU-1003,CMU-1004
course_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CMU-1000,1.0,0.666667,1.0,0.666667,0.857143
CMU-1001,0.666667,1.0,0.666667,1.0,0.571429
CMU-1002,1.0,0.666667,1.0,0.666667,0.857143
CMU-1003,0.666667,1.0,0.666667,1.0,0.571429
CMU-1004,0.857143,0.571429,0.857143,0.571429,1.0


In [None]:
most_similar_pairs = []
for i in range(num_courses):
    for j in range(i+1, num_courses):
        most_similar_pairs.append((
            courses.iloc[i]['course_id'],
            courses.iloc[j]['course_id'],
            keyword_similarity_matrix[i, j]
        ))

most_similar_pairs.sort(key=lambda x: x[2], reverse=True)

print("\nTop 5 Most Similar Course Pairs based on Keywords:")
for pair in most_similar_pairs[:5]:
    print(f"Courses {pair[0]} and {pair[1]}: Similarity = {pair[2]:.4f}")


Top 5 Most Similar Course Pairs based on Keywords:
Courses CMU-1000 and CMU-1002: Similarity = 1.0000
Courses CMU-1000 and CMU-1008: Similarity = 1.0000
Courses CMU-1000 and CMU-1011: Similarity = 1.0000
Courses CMU-1000 and CMU-1015: Similarity = 1.0000
Courses CMU-1000 and CMU-1016: Similarity = 1.0000


In [None]:
# Calculate average keyword similarity
average_similarity = np.mean(keyword_similarity_matrix[np.triu_indices(num_courses, k=1)])
print(f"\nAverage Keyword Similarity: {average_similarity:.4f}")


Average Keyword Similarity: 0.7379


### Overall Similarity

In [None]:
class WeightedSimilarityCalculator:
    def __init__(self, alpha, beta, gamma, delta):
        self.alpha = alpha  # weight for professor similarity
        self.beta = beta    # weight for competence similarity
        self.gamma = gamma  # weight for knowledge area similarity
        self.delta = delta  # weight for keyword similarity

        assert abs(self.alpha + self.beta + self.gamma + self.delta - 1.0) < 1e-6, "Weights must sum to 1"

    def compute_weighted_similarity(self, prof_sim, comp_sim, know_sim, key_sim):
        return (self.alpha * prof_sim +
                self.beta * comp_sim +
                self.gamma * know_sim +
                self.delta * key_sim)

In [None]:
alpha = test_student['CBF Professor Rating Weight']  # weight for professor similarity
beta = test_student['CBF Course Competence Weight']    # weight for competence similarity
gamma = test_student['CBF Knowledge Area Weight']  # weight for knowledge area similarity
delta = 1 - (alpha + beta + gamma)  # weight for keyword similarity
calculator = WeightedSimilarityCalculator()

In [None]:
# Calculate weighted similarity matrix
num_courses = len(courses)
weighted_similarity_matrix = np.zeros((num_courses, num_courses))

for i in range(num_courses):
    for j in range(i, num_courses):
        weighted_sim = calculator.compute_weighted_similarity(
            professor_similarity_matrix[i, j],
            competence_similarity_matrix[i, j],
            knowledge_area_similarity_matrix[i, j],
            keyword_similarity_matrix[i, j]
        )
        weighted_similarity_matrix[i, j] = weighted_sim
        weighted_similarity_matrix[j, i] = weighted_sim  # Matrix is symmetric

# Create a DataFrame for the weighted similarity matrix
weighted_similarity_df = pd.DataFrame(
    weighted_similarity_matrix,
    index=courses['course_id'],
    columns=courses['course_id']
)

print("Weighted Similarity Matrix (sample):")
display(weighted_similarity_df.iloc[:5, :5])


Weighted Similarity Matrix (sample):


course_id,CMU-1000,CMU-1001,CMU-1002,CMU-1003,CMU-1004
course_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CMU-1000,0.4,0.204152,0.254339,0.167197,0.401759
CMU-1001,0.204152,0.4,0.204152,0.308974,0.161095
CMU-1002,0.254339,0.204152,0.4,0.392533,0.214498
CMU-1003,0.167197,0.308974,0.392533,0.4,0.135986
CMU-1004,0.401759,0.161095,0.214498,0.135986,0.4


In [None]:
most_similar_pairs = []
for i in range(num_courses):
    for j in range(i+1, num_courses):
        most_similar_pairs.append((
            courses.iloc[i]['course_id'],
            courses.iloc[j]['course_id'],
            weighted_similarity_matrix[i, j]
        ))

most_similar_pairs.sort(key=lambda x: x[2], reverse=True)

print("\nTop 5 Most Similar Course Pairs based on Weighted Similarity:")
for pair in most_similar_pairs[:5]:
    print(f"Courses {pair[0]} and {pair[1]}: Similarity = {pair[2]:.4f}")


Top 5 Most Similar Course Pairs based on Weighted Similarity:
Courses CMU-1039 and CMU-1051: Similarity = 0.6717
Courses CMU-1001 and CMU-1072: Similarity = 0.6627
Courses CMU-1007 and CMU-1073: Similarity = 0.6627
Courses CMU-1001 and CMU-1007: Similarity = 0.6127
Courses CMU-1001 and CMU-1073: Similarity = 0.6127


In [None]:
average_similarity = np.mean(weighted_similarity_matrix[np.triu_indices(num_courses, k=1)])
print(f"\nAverage Weighted Similarity: {average_similarity:.4f}")


Average Weighted Similarity: 0.2283


## CBF Recommendations

In [None]:
def get_top_n_recommendations(course_id: str, n: int = 5):
    if course_id not in courses['course_id'].values:
        raise ValueError(f"Course ID {course_id} not found in the dataset.")

    course_index = courses.index[courses['course_id'] == course_id].item()
    similarities = weighted_similarity_matrix[course_index]

    # Sort similarities, excluding the course itself
    similar_indices = similarities.argsort()[::-1][1:n+1]

    recommendations = []
    for idx in similar_indices:
        rec_course_id = courses.iloc[idx]['course_id']
        similarity = similarities[idx]
        recommendations.append((rec_course_id, similarity))

    return recommendations

In [None]:
# Example usage
# target_course_id = "CMU-1021"  # Replace with any course ID from your dataset
# top_n = 5

# try:
#     recommendations = get_top_n_recommendations(target_course_id, top_n)

#     print(f"\nTop {top_n} recommendations for course {target_course_id}:")
#     for i, (rec_course_id, similarity) in enumerate(recommendations, 1):
#         course_info = courses[courses['course_id'] == rec_course_id].iloc[0]
#         print(f"{i}. Course ID: {rec_course_id}")
#         print(f"   Title: {course_info['title']}")
#         print(f"   Department: {course_info['department']}")
#         print(f"   School: {course_info['school']}")
#         print(f"   Similarity: {similarity:.4f}")
#         print()

# except ValueError as e:
#     print(f"Error: {e}")

all_target_course_id = test_student_courses
top_n = 3 # for each course the test student has taken, how many content-based recommendations will be generated

# NEW FUNCTION ADDED (Jocelyn, not tested)
def get_CBF_Recommendation(top_n, all_target_course_id):
    all_recommendations = []
    all_similarities = []
    for target_course_id in all_target_course_id:

        try:
            recommendations = get_top_n_recommendations(target_course_id, top_n)

            print(f"\nTop {top_n} recommendations for course {target_course_id}:")
            for i, (rec_course_id, similarity) in enumerate(recommendations, 1):
                course_info = courses[courses['course_id'] == rec_course_id].iloc[0]
                #print(f"{i}. Course ID: {rec_course_id}")
                print(f"   Title: {course_info['title']}")
                all_recommendations.append(course_info['title'])
                #print(f"   Department: {course_info['department']}")
                #print(f"   School: {course_info['school']}")
                print(f"   Similarity: {similarity:.4f}")
                all_similarities.append(similarity:.4f)
                #print()

        except ValueError as e:
            print(f"Error: {e}")

    return all_recommendations, all_similarities

recommended_courses_from_CBF, _ = get_CBF_Recommendation(top_n, all_target_course_id)


Top 5 recommendations for course CMU-1021:
1. Course ID: CMU-1092
   Title: Course_92
   Department: Public Policy
   School: School of Computer Science
   Similarity: 0.4468

2. Course ID: CMU-1012
   Title: Course_12
   Department: Mechanical Engineering
   School: College of Engineering
   Similarity: 0.4250

3. Course ID: CMU-1036
   Title: Course_36
   Department: Business Analytics
   School: Tepper School of Business
   Similarity: 0.4250

4. Course ID: CMU-1074
   Title: Course_74
   Department: Art
   School: Heinz College of Information Systems and Public Policy
   Similarity: 0.4188

5. Course ID: CMU-1067
   Title: Course_67
   Department: Business Analytics
   School: Dietrich College of Humanities and Social Sciences
   Similarity: 0.4188



# Combining CF and CBF

In [None]:
#### Dummy
recommended_courses_from_CBF = ['Course_69', 'Course_82', 'Course_15', 'Course_74', 'Course_10', 'Course_32', 'Course_52', 'Course_48']
def combined_recommendation(relevance_score, recommended_courses_from_CBF, recommended_courses_from_CF, test_student, NUM_COURSES_RECOMMENDING = 10):
  # Collaborative Filtering add to output
  for course in recommended_courses_from_CF:
    if course in relevance_score:
      relevance_score[course] += test_student['CF Weight']
    else:
      relevance_score[course] = test_student['CF Weight']
  # Content based filtering add to output
  for course in recommended_courses_from_CBF:
    if course in relevance_score:
      relevance_score[course] += test_student['CBF Weight']
    else:
      relevance_score[course] = test_student['CBF Weight']
  sorted_relevance_score = sorted(relevance_score.items(), key=lambda item: item[1], reverse = True)[:NUM_COURSES_RECOMMENDING]
  for i in sorted_relevance_score:
    print(f"{i[0]}, Relevance Score: {i[1]}")

relevance_score = {}
combined_recommendation(relevance_score, recommended_courses_from_CF, recommended_courses_from_CBF, test_student)
print(test_student['CBF Weight'], test_student['CF Weight'])

Course_15, Relevance Score: 1.0
Course_74, Relevance Score: 1.0
Course_32, Relevance Score: 1.0
Course_69, Relevance Score: 0.5232932190453434
Course_82, Relevance Score: 0.5232932190453434
Course_10, Relevance Score: 0.5232932190453434
Course_52, Relevance Score: 0.5232932190453434
Course_48, Relevance Score: 0.5232932190453434
Course_68, Relevance Score: 0.4767067809546566
Course_83, Relevance Score: 0.4767067809546566


# Trend Analysis



*   Course popularity trends:
  *  Percentage of students in your major/minor/concentration who took each recommended course
  *  Percentage of students with the same career interest who took each recommended course
*   Course enrollment trends:
  *  Courses that have seen the highest volume or higher growth in enrollment over time
*  Course sequence trends
  *  Most students with your background take [Course X] before [Recommended Course Y] (skill-gap filling)
  *  Students who took the same courses as you typically went on to take [Course X] (future pathway planning)
  *  Recommended electives that covers beyond student’s existing skillset (for students seeking diversification of skillsets)
  *  Most commonly chosen electives taken alongside core courses
*  Course performnace trends
  *   Courses with high dropout rates vs. high completion rates
  *   Average grades or pass rates for students in your major/career track
  *   A course's grade distribution across students with different academic backgrounds

In [None]:
class TrendAnalysis:
    def __init__(self, students_df, test_student, recommended_courses_from_CF):
        self.data = students_df
        self.user = test_student
        self.recommendations = recommended_courses_from_CF

    def conditioned_analysis(self, feature, same_feature_students, electives_only):
      statistics_recommendation_based = {}
      for course in self.recommendations:
        count = 0
        for score in same_feature_students[course]:
          count += 1 if score > 0 else 0
        statistics_recommendation_based[course] = count
      statistics_recommendation_based = sorted(statistics_recommendation_based.items(), key=lambda item: item[1], reverse=True)[:10]
      print(f'Based on your {feature} in {self.user[feature]}, we recommend these courses: {recommended_courses_from_CF}')
      print(f'Why consider this recommendation? \n In {major} {feature.lower()},')
      for course_name, number_of_people_taken in statistics_recommendation_based:
        print(f" {number_of_people_taken/len(same_feature_students)* 100:.2f}% have taken course {course_name}")

      statistics_profile_based = {}

      isCore = {}
      for column_name in self.data.columns:
          if "Course_" in column_name:
            isCore[column_name] = random.choice([True, False])

      for i in range(len(same_feature_students)):
        student_data = same_feature_students.iloc[i]
        for column_name in self.data.columns:
          if "Course_" in column_name and not student_data[column_name] > 0:
            if electives_only:
              if not isCore[column_name] and column_name not in statistics_recommendation_based:
                statistics_profile_based[column_name] = 1 if (column_name not in statistics_profile_based) else statistics_profile_based[column_name] + 1
            else: # not necessarily has to be an elective
              statistics_profile_based[column_name] = 1 if (column_name not in statistics_profile_based) else statistics_profile_based[column_name] + 1

      statistics_profile_based = sorted(statistics_profile_based.items(), key=lambda item: item[1], reverse=True)[:10]

      print(f"These courses are also popular in {self.user[feature]} {feature}:")
      if not isCore: print('(for electives only)')
      for course_name, number_of_people_taken in statistics_profile_based:
        print(f" {number_of_people_taken/len(same_feature_students)* 100:.2f}% have taken course {course_name}")
      return '\n'

    def check_course_popularity(self, feature, electives_only):
      if feature not in self.data.columns:
        raise ValueError(f"Feature '{feature}' not found in the dataset.")
      user_feature = self.user[feature]
      mask = self.data[feature] == user_feature
      same_feature_students = self.data[mask].copy()
      self.conditioned_analysis(feature, same_feature_students, electives_only)
      return '\n'

TA = TrendAnalysis(students_df, test_student, recommended_courses_from_CF)
print(TA.check_course_popularity('Major', True))
print(TA.check_course_popularity('Career Directions', True))

Based on your Major in Drama, we recommend these courses: ['Course_85', 'Course_1', 'Course_52', 'Course_89', 'Course_60', 'Course_37', 'Course_72', 'Course_84', 'Course_61', 'Course_46', 'Course_80', 'Course_39', 'Course_97', 'Course_75', 'Course_9', 'Course_94', 'Course_7', 'Course_93', 'Course_10']
Why consider this recommendation? 
 In Statistics major,
 64.71% have taken course Course_7
 58.82% have taken course Course_46
 52.94% have taken course Course_75
 47.06% have taken course Course_60
 47.06% have taken course Course_39
 47.06% have taken course Course_97
 47.06% have taken course Course_93
 41.18% have taken course Course_1
 41.18% have taken course Course_89
 41.18% have taken course Course_37
These courses are also popular in Drama Major:
 94.12% have taken course Course_23
 88.24% have taken course Course_42
 88.24% have taken course Course_55
 82.35% have taken course Course_5
 82.35% have taken course Course_28
 82.35% have taken course Course_68
 82.35% have taken c