# CMUGPT Content-Based Filtering

Reference Paper - https://arxiv.org/abs/2402.08371

In [46]:
import numpy as np
import pandas as pd
import random
import math
from tqdm import tqdm
random.seed(43)

## Data Setup
Course information:

*   Professors involved in the course
*   Course competencies/skills
*   Course knowledge area
*   Theoretical and practical contents of the course



In [47]:
class CourseDataGenerator:
    def __init__(self, num_courses=100, num_professors=100, num_competences=12, num_knowledge_areas=8):
        self.num_courses = num_courses
        self.num_professors = num_professors
        self.num_competences = num_competences
        self.num_knowledge_areas = num_knowledge_areas

        self.schools = [
            "College of Fine Arts",
            "Tepper School of Business",
            "School of Computer Science",
            "College of Engineering",
            "Heinz College of Information Systems and Public Policy",
            "Dietrich College of Humanities and Social Sciences"
        ]

        self.departments = [
            "Economics", "Robotics", "Statistics", "Art", "Computer Science",
            "Business Analytics", "Design", "Information Systems",
            "Public Policy", "Mechanical Engineering", "Electrical Engineering",
            "Chemical Engineering", "Architecture", "Drama"
        ]

        self.course_levels = ["Undergraduate", "Graduate", "PhD"]

    def generate_professors(self):
        professors = []
        for i in range(self.num_professors):
            professor = {
                'professor_id': i,
                'name': f'Professor_{i}',
                'department': random.choice(self.departments),
                'school': random.choice(self.schools)
            }
            professors.append(professor)
        return pd.DataFrame(professors)

    def generate_courses(self, professors_df):
      courses = []
      for i in range(self.num_courses):
          department = random.choice(self.departments)
          school = random.choice(self.schools)

          # Create professor vector
          professor_vector = np.zeros(self.num_professors, dtype=int)
          num_course_professors = random.randint(1, 3)
          course_professors = random.sample(range(self.num_professors), num_course_professors)
          professor_vector[course_professors] = 1

          # Create competence vector
          competence_vector = np.zeros(self.num_competences, dtype=int)
          num_competences = random.randint(1, 5)
          course_competences = random.sample(range(self.num_competences), num_competences)
          competence_vector[course_competences] = 1

          course = {
              'course_id': f'CMU-{1000 + i}',
              'title': f'Course_{i}',
              'department': department,
              'school': school,
              'professors_vector': professor_vector,
              'competences_vector': competence_vector,
              'knowledge_area': random.randint(1, self.num_knowledge_areas),
              'credits': random.choice([3, 6, 9, 12]),
              'level': random.choice(self.course_levels),
              'keywords': [f'keyword_{j}' for j in range(random.randint(3, 7))]
          }
          courses.append(course)

      return pd.DataFrame(courses)

    def generate_dataset(self):
        professors_df = self.generate_professors()
        courses_df = self.generate_courses(professors_df)

        return {
            'professors': professors_df,
            'courses': courses_df
        }

In [48]:
generator = CourseDataGenerator()
dataset = generator.generate_dataset()
professors = dataset['professors']
courses = dataset['courses']
display("Professors Sample:")
display(professors.head())
print('\n')
display("Courses Sample:")
display(courses.head())


'Professors Sample:'

Unnamed: 0,professor_id,name,department,school
0,0,Professor_0,Economics,School of Computer Science
1,1,Professor_1,Chemical Engineering,Tepper School of Business
2,2,Professor_2,Information Systems,School of Computer Science
3,3,Professor_3,Electrical Engineering,Dietrich College of Humanities and Social Scie...
4,4,Professor_4,Architecture,College of Fine Arts






'Courses Sample:'

Unnamed: 0,course_id,title,department,school,professors_vector,competences_vector,knowledge_area,credits,level,keywords
0,CMU-1000,Course_0,Statistics,Tepper School of Business,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",7,6,Graduate,"[keyword_0, keyword_1, keyword_2, keyword_3, k..."
1,CMU-1001,Course_1,Robotics,College of Fine Arts,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",6,9,PhD,"[keyword_0, keyword_1, keyword_2, keyword_3]"
2,CMU-1002,Course_2,Public Policy,Heinz College of Information Systems and Publi...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1]",1,3,Graduate,"[keyword_0, keyword_1, keyword_2, keyword_3, k..."
3,CMU-1003,Course_3,Chemical Engineering,School of Computer Science,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0]",1,6,Graduate,"[keyword_0, keyword_1, keyword_2, keyword_3]"
4,CMU-1004,Course_4,Computer Science,Heinz College of Information Systems and Publi...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1]",7,12,PhD,"[keyword_0, keyword_1, keyword_2, keyword_3, k..."


## Calculating Similarity

### Professor Similarity

In [49]:
class ProfessorSimilarityCalculator:
    @staticmethod
    def jaccard_similarity(prof_vector1: np.ndarray, prof_vector2: np.ndarray) -> float:
        intersection = np.sum(np.logical_and(prof_vector1, prof_vector2))
        union = np.sum(np.logical_or(prof_vector1, prof_vector2))
        return intersection / union if union > 0 else 0.0

    @staticmethod
    def log_likelihood_similarity(prof_vector1: np.ndarray, prof_vector2: np.ndarray) -> float:
        n11 = np.sum(np.logical_and(prof_vector1, prof_vector2))
        n10 = np.sum(np.logical_and(prof_vector1, np.logical_not(prof_vector2)))
        n01 = np.sum(np.logical_and(np.logical_not(prof_vector1), prof_vector2))
        n00 = np.sum(np.logical_and(np.logical_not(prof_vector1), np.logical_not(prof_vector2)))

        n = n11 + n10 + n01 + n00
        epsilon = 1e-10

        p11 = (n11 + epsilon) / n
        p10 = (n10 + epsilon) / n
        p01 = (n01 + epsilon) / n

        if p11 == 0 or p10 == 0 or p01 == 0:
            return 0.0

        return n11 * np.log(p11) + n10 * np.log(p10) + n01 * np.log(p01)

    @staticmethod
    def normalize_log_likelihood(ll_score: float) -> float:
        min_ll, max_ll = -10, 0
        normalized = (ll_score - min_ll) / (max_ll - min_ll)
        return max(0, min(normalized, 1))

    def compute_similarity(self, course1: dict, course2: dict, weights: dict = None) -> float:
        if weights is None:
            weights = {'jaccard': 0.5, 'log_likelihood': 0.5}

        jaccard_sim = self.jaccard_similarity(course1['professors_vector'], course2['professors_vector'])
        ll_sim = self.log_likelihood_similarity(course1['professors_vector'], course2['professors_vector'])
        normalized_ll_sim = self.normalize_log_likelihood(ll_sim)

        return weights['jaccard'] * jaccard_sim + weights['log_likelihood'] * normalized_ll_sim

In [50]:
calculator = ProfessorSimilarityCalculator()
num_courses = len(courses)
professor_similarity_matrix = np.zeros((num_courses, num_courses))


In [51]:
for i in tqdm(range(num_courses)):
    for j in range(i+1, num_courses):
        similarity = calculator.compute_similarity(
            {'professors_vector': courses.iloc[i]['professors_vector']},
            {'professors_vector': courses.iloc[j]['professors_vector']}
        )
        professor_similarity_matrix[i, j] = similarity
        professor_similarity_matrix[j, i] = similarity  # Matrix is symmetric

100%|██████████| 100/100 [00:01<00:00, 89.14it/s]


In [52]:
similarity_df = pd.DataFrame(
    professor_similarity_matrix,
    index=courses['course_id'],
    columns=courses['course_id']
)

In [53]:
print("Professor Similarity Matrix (sample):")
display(similarity_df.iloc[:5, :5])


Professor Similarity Matrix (sample):


course_id,CMU-1000,CMU-1001,CMU-1002,CMU-1003,CMU-1004
course_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CMU-1000,0.0,0.039483,0.039483,0.0,0.0
CMU-1001,0.039483,0.0,0.039483,0.0,0.0
CMU-1002,0.039483,0.039483,0.0,0.0,0.0
CMU-1003,0.0,0.0,0.0,0.0,0.0
CMU-1004,0.0,0.0,0.0,0.0,0.0


In [54]:
most_similar_pairs = []
for i in range(num_courses):
    for j in range(i+1, num_courses):
        most_similar_pairs.append((
            courses.iloc[i]['course_id'],
            courses.iloc[j]['course_id'],
            similarity_matrix[i, j]
        ))

most_similar_pairs.sort(key=lambda x: x[2], reverse=True)

print("\nTop 5 Most Similar Course Pairs:")
for pair in most_similar_pairs[:5]:
    print(f"Courses {pair[0]} and {pair[1]}: Similarity = {pair[2]:.4f}")


Top 5 Most Similar Course Pairs:
Courses CMU-1001 and CMU-1007: Similarity = 0.8758
Courses CMU-1001 and CMU-1023: Similarity = 0.8758
Courses CMU-1001 and CMU-1072: Similarity = 0.8758
Courses CMU-1001 and CMU-1073: Similarity = 0.8758
Courses CMU-1007 and CMU-1023: Similarity = 0.8758


### Competence Similarity

In [55]:
class CompetenceSimilarityCalculator:
    @staticmethod
    def jaccard_similarity(comp_vector1: np.ndarray, comp_vector2: np.ndarray) -> float:
        intersection = np.sum(np.logical_and(comp_vector1, comp_vector2))
        union = np.sum(np.logical_or(comp_vector1, comp_vector2))
        return intersection / union if union > 0 else 0.0

    @staticmethod
    def log_likelihood_similarity(comp_vector1: np.ndarray, comp_vector2: np.ndarray) -> float:
        n11 = np.sum(np.logical_and(comp_vector1, comp_vector2))
        n10 = np.sum(np.logical_and(comp_vector1, np.logical_not(comp_vector2)))
        n01 = np.sum(np.logical_and(np.logical_not(comp_vector1), comp_vector2))
        n00 = np.sum(np.logical_and(np.logical_not(comp_vector1), np.logical_not(comp_vector2)))

        n = n11 + n10 + n01 + n00
        epsilon = 1e-10

        p11 = (n11 + epsilon) / n
        p10 = (n10 + epsilon) / n
        p01 = (n01 + epsilon) / n

        if p11 == 0 or p10 == 0 or p01 == 0:
            return 0.0

        return n11 * np.log(p11) + n10 * np.log(p10) + n01 * np.log(p01)

    @staticmethod
    def normalize_log_likelihood(ll_score: float) -> float:
        min_ll, max_ll = -10, 0
        normalized = (ll_score - min_ll) / (max_ll - min_ll)
        return max(0, min(normalized, 1))

    def compute_similarity(self, course1: dict, course2: dict, weights: dict = None) -> float:
        if weights is None:
            weights = {'jaccard': 0.5, 'log_likelihood': 0.5}

        jaccard_sim = self.jaccard_similarity(course1['competences_vector'], course2['competences_vector'])
        ll_sim = self.log_likelihood_similarity(course1['competences_vector'], course2['competences_vector'])
        normalized_ll_sim = self.normalize_log_likelihood(ll_sim)

        return weights['jaccard'] * jaccard_sim + weights['log_likelihood'] * normalized_ll_sim

In [70]:
comp_calculator = CompetenceSimilarityCalculator()
num_courses = len(courses)
competence_similarity_matrix = np.zeros((num_courses, num_courses))


In [71]:
for i in tqdm(range(num_courses)):
    for j in range(i+1, num_courses):
        similarity = comp_calculator.compute_similarity(
            {'competences_vector': courses.iloc[i]['competences_vector']},
            {'competences_vector': courses.iloc[j]['competences_vector']}
        )
        competence_similarity_matrix[i, j] = similarity
        competence_similarity_matrix[j, i] = similarity  # Matrix is symmetric

100%|██████████| 100/100 [00:01<00:00, 57.55it/s]


In [72]:
comp_similarity_df = pd.DataFrame(
    competence_similarity_matrix,
    index=courses['course_id'],
    columns=courses['course_id']
)

In [59]:
print("Competence Similarity Matrix (sample):")
display(comp_similarity_df.iloc[:5, :5])


Competence Similarity Matrix (sample):


course_id,CMU-1000,CMU-1001,CMU-1002,CMU-1003,CMU-1004
course_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CMU-1000,0.0,0.196579,0.141648,0.11288,0.101102
CMU-1001,0.196579,0.0,0.196579,0.363245,0.156032
CMU-1002,0.141648,0.196579,0.0,0.197333,0.143565
CMU-1003,0.11288,0.363245,0.197333,0.0,0.072333
CMU-1004,0.101102,0.156032,0.143565,0.072333,0.0


In [60]:
comp_most_similar_pairs = []
for i in range(num_courses):
    for j in range(i+1, num_courses):
        comp_most_similar_pairs.append((
            courses.iloc[i]['course_id'],
            courses.iloc[j]['course_id'],
            similarity_matrix[i, j]
        ))

comp_most_similar_pairs.sort(key=lambda x: x[2], reverse=True)

print("\nTop 5 Most Similar Course Pairs:")
for pair in comp_most_similar_pairs[:5]:
    print(f"Courses {pair[0]} and {pair[1]}: Similarity = {pair[2]:.4f}")


Top 5 Most Similar Course Pairs:
Courses CMU-1001 and CMU-1007: Similarity = 0.8758
Courses CMU-1001 and CMU-1023: Similarity = 0.8758
Courses CMU-1001 and CMU-1072: Similarity = 0.8758
Courses CMU-1001 and CMU-1073: Similarity = 0.8758
Courses CMU-1007 and CMU-1023: Similarity = 0.8758


### Knowledge Similarity

In [61]:
class KnowledgeAreaSimilarityCalculator:
    @staticmethod
    def compute_similarity(course1: dict, course2: dict) -> float:
        return 1.0 if course1['knowledge_area'] == course2['knowledge_area'] else 0.0

knowledge_area_similarity_matrix = np.zeros((num_courses, num_courses))

for i in range(num_courses):
    for j in range(i, num_courses):
        similarity = KnowledgeAreaSimilarityCalculator.compute_similarity(
            courses.iloc[i], courses.iloc[j]
        )
        knowledge_area_similarity_matrix[i, j] = similarity
        knowledge_area_similarity_matrix[j, i] = similarity  # Matrix is symmetric

knowledge_area_similarity_df = pd.DataFrame(
    knowledge_area_similarity_matrix,
    index=courses['course_id'],
    columns=courses['course_id']
)

print("Knowledge Area Similarity Matrix (sample):")
display(knowledge_area_similarity_df.iloc[:5, :5])

Knowledge Area Similarity Matrix (sample):


course_id,CMU-1000,CMU-1001,CMU-1002,CMU-1003,CMU-1004
course_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CMU-1000,1.0,0.0,0.0,0.0,1.0
CMU-1001,0.0,1.0,0.0,0.0,0.0
CMU-1002,0.0,0.0,1.0,1.0,0.0
CMU-1003,0.0,0.0,1.0,1.0,0.0
CMU-1004,1.0,0.0,0.0,0.0,1.0


In [62]:
same_knowledge_area_pairs = []
for i in range(num_courses):
    for j in range(i+1, num_courses):
        if knowledge_area_similarity_matrix[i, j] == 1:
            same_knowledge_area_pairs.append((
                courses.iloc[i]['course_id'],
                courses.iloc[j]['course_id'],
                courses.iloc[i]['knowledge_area']
            ))

print("\nSample of Courses with the Same Knowledge Area:")
for pair in same_knowledge_area_pairs[:5]:
    print(f"Courses {pair[0]} and {pair[1]}: Knowledge Area {pair[2]}")



Sample of Courses with the Same Knowledge Area:
Courses CMU-1000 and CMU-1004: Knowledge Area 7
Courses CMU-1000 and CMU-1006: Knowledge Area 7
Courses CMU-1000 and CMU-1010: Knowledge Area 7
Courses CMU-1000 and CMU-1014: Knowledge Area 7
Courses CMU-1000 and CMU-1017: Knowledge Area 7


In [63]:
# Calculate the percentage of course pairs with the same knowledge area
total_pairs = num_courses * (num_courses - 1) / 2
same_knowledge_area_percentage = len(same_knowledge_area_pairs) / total_pairs * 100

print(f"\nPercentage of course pairs with the same knowledge area: {same_knowledge_area_percentage:.2f}%")


Percentage of course pairs with the same knowledge area: 12.16%


### Keyword Similarity

In [64]:
from typing import List

class KeywordSimilarityCalculator:
    @staticmethod
    def jaccard_similarity(keywords1: List[str], keywords2: List[str]) -> float:
        set1 = set(keywords1)
        set2 = set(keywords2)
        intersection = len(set1.intersection(set2))
        union = len(set1.union(set2))
        return intersection / union if union > 0 else 0.0

    @staticmethod
    def compute_similarity(course1: dict, course2: dict) -> float:
        return KeywordSimilarityCalculator.jaccard_similarity(course1['keywords'], course2['keywords'])

# Calculate similarity matrix for keywords
num_courses = len(courses)
keyword_similarity_matrix = np.zeros((num_courses, num_courses))

for i in range(num_courses):
    for j in range(i, num_courses):
        similarity = KeywordSimilarityCalculator.compute_similarity(
            courses.iloc[i], courses.iloc[j]
        )
        keyword_similarity_matrix[i, j] = similarity
        keyword_similarity_matrix[j, i] = similarity  # Matrix is symmetric

# Create a DataFrame for the keyword similarity matrix
keyword_similarity_df = pd.DataFrame(
    keyword_similarity_matrix,
    index=courses['course_id'],
    columns=courses['course_id']
)

print("Keyword Similarity Matrix (sample):")
display(keyword_similarity_df.iloc[:5, :5])

Keyword Similarity Matrix (sample):


course_id,CMU-1000,CMU-1001,CMU-1002,CMU-1003,CMU-1004
course_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CMU-1000,1.0,0.666667,1.0,0.666667,0.857143
CMU-1001,0.666667,1.0,0.666667,1.0,0.571429
CMU-1002,1.0,0.666667,1.0,0.666667,0.857143
CMU-1003,0.666667,1.0,0.666667,1.0,0.571429
CMU-1004,0.857143,0.571429,0.857143,0.571429,1.0


In [65]:
most_similar_pairs = []
for i in range(num_courses):
    for j in range(i+1, num_courses):
        most_similar_pairs.append((
            courses.iloc[i]['course_id'],
            courses.iloc[j]['course_id'],
            keyword_similarity_matrix[i, j]
        ))

most_similar_pairs.sort(key=lambda x: x[2], reverse=True)

print("\nTop 5 Most Similar Course Pairs based on Keywords:")
for pair in most_similar_pairs[:5]:
    print(f"Courses {pair[0]} and {pair[1]}: Similarity = {pair[2]:.4f}")


Top 5 Most Similar Course Pairs based on Keywords:
Courses CMU-1000 and CMU-1002: Similarity = 1.0000
Courses CMU-1000 and CMU-1008: Similarity = 1.0000
Courses CMU-1000 and CMU-1011: Similarity = 1.0000
Courses CMU-1000 and CMU-1015: Similarity = 1.0000
Courses CMU-1000 and CMU-1016: Similarity = 1.0000


In [66]:
# Calculate average keyword similarity
average_similarity = np.mean(keyword_similarity_matrix[np.triu_indices(num_courses, k=1)])
print(f"\nAverage Keyword Similarity: {average_similarity:.4f}")


Average Keyword Similarity: 0.7379


### Overall Similarity

In [67]:
class WeightedSimilarityCalculator:
    def __init__(self, alpha, beta, gamma, delta):
        self.alpha = alpha  # weight for professor similarity
        self.beta = beta    # weight for competence similarity
        self.gamma = gamma  # weight for knowledge area similarity
        self.delta = delta  # weight for keyword similarity

        assert abs(self.alpha + self.beta + self.gamma + self.delta - 1.0) < 1e-6, "Weights must sum to 1"

    def compute_weighted_similarity(self, prof_sim, comp_sim, know_sim, key_sim):
        return (self.alpha * prof_sim +
                self.beta * comp_sim +
                self.gamma * know_sim +
                self.delta * key_sim)

In [68]:
calculator = WeightedSimilarityCalculator(0.3, 0.3, 0.2, 0.2)

In [73]:
# Calculate weighted similarity matrix
num_courses = len(courses)
weighted_similarity_matrix = np.zeros((num_courses, num_courses))

for i in range(num_courses):
    for j in range(i, num_courses):
        weighted_sim = calculator.compute_weighted_similarity(
            professor_similarity_matrix[i, j],
            competence_similarity_matrix[i, j],
            knowledge_area_similarity_matrix[i, j],
            keyword_similarity_matrix[i, j]
        )
        weighted_similarity_matrix[i, j] = weighted_sim
        weighted_similarity_matrix[j, i] = weighted_sim  # Matrix is symmetric

# Create a DataFrame for the weighted similarity matrix
weighted_similarity_df = pd.DataFrame(
    weighted_similarity_matrix,
    index=courses['course_id'],
    columns=courses['course_id']
)

print("Weighted Similarity Matrix (sample):")
display(weighted_similarity_df.iloc[:5, :5])


Weighted Similarity Matrix (sample):


course_id,CMU-1000,CMU-1001,CMU-1002,CMU-1003,CMU-1004
course_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CMU-1000,0.4,0.204152,0.254339,0.167197,0.401759
CMU-1001,0.204152,0.4,0.204152,0.308974,0.161095
CMU-1002,0.254339,0.204152,0.4,0.392533,0.214498
CMU-1003,0.167197,0.308974,0.392533,0.4,0.135986
CMU-1004,0.401759,0.161095,0.214498,0.135986,0.4


In [74]:
most_similar_pairs = []
for i in range(num_courses):
    for j in range(i+1, num_courses):
        most_similar_pairs.append((
            courses.iloc[i]['course_id'],
            courses.iloc[j]['course_id'],
            weighted_similarity_matrix[i, j]
        ))

most_similar_pairs.sort(key=lambda x: x[2], reverse=True)

print("\nTop 5 Most Similar Course Pairs based on Weighted Similarity:")
for pair in most_similar_pairs[:5]:
    print(f"Courses {pair[0]} and {pair[1]}: Similarity = {pair[2]:.4f}")


Top 5 Most Similar Course Pairs based on Weighted Similarity:
Courses CMU-1039 and CMU-1051: Similarity = 0.6717
Courses CMU-1001 and CMU-1072: Similarity = 0.6627
Courses CMU-1007 and CMU-1073: Similarity = 0.6627
Courses CMU-1001 and CMU-1007: Similarity = 0.6127
Courses CMU-1001 and CMU-1073: Similarity = 0.6127


In [75]:
average_similarity = np.mean(weighted_similarity_matrix[np.triu_indices(num_courses, k=1)])
print(f"\nAverage Weighted Similarity: {average_similarity:.4f}")


Average Weighted Similarity: 0.2283


## Recommendations

In [77]:
def get_top_n_recommendations(course_id: str, n: int = 5):
    if course_id not in courses['course_id'].values:
        raise ValueError(f"Course ID {course_id} not found in the dataset.")

    course_index = courses.index[courses['course_id'] == course_id].item()
    similarities = weighted_similarity_matrix[course_index]

    # Sort similarities, excluding the course itself
    similar_indices = similarities.argsort()[::-1][1:n+1]

    recommendations = []
    for idx in similar_indices:
        rec_course_id = courses.iloc[idx]['course_id']
        similarity = similarities[idx]
        recommendations.append((rec_course_id, similarity))

    return recommendations

In [83]:
# Example usage
target_course_id = "CMU-1021"  # Replace with any course ID from your dataset
top_n = 5

try:
    recommendations = get_top_n_recommendations(target_course_id, top_n)

    print(f"\nTop {top_n} recommendations for course {target_course_id}:")
    for i, (rec_course_id, similarity) in enumerate(recommendations, 1):
        course_info = courses[courses['course_id'] == rec_course_id].iloc[0]
        print(f"{i}. Course ID: {rec_course_id}")
        print(f"   Title: {course_info['title']}")
        print(f"   Department: {course_info['department']}")
        print(f"   School: {course_info['school']}")
        print(f"   Similarity: {similarity:.4f}")
        print()

except ValueError as e:
    print(f"Error: {e}")


Top 5 recommendations for course CMU-1021:
1. Course ID: CMU-1092
   Title: Course_92
   Department: Public Policy
   School: School of Computer Science
   Similarity: 0.4468

2. Course ID: CMU-1012
   Title: Course_12
   Department: Mechanical Engineering
   School: College of Engineering
   Similarity: 0.4250

3. Course ID: CMU-1036
   Title: Course_36
   Department: Business Analytics
   School: Tepper School of Business
   Similarity: 0.4250

4. Course ID: CMU-1074
   Title: Course_74
   Department: Art
   School: Heinz College of Information Systems and Public Policy
   Similarity: 0.4188

5. Course ID: CMU-1067
   Title: Course_67
   Department: Business Analytics
   School: Dietrich College of Humanities and Social Sciences
   Similarity: 0.4188

