In [1]:
# Libraries
import numpy as np
import pandas as pd
from sklearn import metrics as sk_metrics
from sklearn.metrics import silhouette_score

# Read the three input tables (test-user enrolments, user profiles, course genres)
# from CSV files located in the working directory.
test_users_df, profile_df, course_genres_df = (
    pd.read_csv(csv_name)
    for csv_name in ('rs_content_test.csv', 'user_profile.csv', 'course_genre.csv')
)


In [2]:
# Build the lookup structures consumed by the recommender

# Set of every course ID listed in the course/genre table
all_courses = {course_id for course_id in course_genres_df['COURSE_ID'].values}

# One row per distinct test user (column-wise max within each user group),
# followed by the plain list of those user IDs
test_users = (
    test_users_df
    .groupby(['user'])
    .max()
    .reset_index(drop=False)
)
test_user_ids = test_users['user'].tolist()


In [3]:
def generate_recommendation_scores_dot_product(score_threshold, test_user_ids, profile_df, test_users_df, all_courses, course_genres_df):
    """
    Generate content-based recommendation scores using a dot product.

    For every test user, the user's profile vector is multiplied with the
    feature vector of each course the user has not enrolled in. Courses whose
    score reaches ``score_threshold`` are reported.

    Args:
    score_threshold: minimum recommendation score (inclusive) for a course to be included in the results
    test_user_ids: iterable of user IDs to score; results follow this order
    profile_df: pandas DataFrame with a 'user' column; every column from position 1 onward is used as the user's profile vector
    test_users_df: pandas DataFrame with 'user' and 'item' columns listing the courses each user is enrolled in
    all_courses: iterable of candidate course IDs (typically a set; any iterable is accepted)
    course_genres_df: pandas DataFrame with a 'COURSE_ID' column; every column from position 2 onward is used as the course's feature vector

    Returns:
    Three parallel lists: user IDs, course IDs, and recommendation scores.
    Within one user, courses appear in the row order of course_genres_df.

    Notes:
    A user ID that has no row in profile_df is skipped (it contributes no
    results) instead of failing with a shape-mismatch error inside np.dot.
    """

    users = []
    courses = []
    scores = []

    # Normalise once so that lists/arrays of course IDs work as well as sets
    candidate_courses = set(all_courses)

    # loop through each test user to generate recommendation scores
    for user_id in test_user_ids:
        # extract the user profile of the current user
        test_user_profile = profile_df[profile_df['user'] == user_id]
        if test_user_profile.empty:
            # No profile vector -> nothing to score for this user
            continue
        test_user_vector = test_user_profile.iloc[:, 1:].values.reshape(1, -1)

        # extract the courses that the current user has enrolled in
        enrolled_courses = test_users_df[test_users_df['user'] == user_id]['item'].to_list()

        # keep only the candidate courses the user has not enrolled in
        unknown_courses = candidate_courses.difference(enrolled_courses)
        unknown_course_df = course_genres_df[course_genres_df['COURSE_ID'].isin(unknown_courses)]
        unknown_course_ids = unknown_course_df['COURSE_ID'].values
        unknown_course_vectors = unknown_course_df.iloc[:, 2:].values

        # (1, d) . (d, m) -> (1, m); flatten to one score per unknown course
        recommendation_scores = np.dot(test_user_vector, unknown_course_vectors.T).ravel()

        # select every course that meets the threshold in one vectorised step
        meets_threshold = recommendation_scores >= score_threshold
        selected_course_ids = unknown_course_ids[meets_threshold]

        users.extend([user_id] * len(selected_course_ids))
        courses.extend(selected_course_ids)
        # tolist() yields plain Python scalars, like the per-element .item() did
        scores.extend(recommendation_scores[meets_threshold].tolist())

    # return the lists of user IDs, course IDs, and recommendation scores
    return users, courses, scores

In [5]:
# Score every test user against their unseen courses, then collect the
# (user, course, score) triples into a dictionary and a DataFrame
score_threshold = 10
users, courses, scores = generate_recommendation_scores_dot_product(
    score_threshold,
    test_user_ids,
    profile_df,
    test_users_df,
    all_courses,
    course_genres_df,
)
res_dict = {'USER': users, 'COURSE_ID': courses, 'SCORE': scores}
res_df = pd.DataFrame(res_dict, columns=['USER', 'COURSE_ID', 'SCORE'])
res_df

Unnamed: 0,USER,COURSE_ID,SCORE
0,37465,RP0105EN,27.0
1,37465,GPXX06RFEN,12.0
2,37465,CC0271EN,15.0
3,37465,BD0145EN,24.0
4,37465,DE0205EN,15.0
...,...,...,...
53406,2087663,excourse88,15.0
53407,2087663,excourse89,15.0
53408,2087663,excourse90,15.0
53409,2087663,excourse92,15.0


In [12]:
# Silhouette coefficient of the recommendation scores, using each row's course ID
# as its cluster label (`scores` and `courses` come from the recommendation cell).
# The scorer is called through `sk_metrics` because this cell binds its *result* to
# the name `silhouette_score`: calling the bare imported function here would only
# work once per kernel and raise a TypeError ("... object is not callable") on re-run.
# NOTE(review): a negative value means scores overlap heavily across courses -
# confirm that course IDs are the intended cluster labels for this metric.
scores_array = np.array(scores).reshape(-1, 1)
silhouette_score = sk_metrics.silhouette_score(scores_array, labels=courses)

In [13]:
# Display the silhouette coefficient stored by the previous cell
silhouette_score

-0.5254520609819565