# Generating course recommendations based on the popular courses in the same cluster

The clustering-based recommender system first groups all users based on their profiles, and maintains a popular courses list for each group.

For any group member who needs course recommendations, the algorithm recommends the unselected courses from the popular course lists.

In [2]:
!pip install scikit-learn==1.0.2
!pip install seaborn==0.11.1



In [3]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

%matplotlib inline

In [4]:
test_user_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/rs_content_test.csv"
test_users_df = pd.read_csv(test_user_url)[['user', 'item']]
test_users_df.head()

Unnamed: 0,user,item
0,1502801,RP0105EN
1,1609720,CNSC02EN
2,1347188,CO0301EN
3,755067,ML0103EN
4,538595,BD0115EN


In [5]:
user_profile_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/user_profile.csv"
user_profile_df = pd.read_csv(user_profile_url)
user_profile_df.head()

Unnamed: 0,user,Database,Python,CloudComputing,DataAnalysis,Containers,MachineLearning,ComputerVision,DataScience,BigData,Chatbot,R,BackendDev,FrontendDev,Blockchain
0,2,52.0,14.0,6.0,43.0,3.0,33.0,0.0,29.0,41.0,2.0,18.0,34.0,9.0,6.0
1,4,40.0,2.0,4.0,28.0,0.0,14.0,0.0,20.0,24.0,0.0,6.0,6.0,0.0,2.0
2,5,24.0,8.0,18.0,24.0,0.0,30.0,0.0,22.0,14.0,2.0,14.0,26.0,4.0,6.0
3,7,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
4,8,6.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,6.0,0.0,2.0,0.0,0.0,0.0


In [74]:
course_genre_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/course_genre.csv"
course_df = pd.read_csv(course_genre_url)

In [6]:
feature_names = list(user_profile_df.columns[1:])

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
user_profile_df[feature_names] = scaler.fit_transform(user_profile_df[feature_names])

In [9]:
features = user_profile_df.loc[:, user_profile_df.columns != 'user']
user_ids = user_profile_df.loc[:, user_profile_df.columns == 'user']

In [10]:
from sklearn.cluster import KMeans

In [11]:
km = KMeans(n_clusters=20, random_state=42)
km = km.fit(features)

cluster_labels = km.labels_
cluster_labels

array([ 3,  3,  6, ..., 18,  7,  7], dtype=int32)

In [12]:
def combine_cluster_labels(user_ids, labels):
    labels_df = pd.DataFrame(labels)
    cluster_df = pd.merge(user_ids, labels_df, left_index=True, right_index=True)
    cluster_df.columns = ['user', 'cluster']
    return cluster_df

In [13]:
labels = cluster_labels
cluster_df = combine_cluster_labels(user_ids, labels)

In [14]:
test_users_labelled = pd.merge(test_users_df, cluster_df, left_on='user', right_on='user')
test_users_labelled

Unnamed: 0,user,item,cluster
0,1502801,RP0105EN,13
1,1502801,BD0131EN,13
2,1502801,BD0212EN,13
3,1502801,BD0115EN,13
4,1502801,BD0211EN,13
...,...,...,...
9397,630511,BD0121EN,2
9398,630511,SC0101EN,2
9399,630511,BD0111EN,2
9400,630511,BD0115EN,2


In [15]:
courses_cluster = test_users_labelled[['item', 'cluster']]
courses_cluster['count'] = [1] * len(courses_cluster)
Course_enroll = courses_cluster.groupby(['cluster','item']).agg(enrollments = ('count','sum')).reset_index()
Course_enroll

Unnamed: 0,cluster,item,enrollments
0,0,AI0111EN,2
1,0,BC0101EN,10
2,0,BC0201EN,1
3,0,BC0202EN,1
4,0,BD0101EN,30
...,...,...,...
1221,19,TA0105,4
1222,19,TA0105EN,1
1223,19,TA0106EN,1
1224,19,TMP0105EN,1


In [16]:
Sorted_courses = Course_enroll.sort_values("enrollments",ascending = False)
Sorted_courses

Unnamed: 0,cluster,item,enrollments
57,0,PY0101EN,102
144,2,BD0111EN,95
143,2,BD0101EN,95
25,0,DA0101EN,87
1135,18,DS0101EN,85
...,...,...,...
994,16,BD0145EN,1
993,16,BD0143EN,1
502,8,DJ0101EN,1
230,3,DS0105EN,1


In [22]:
test_users_labelled

Unnamed: 0,user,item,cluster
0,1502801,RP0105EN,13
1,1502801,BD0131EN,13
2,1502801,BD0212EN,13
3,1502801,BD0115EN,13
4,1502801,BD0211EN,13
...,...,...,...
9397,630511,BD0121EN,2
9398,630511,SC0101EN,2
9399,630511,BD0111EN,2
9400,630511,BD0115EN,2


In [50]:
user_id = 1502801
cluster_label = test_users_labelled[test_users_labelled["user"] == user_id]["cluster"].unique()
cluster_label[0]

13

In [51]:
courses_same_cluster = test_users_labelled[test_users_labelled["cluster"] == cluster_label[0]]["item"].unique()
courses_same_cluster

array(['RP0105EN', 'BD0131EN', 'BD0212EN', 'BD0115EN', 'BD0211EN',
       'BD0221EN', 'BD0223EN', 'BD0141EN', 'BD0111EN', 'BD0101EN',
       'BD0121EN', 'DB0151EN', 'SC0101EN', 'DB0101EN', 'BD0145EN',
       'PY0101EN', 'TA0106EN', 'TA0105', 'DS0103EN', 'DS0101EN',
       'ML0101EN', 'DJ0101EN', 'SC0103EN', 'BD0137EN', 'BD0143EN',
       'ST0101EN', 'ML0120EN', 'BC0101EN', 'RP0101EN', 'BD0153EN',
       'PA0101EN', 'ML0115EN', 'DS0105EN', 'DS0321EN', 'SC0105EN',
       'BD0123EN', 'ML0101ENv3', 'CO0101EN', 'CL0101EN', 'BD0133EN',
       'CO0201EN', 'TA0105EN', 'DA0101EN', 'CC0201EN', 'CO0301EN',
       'DE0205EN', 'ML0109EN', 'WA0101EN', 'CC0103EN', 'DV0101EN',
       'LB0105ENv1', 'BD0151EN', 'DS0301EN', 'CC0101EN', 'DV0151EN',
       'BD0135EN', 'RP0151EN', 'TMP0105EN', 'CB0103EN', 'DW0101EN',
       'ML0151EN'], dtype=object)

In [40]:
courses_same_cluster_list = list(courses_same_cluster)
courses_same_cluster_list

['RP0105EN',
 'BD0131EN',
 'BD0212EN',
 'BD0115EN',
 'BD0211EN',
 'BD0221EN',
 'BD0223EN',
 'BD0141EN',
 'BD0111EN',
 'BD0101EN',
 'BD0121EN',
 'DB0151EN',
 'SC0101EN',
 'DB0101EN',
 'BD0145EN',
 'PY0101EN',
 'TA0106EN',
 'TA0105',
 'DS0103EN',
 'DS0101EN',
 'ML0101EN',
 'DJ0101EN',
 'SC0103EN',
 'BD0137EN',
 'BD0143EN',
 'ST0101EN',
 'ML0120EN',
 'BC0101EN',
 'RP0101EN',
 'BD0153EN',
 'PA0101EN',
 'ML0115EN',
 'DS0105EN',
 'DS0321EN',
 'SC0105EN',
 'BD0123EN',
 'ML0101ENv3',
 'CO0101EN',
 'CL0101EN',
 'BD0133EN',
 'CO0201EN',
 'TA0105EN',
 'DA0101EN',
 'CC0201EN',
 'CO0301EN',
 'DE0205EN',
 'ML0109EN',
 'WA0101EN',
 'CC0103EN',
 'DV0101EN',
 'LB0105ENv1',
 'BD0151EN',
 'DS0301EN',
 'CC0101EN',
 'DV0151EN',
 'BD0135EN',
 'RP0151EN',
 'TMP0105EN',
 'CB0103EN',
 'DW0101EN',
 'ML0151EN']

In [65]:
test_users = test_users_df.groupby(['user']).max().reset_index(drop=False)
test_user_ids = test_users['user'].to_list()
print(f"Total numbers of test users {len(test_user_ids)}")

Total numbers of test users 1000


In [82]:
courses_cluster = test_users_labelled[['item', 'cluster']]
courses_cluster['count'] = [1] * len(courses_cluster)
Course_enroll = courses_cluster.groupby(['cluster','item']).agg(enrollments = ('count','sum')).reset_index()
Course_enroll

Unnamed: 0,cluster,item,enrollments
0,0,AI0111EN,2
1,0,BC0101EN,10
2,0,BC0201EN,1
3,0,BC0202EN,1
4,0,BD0101EN,30
...,...,...,...
1221,19,TA0105,4
1222,19,TA0105EN,1
1223,19,TA0106EN,1
1224,19,TMP0105EN,1


In [117]:
cluster_label = 0
users = []
courses = []
cluster = []
for user_id in test_user_ids:
    cluster_label = test_users_labelled[test_users_labelled["user"] == user_id]["cluster"].unique()
    same_cluster_ids = test_users_labelled[test_users_labelled["cluster"] == cluster_label[0]]["item"].unique()
    same_cluster_ids = set(same_cluster_ids)
    courses_same_cluster = course_df[course_df['COURSE_ID'].isin(same_cluster_ids)]
    current_enrolled_course_ids = test_users_df[test_users_df['user'] == user_id]['item'].to_list()
    current_enrolled_courses = course_df[course_df['COURSE_ID'].isin(current_enrolled_course_ids)]
    unselected_course_ids = same_cluster_ids.difference(current_enrolled_course_ids)
    unselected_course_ids_list = list(unselected_course_ids)
    for i in range(0, len(unselected_course_ids)):
            courses.append(unselected_course_ids_list[i])
            users.append(user_id)
            cluster.append(cluster_label[0])

res_dict = {}

res_dict['USER'] = users
res_dict['COURSE_ID'] = courses
res_dict['Cluster']=cluster



In [118]:
res_df = pd.DataFrame(res_dict, columns=['USER', 'COURSE_ID','Cluster'])

In [119]:
res_df

Unnamed: 0,USER,COURSE_ID,Cluster
0,37465,DB0151EN,2
1,37465,AI0111EN,2
2,37465,DS0301EN,2
3,37465,BC0101EN,2
4,37465,CC0103EN,2
...,...,...,...
58404,2087663,BD0145EN,15
58405,2087663,CP0101EN,15
58406,2087663,CL0101EN,15
58407,2087663,IT0101EN,15


In [121]:
courses_cluster = res_df[['COURSE_ID', 'Cluster']]
courses_cluster['count'] = [1] * len(courses_cluster)
Course_enroll = courses_cluster.groupby(['Cluster','COURSE_ID']).agg(enrollments = ('count','sum')).reset_index()
Course_enroll

Unnamed: 0,Cluster,COURSE_ID,enrollments
0,0,AI0111EN,105
1,0,BC0101EN,97
2,0,BC0201EN,106
3,0,BC0202EN,106
4,0,BD0101EN,77
...,...,...,...
1204,19,TA0105,30
1205,19,TA0105EN,33
1206,19,TA0106EN,33
1207,19,TMP0105EN,33


In [123]:
#On average, how many new courses have been recommended per test user?
recom_total = res_df.groupby(['USER']).size()
recom_total.mean()

58.409

In [124]:
sorted_Course_enroll = Course_enroll.sort_values(by="enrollments",ascending = False)
sorted_Course_enroll

Unnamed: 0,Cluster,COURSE_ID,enrollments
177,2,DS0201EN,110
187,2,ML0109EN,110
153,2,BD0151EN,110
158,2,BD0223EN,110
183,2,LB0103ENv1,110
...,...,...,...
208,3,BD0101EN,1
351,6,RP0101EN,1
209,3,BD0115EN,1
350,6,PY0101EN,1


In [125]:
sorted_Course_enroll = sorted_Course_enroll.reset_index()
sorted_Course_enroll

Unnamed: 0,index,Cluster,COURSE_ID,enrollments
0,177,2,DS0201EN,110
1,187,2,ML0109EN,110
2,153,2,BD0151EN,110
3,158,2,BD0223EN,110
4,183,2,LB0103ENv1,110
...,...,...,...,...
1204,208,3,BD0101EN,1
1205,351,6,RP0101EN,1
1206,209,3,BD0115EN,1
1207,350,6,PY0101EN,1


In [127]:
#Selecting top 10 courses
top10_rec = sorted_Course_enroll.COURSE_ID.iloc[0:10]

In [128]:
top10_rec = pd.DataFrame(top10_rec)

In [130]:
courseID_Title = course_df[["COURSE_ID","TITLE"]]
courseID_Title.drop_duplicates(subset=['COURSE_ID'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [131]:
top10_rec_title = top10_rec.merge(courseID_Title, how="left", on= "COURSE_ID")
top10_rec_title[["COURSE_ID","TITLE"]]

Unnamed: 0,COURSE_ID,TITLE
0,DS0201EN,end to end data science on cloudpak for data
1,ML0109EN,machine learning dimensionality reduction
2,BD0151EN,text analytics 101
3,BD0223EN,exploring spark s graphx
4,LB0103ENv1,reactive architecture domain driven design
5,WA0103EN,watson analytics for social media
6,LB0101ENv1,reactive architecture introduction to reactiv...
7,CC0210EN,serverless computing using cloud functions d...
8,DV0151EN,data visualization with r
9,CO0401EN,beyond the basics istio and ibm cloud kuberne...
