In [1]:
import pandas as pd
import numpy as np
import random
from surprise import Dataset, Reader, accuracy, dump
from surprise import SVD, SVDpp, KNNBasic, SlopeOne, CoClustering, NMF
from surprise import SlopeOne, CoClustering, NMF
from surprise import NormalPredictor, BaselineOnly, KNNWithMeans
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
import surprise_helper

### Preparation

In [2]:
rates = pd.read_csv("rates.csv")
rates.head(3)

Unnamed: 0,user_id,display_name,course_id,rate
0,11162,Jacynthe,1055720,5.0
1,19379,Norval,1055720,5.0
2,11393,Jany,1055720,4.0


In [3]:
courses = pd.read_csv("courses.csv")
courses.head(3)

Unnamed: 0,id,title,category,course_url
0,9287,Microsoft Excel 2010 Course Beginners/ Interme...,Office Productivity,/course/excel-tutorial/
1,9385,Microsoft Excel 2010: Advanced Training,Office Productivity,/course/advanced-excel/
2,9711,Beginner PHP and MySQL Tutorial,Development,/course/php-mysql-tutorial/


In [4]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(rates[["user_id", "course_id", "rate"]], reader)

### Model Selection & Testing

In [34]:
sim_options = {"name": "cosine", "user_based": False}
_ = cross_validate(KNNBasic(sim_options=sim_options), data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
_ = cross_validate(SVD(), data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
_ = cross_validate(SVDpp(), data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.7929  0.7979  0.7896  0.7935  0.0034  
MAE (testset)     0.4730  0.4754  0.4708  0.4731  0.0019  
Fit time          0.21    0.26    0.26    0.24    0.02    
Test time         1.10    0.59    0.42    0.71    0.29    
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.6981  0.7054  0.6952  0.6995  0.0043  
MAE (testset)     0.4714  0.4755  0.4718  0.4729  0.0018  
Fit time          1.33    1.29    1.29    1.31    0.02    
Test time         0.43    0.28    0.43    0.38    0.07    
Evaluating RMSE, MAE of algorithm SVDpp on 3 split(s).

             

In [35]:
_ = cross_validate(SlopeOne(), data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
_ = cross_validate(CoClustering(), data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
_ = cross_validate(NMF(), data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SlopeOne on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8684  0.8663  0.8704  0.8684  0.0017  
MAE (testset)     0.5423  0.5423  0.5444  0.5430  0.0010  
Fit time          0.32    0.38    0.39    0.36    0.03    
Test time         0.30    0.31    0.47    0.36    0.08    
Evaluating RMSE, MAE of algorithm CoClustering on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8416  0.8484  0.8317  0.8405  0.0069  
MAE (testset)     0.5426  0.5441  0.5401  0.5422  0.0017  
Fit time          4.27    4.23    4.28    4.26    0.02    
Test time         0.19    0.35    0.20    0.25    0.07    
Evaluating RMSE, MAE of algorithm NMF on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9200  0.9268  0.9245  0.9238  0.0028  
MAE (testset)     0.7109  0.7191  0.7142  0.7148  0.0034  
Fit time          3.05    3.04    3.06    3.05    0.01   

In [41]:
_ = cross_validate(NormalPredictor(), data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
_ = cross_validate(BaselineOnly(), data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
_ = cross_validate(KNNWithMeans(), data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9034  0.8998  0.8999  0.9010  0.0017  
MAE (testset)     0.6267  0.6220  0.6238  0.6242  0.0019  
Fit time          0.12    0.15    0.15    0.14    0.01    
Test time         0.34    0.18    0.32    0.28    0.07    
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.6935  0.6913  0.6925  0.6925  0.0009  
MAE (testset)     0.4736  0.4744  0.4735  0.4738  0.0004  
Fit time          0.37    0.39    0.40    0.39    0.01    
Test time         0.16    0.15    0.30    0.20    0.07    
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Don

In [6]:
param_grid = {"n_epochs": [5, 10, 15, 20, 25, 30], "lr_all": [0.002, 0.005, 0.01, 0.02]}
gs_svd = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5)
gs_svd.fit(data)
print(gs_svd.best_score["rmse"])
print(gs_svd.best_params["rmse"])

0.6970477800516851
{'n_epochs': 15, 'lr_all': 0.005}


In [7]:
trainset, testset = train_test_split(data, random_state=78, test_size=0.2)
pred = SVD(n_epochs=20, lr_all=0.005).fit(trainset).test(testset)
accuracy.rmse(pred)

RMSE: 0.7034


0.703407710233993

### Recommendation

In [5]:
full_trainset = data.build_full_trainset()

In [18]:
svd = SVD(n_epochs=15, lr_all=0.005)
svd.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x221798d7820>

In [19]:
# causes memory errors, instead calculate anti testset for the user
#anti_testset = full_trainset.build_anti_testset()
anti_testset_user, user_id = surprise_helper.get_anti_testset_user(full_trainset)
print("User ID:", user_id)
print("Display Name:", rates[rates["user_id"] == user_id]["display_name"].unique()[0])

User ID: 13624
Display Name: Karen Eloise


In [20]:
user_pred = svd.test(anti_testset_user)

In [21]:
courses_taken = rates[rates["user_id"] == user_id]["course_id"]
courses_taken_df = courses[courses["id"].isin(courses_taken)]
courses_taken_df

Unnamed: 0,id,title,category,course_url
521,566284,Do Básico ao Avançado - O Curso Completo de Mi...,Office Productivity,/course/curso-excel-completo/
1437,1276020,Adobe Photoshop 2018 Completo - do Iniciante a...,Design,/course/photoshop-para-iniciante-o-basico/
1908,1680762,Curso de fotografia profissional para iniciantes,Photography & Video,/course/fotografia-simples-e-direta-com-vinici...
1928,1694044,Hiper-Foco & Hiper-Produtividade,Personal Development,/course/focoeprodutividade/
2077,1949732,Hiper-Memória & Hiper-Aprendizagem,Personal Development,/course/hiper-memoria-hiper-aprendizagem/
2386,2650614,Hiper-Leitura | A Máxima Performance na Habili...,Personal Development,/course/hiper-leitura/


In [22]:
user_pred_df = pd.DataFrame(user_pred)
user_pred_df.sort_values(by=['est'],inplace=True,ascending = False)
recom_list = user_pred_df.head(5)['iid'].to_list()
courses[courses["id"].isin(recom_list)]

Unnamed: 0,id,title,category,course_url
1474,1309058,Programação Shell Script - Automatizando Rotin...,IT & Software,/course/programacao-shell-script/
2211,2237864,Curso Forex Completo de A à Z,Finance & Accounting,/course/curso_forex_exodustrading/
2307,2431900,Dart: De cero hasta los detalles,Development,/course/dart-de-cero-hasta-los-detalles/
2354,2559100,JavaScript : la formation ULTIME,Development,/course/javascript-la-formation-ultime/
2377,2634160,Básico ao Avançado - O curso Completo de Macro...,Office Productivity,/course/descomplicando-macros-vba-excel/


In [14]:
#dump.dump("svd.sav", algo=svd, verbose=1)

### Similar Courses with KNN

In [23]:
param_grid = {"k": [20, 40, 60]}
gs_knn = GridSearchCV(KNNBasic, param_grid, measures=["rmse", "mae"], cv=3)
gs_knn.fit(data)
print(gs_knn.best_score["rmse"])
print(gs_knn.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
0.7915462899484954
{'k': 20}


In [6]:
knn = KNNBasic(k=20, sim_options={"name": "cosine", "user_based": False})
knn.fit(full_trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x180f4c5d1f0>

In [10]:
similar_courses_raw_ids, course_id = surprise_helper.get_nearest_neighbors(full_trainset, knn)

In [11]:
courses[courses["id"]==course_id]

Unnamed: 0,id,title,category,course_url
2315,2462140,Learn JAVA Programming - Beginner to Master,Development,/course/java-se-programming/


In [12]:
courses[courses["id"].isin(similar_courses_raw_ids)]

Unnamed: 0,id,title,category,course_url
667,689956,Boost Your Excel Skills: Crash Course w/ Downl...,Office Productivity,/course/excel-tutorial-excel-training/
1145,1057142,The Complete VMware vSphere 7: Beginner to Adv...,IT & Software,/course/vmware-vsphere/
1431,1272092,Karuna Ki Reiki Master Training,Lifestyle,/course/karuna-ki-reiki-master-training/
1626,1400814,The Complete Final Cut Pro X Course Beginner T...,Photography & Video,/course/finalcutproxcourse/
1893,1661222,Python Data Analysis & Visualization Bootcamp,Business,/course/learn-data-analytics-complete-bootcamp...


In [None]:
#dump.dump("knn.sav", algo=knn, verbose=1)