In [2]:
from surprise import accuracy, Dataset, Reader
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNBasic, SVD
import pandas as pd
import numpy as np

In [3]:
reviews = pd.read_csv("reviews_cleaned.csv")
reviewed_courses = reviews["course_id"].unique()
reviews.shape

(268711, 4)

In [4]:
courses = pd.read_csv("Course_info.csv")

In [5]:
courses = courses[["id", "title", "category", "course_url"]]
courses = courses[courses["id"].isin(reviewed_courses)]
courses.shape

(2549, 4)

In [6]:
# surprise iid - uid process
reviews["user_id"] = reviews["user_id"].astype(float)
reviews["user_id"] = reviews["user_id"].astype(int)
reviews["user_id"] = reviews["user_id"].astype(str)
reviews["course_id"] = reviews["course_id"].astype(float)
reviews["course_id"] = reviews["course_id"].astype(int)
reviews["course_id"] = reviews["course_id"].astype(str)
courses["id"] = courses["id"].astype(float)
courses["id"] = courses["id"].astype(int)
courses["id"] = courses["id"].astype(str)

In [7]:
reader = Reader(rating_scale=(0, 5))
surprise_data = Dataset.load_from_df(reviews[["user_id", "course_id", "rate"]], reader)

In [8]:
trainset = surprise_data.build_full_trainset()

In [9]:
sim_options = {"name": "cosine", "user_based": False}
algo1 = KNNBasic(sim_options=sim_options)
algo2 = SVD()

In [10]:
cross_validate(algo1, surprise_data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7955  0.7855  0.7880  0.8000  0.7967  0.7931  0.0055  
MAE (testset)     0.4678  0.4634  0.4634  0.4700  0.4683  0.4666  0.0027  
Fit time          0.32    0.38    0.38    0.38    0.39    0.37    0.02    
Test time         0.48    0.47    0.47    0.46    0.48    0.47    0.01    


{'test_rmse': array([0.79545479, 0.78546519, 0.78799917, 0.79998143, 0.79667426]),
 'test_mae': array([0.46783305, 0.46343971, 0.46343449, 0.47004768, 0.46826657]),
 'fit_time': (0.32196593284606934,
  0.3810000419616699,
  0.3779947757720947,
  0.38396382331848145,
  0.3900306224822998),
 'test_time': (0.4790315628051758,
  0.4670286178588867,
  0.4719972610473633,
  0.4590299129486084,
  0.47702980041503906)}

In [11]:
cross_validate(algo2, surprise_data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6933  0.6954  0.6992  0.7003  0.6943  0.6965  0.0027  
MAE (testset)     0.4701  0.4701  0.4715  0.4739  0.4715  0.4714  0.0014  
Fit time          2.54    2.52    2.73    2.63    2.63    2.61    0.08    
Test time         0.46    0.47    0.32    0.31    0.31    0.37    0.07    


{'test_rmse': array([0.69333913, 0.69544236, 0.69915136, 0.70025309, 0.69432473]),
 'test_mae': array([0.47007244, 0.47011251, 0.4714877 , 0.47393864, 0.47148132]),
 'fit_time': (2.5415682792663574,
  2.52003812789917,
  2.7349979877471924,
  2.6259877681732178,
  2.6250033378601074),
 'test_time': (0.45999813079833984,
  0.46596407890319824,
  0.32102322578430176,
  0.31301307678222656,
  0.3099658489227295)}

In [12]:
algo1.fit(trainset)
algo2.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1f406511be0>

In [13]:
import pickle
filename = 'surprise.sav'
pickle.dump(algo2, open(filename, 'wb'))

In [154]:
from collections import Counter

def random_user(df, trainset):
    users = df['user_id'].unique()
    index = np.random.choice(users.shape[0], 1, replace=False)
    user_id = users[index][0]
    return user_id, trainset.to_inner_uid(user_id)

def get_recommendations_by_category(user, df, courses, trainset, algo):
    predictions = list()
    user_courses_list = list(df[df["user_id"] == user[0]]["course_id"])
    user_courses = courses[courses["id"].isin(user_courses_list)]
    cnt = Counter(user_courses["category"].array)
    user_category = cnt.most_common(1)[0][0]
    category_courses = courses.loc[courses["category"] == str(user_category)]
    for course in category_courses.id:
        course_id = trainset.to_inner_iid(course)
        pred = algo.predict(user[1], course_id)
        predictions.append([course, pred.est])
    predictions = np.array(predictions)
    return predictions[predictions[:,1].argsort()][::-1][:3]

def get_recommendations(user, courses, algo):
    predictions = list()
    for course in courses.id:
        course_id = trainset.to_inner_iid(course)
        pred = algo.predict(user[1], course_id)
        predictions.append([pred.uid, pred.iid, pred.r_ui, pred.est, pred.details, user[0], course])
        df = pd.DataFrame(np.array(predictions), columns=['uid', 'iid', 'rui', 'est', 'details', "user_id", "course_id"])
    return df.sort_values(by='est')[:5]

In [155]:
np.set_printoptions(suppress=True)
user = random_user(reviews, trainset)
recom = get_recommendations_by_category(user, reviews, courses, trainset, algo2)
recom_id = recom[:,0]
recom

recom2 = get_recommendations(user, courses, algo2)

In [156]:
user_courses = list(reviews[reviews["user_id"] == user[0]]["course_id"])

In [157]:
courses[courses["id"].isin(user_courses)]

Unnamed: 0,id,title,category,course_url
17800,851712,The Complete JavaScript Course 2022: From Zero...,Development,/course/the-complete-javascript-course/
18341,874012,The Ultimate Drawing Course - Beginner to Adva...,Design,/course/the-ultimate-drawing-course-beginner-t...
19385,917596,Introductory Photography Course,Photography & Video,/course/free-photography-course-for-beginners/
34357,1430746,The Complete Web Developer in 2022: Zero to Ma...,Development,/course/the-complete-web-developer-zero-to-mas...
108860,3377424,DP-203: Data Engineering on Microsoft Azure + ...,IT & Software,/course/dp200exam/


In [158]:
print("Recommending courses based on users category preference")
courses[courses["id"].isin(recom_id)]

Recommending courses based on users category preference


Unnamed: 0,id,title,category,course_url
19293,914024,Curso de TypeScript - El lenguaje utilizado po...,Development,/course/curso-de-typescript-el-lenguaje-utiliz...
20148,947098,Understanding TypeScript - 2022 Edition,Development,/course/understanding-typescript/
188139,4599580,Principios SOLID y Clean Code,Development,/course/solid-clean/


In [159]:
print("Recommending courses")
recom2.sort_values(by='est')[:5]

Recommending courses


Unnamed: 0,uid,iid,rui,est,details,user_id,course_id
0,39245,177,,4.626426,{'was_impossible': False},41424,9287
1695,39245,2250,,4.626426,{'was_impossible': False},41424,1455016
1696,39245,291,,4.626426,{'was_impossible': False},41424,1455618
1697,39245,1994,,4.626426,{'was_impossible': False},41424,1456464
1698,39245,936,,4.626426,{'was_impossible': False},41424,1458304


In [160]:
courses[courses["id"].isin(recom2["course_id"].to_numpy())]

Unnamed: 0,id,title,category,course_url
29,9287,Microsoft Excel 2010 Course Beginners/ Interme...,Office Productivity,/course/excel-tutorial/
35223,1455016,Build an app with ASPNET Core and Angular from...,Development,/course/build-an-app-with-aspnet-core-and-angu...
35236,1455618,WordPress 2022: The Complete WordPress Website...,Design,/course/complete-wp-course/
35273,1456464,CompTIA CySA+ (CS0-002) Complete Course & Prac...,IT & Software,/course/comptiacsaplus/
35326,1458304,The Digital Painting MEGA Course: Beginner to ...,Design,/course/the-digital-painting-mega-course-begin...
