# Load lib and data

In [None]:
import pandas as pd
import numpy as np
import math

In [None]:
rating_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/ratings.csv"
rating_df = pd.read_csv(rating_url)

In [None]:
rating_df.head()

Unnamed: 0,user,item,rating
0,1889878,CC0101EN,3.0
1,1342067,CL0101EN,3.0
2,1990814,ML0120ENv3,3.0
3,380098,BD0211EN,3.0
4,779563,DS0101EN,3.0


In [None]:
rating_sparse_df = rating_df.pivot(index='user', columns='item', values='rating').fillna(0).reset_index().rename_axis(index=None, columns=None)
rating_sparse_df

Unnamed: 0,user,AI0111EN,BC0101EN,BC0201EN,BC0202EN,BD0101EN,BD0111EN,BD0115EN,BD0121EN,BD0123EN,...,SW0201EN,TA0105,TA0105EN,TA0106EN,TMP0101EN,TMP0105EN,TMP0106,TMP107,WA0101EN,WA0103EN
0,2,0.0,3.0,0.0,0.0,3.0,2.0,0.0,2.0,2.0,...,0.0,2.0,0.0,3.0,0.0,2.0,2.0,0.0,3.0,0.0
1,4,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,...,0.0,2.0,0.0,0.0,0.0,2.0,2.0,0.0,2.0,2.0
2,5,2.0,2.0,2.0,0.0,2.0,0.0,0.0,0.0,2.0,...,0.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,2.0
3,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33896,2102054,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33897,2102356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33898,2102680,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33899,2102983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Implementation Option 1: Use Surprise library

In [None]:
!pip install scikit-surprise==1.1.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise==1.1.1
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp39-cp39-linux_x86_64.whl size=2135316 sha256=3baeb80dc060aa5db7ed029e8f11760454a05d628bd9efbca8a6afe73d593201
  Stored in directory: /root/.cache/pip/wheels/6b/10/c9/7f607c8cb522ef378844f41e63b30d7181a6495d2c1ae514e9
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [None]:
from surprise import KNNBasic
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy


## Demo

In [None]:
# Load the movielens-100k dataset (download it if needed),
data = Dataset.load_builtin('ml-100k', prompt=False)
# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=.25)

# We'll use the famous KNNBasic algorithm.
algo = KNNBasic()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)


Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9826


0.9825786645146664

## Load data

In [None]:
rating_df.to_csv("course_ratings.csv", index=False)
# Read the course rating dataset with columns user item rating
reader = Reader(
        line_format='user item rating', sep=',', skip_lines=1, rating_scale=(2, 3))

coruse_dataset = Dataset.load_from_file("course_ratings.csv", reader=reader)
# split into train / test set
trainset, testset = train_test_split(coruse_dataset, test_size=.3)
print(f"Total {trainset.n_users} users and {trainset.n_users} items in the trainingset")
print(f"Total {len(testset)} users and {len(testset)} items in the testset")

Total 31371 users and 31371 items in the trainingset
Total 69992 users and 69992 items in the testset


# Build model and train

- Define a KNNBasic() model
Note there are some arguments such as:
max_k and min_k, representing the max and min number of neighors for rating estimations
sim_option, representing similarity measurement such as cosine and whether you want it to be user_based or items_based
e.g., sim_option = {
       'name': 'cosine', 'user_based': False,
   }
more KNN model hyperparamets can be found here:
https://surprise.readthedocs.io/en/stable/knn_inspired.html
You may try different hyperparamet combinations to see which one has the best performance

- Train the KNNBasic model on the trainset, and predict ratings for the testset

- Then compute RMSE

In [None]:
sim_options = {
    'name':'cosine', 'user_based': False,
}

# We'll use the famous KNNBasic algorithm.
algo = KNNBasic(sim_options=sim_options)

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)


Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.1924


0.19243568041617795

Note: User_based much more complex than items based

# Implemetation options 2: Use numpy, pandas, and sklearn

**User-based collaborative filtering**

In [None]:
# read data
import pandas as pd
import numpy as np
rating_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/ratings.csv"
rating_df = pd.read_csv(rating_url)
# size too much cause problem
rating_df = rating_df[:80000]


### Split train/test set

In [None]:
rating_df

Unnamed: 0,user,item,rating
0,1889878,CC0101EN,3.0
1,1342067,CL0101EN,3.0
2,1990814,ML0120ENv3,3.0
3,380098,BD0211EN,3.0
4,779563,DS0101EN,3.0
...,...,...,...
79995,404836,BD0111EN,3.0
79996,950442,DS0103EN,3.0
79997,1049317,ST0101EN,3.0
79998,767514,PY0101EN,3.0


In [None]:
from sklearn.model_selection import train_test_split
rating_df_train, rating_df_test  = train_test_split(rating_df, test_size = 0.1, random_state = 42)

In [None]:
rating_df_train

Unnamed: 0,user,item,rating
51531,1453760,DS0105EN,3.0
77647,1177338,PY0101EN,3.0
14043,196746,BD0141EN,3.0
75237,727800,ML0122EN,2.0
44043,1438065,BD0211EN,3.0
...,...,...,...
6265,1254964,DW0101EN,3.0
54886,1938013,CC0150EN,3.0
76820,1562132,ST0101EN,3.0
860,916637,ST0101EN,3.0


In [None]:
rating_sparse_df = rating_df_train.pivot(index='user', columns='item', values='rating').fillna(0).reset_index().rename_axis(index=None, columns=None)
rating_sparse_df

Unnamed: 0,user,AI0111EN,BC0101EN,BC0201EN,BC0202EN,BD0101EN,BD0111EN,BD0115EN,BD0121EN,BD0123EN,...,SW0201EN,TA0105,TA0105EN,TA0106EN,TMP0101EN,TMP0105EN,TMP0106,TMP107,WA0101EN,WA0103EN
0,2,0.0,3.0,0.0,0.0,3.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,3.0,0.0
1,4,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
3,8,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25988,2102054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25989,2102356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25990,2102680,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25991,2102983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
## - Calculate the similarity between two items using their rating history (the row vectors of interaction matrix)
## - Build a similarity matrix for each pair of items with the training dataset
from sklearn.metrics.pairwise import cosine_similarity
# Calculate cosine similarity matrix for user
sim_matrix = cosine_similarity(rating_sparse_df)
sim_matrix_df = pd.DataFrame(sim_matrix)
# reset columns and index
sim_matrix_df.columns = rating_sparse_df.iloc[:,0]
sim_matrix_df = sim_matrix_df.set_index(rating_sparse_df.iloc[:,0])
# show the sims matrix
sim_matrix_df

user,2,4,5,8,9,12,16,20,22,23,...,2100030,2100731,2101142,2101262,2101535,2102054,2102356,2102680,2102983,2103039
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1.000000,0.276736,0.243143,0.129066,0.124159,0.131228,0.163571,0.132378,0.132492,0.149825,...,0.133039,0.133038,0.133038,0.133038,0.133038,0.133038,0.133038,0.133038,0.133038,0.133038
4,0.276736,1.000000,0.450570,0.538138,0.517678,0.547153,0.582202,0.551947,0.552422,0.552615,...,0.554700,0.554700,0.554700,0.554700,0.554700,0.554700,0.554700,0.554701,0.554700,0.554700
5,0.243143,0.450570,1.000000,0.492515,0.473789,0.500766,0.487171,0.505154,0.505588,0.505765,...,0.507673,0.507673,0.507673,0.507673,0.507673,0.507673,0.507673,0.507673,0.507673,0.507673
8,0.129066,0.538138,0.492515,1.000000,0.905392,0.956943,0.930964,0.965328,0.966158,0.966495,...,0.970143,0.970143,0.970143,0.970143,0.970143,0.970143,0.970143,0.970143,0.970143,0.970143
9,0.124159,0.517678,0.473789,0.905392,1.000000,0.954653,0.895568,0.928625,0.929424,0.929748,...,0.933257,0.933257,0.933257,0.933257,0.933257,0.933257,0.933257,0.933257,0.933257,0.933257
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2102054,0.133038,0.554700,0.507673,0.970143,0.933257,0.986394,0.959616,0.995037,0.995893,0.996241,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
2102356,0.133038,0.554700,0.507673,0.970143,0.933257,0.986394,0.959616,0.995037,0.995893,0.996241,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
2102680,0.133038,0.554701,0.507673,0.970143,0.933257,0.986394,0.959616,0.995037,0.995893,0.996241,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
2102983,0.133038,0.554700,0.507673,0.970143,0.933257,0.986394,0.959616,0.995037,0.995893,0.996241,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000


In [None]:
## - For each items, find its k nearest neighbors in the sim matrix
def findKNearestNeighbors(user_id, sim_matrix_df = sim_matrix_df, k = 5):
  sims = sim_matrix_df.loc[:,user_id].sort_values(axis = 0,ascending = False)[1:k+1]
  return sims

In [None]:
findKNearestNeighbors(5)

user
28    0.538953
61    0.523998
25    0.520597
40    0.517180
44    0.516370
Name: 5, dtype: float64

In [None]:
# predict rating for a user and item
def predictRating(user_id, item_id, sim_matrix_df = sim_matrix_df, rating_sparse_df = rating_sparse_df, k = 5):
  sim_users = findKNearestNeighbors(user_id, sim_matrix_df = sim_matrix_df, k = k)
  knn_sims = sim_users
  knn_ratings = []
  for user in sim_users.index:
    knn_ratings.append(rating_sparse_df[rating_sparse_df['user'] == user_id][item_id].to_numpy()[0])
  rating_user_item = np.dot(knn_sims, knn_ratings)/ sum(knn_sims)
  return rating_user_item


In [None]:
predictRating(user_id = 2, item_id = 'BD0123EN')

2.0

In [None]:
## - For each rating in the test dataset, estimate its rating using the KNN collaborative filtering equations shown before
def evaluate(rating_df_test = rating_df_test):
  test_array = rating_df_test.to_numpy()
  rmse = 0
  for user_id, item_id, actual_rate in rating_df_test.to_numpy():
    try:
      predict_rate = predictRating(user_id, item_id)
    except:
      predict_rate = 0
    rmse = rmse + (actual_rate - predict_rate) ** 2

  return np.sqrt(rmse / rating_df_test.shape[0])

In [None]:
for user_id, item_id, actual_rate in rating_df_test.to_numpy():
  print(user_id)
  print(item_id)
  print(actual_rate)
  break


622358
ML0115EN
3.0


In [None]:
rmse = evaluate()
print("rmse = ", rmse)

rmse =  2.959307351391538


In [None]:
rating_df_test

Unnamed: 0,user,item,rating
47044,622358,ML0115EN,3.0
44295,1226824,DV0101EN,3.0
74783,1501800,DS0301EN,3.0
70975,1293117,ML0101ENv3,3.0
46645,1386422,CO0101EN,3.0
...,...,...,...
18536,938914,ML0101EN,3.0
44268,702914,BD0101EN,3.0
19723,615186,ML0101EN,3.0
34891,1695287,CB0103EN,3.0


In [None]:
predictRating(user_id = 615186, item_id = 'CO0101EN')

0.0