In [2]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357258 sha256=dab66adcb08cd1260551e11be5b4bf6e489a27eac6fd950281e512b4edd6ba73
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully inst

In [3]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#for collaborative filtering
import os
import math
import random

from surprise import accuracy, Reader, Dataset, dump
from surprise import NormalPredictor, KNNBasic, SVD, SVDpp
from surprise.model_selection import cross_validate, GridSearchCV

In [5]:
# Show floats with two decimals in every DataFrame display.
pd.options.display.float_format = '{:.2f}'.format

# Read ISBN as text in both files. ISBNs are identifiers, not numbers: left to
# type inference an all-digit column is parsed as int (leading zeros are lost),
# while a column holding check-digit 'X' values stays text, so the keys of the
# two frames may never match each other.
Ratings = pd.read_csv('Ratings.csv', dtype={'ISBN': str})
Books = pd.read_csv('DataGabungan.csv', dtype={'ISBN': str})

In [6]:
# Give the rating columns the snake_case names used by the rest of the notebook.
Ratings = Ratings.rename(columns={"User-ID": "user_id", "Book-Rating": "rating"})

# Keep only the rows whose rating is different from 0.
Ratings = Ratings.loc[Ratings['rating'] != 0]

In [7]:

# NOTE: this cell overwrites Ratings and Books. Both thresholds below are
# evaluated on the ratings as they are *before* filtering, so after the two
# filters are intersected a kept user can be left with fewer than 30 rows.
# Re-running the cell re-applies the thresholds to the already-filtered data;
# reload the CSVs first if the original result is needed.

# Step 1: Count the number of distinct users who rated each book
counters = Ratings.groupby('ISBN')['user_id'].nunique().reset_index(name='user_count')

# Step 2: ISBNs of the books rated by at least 40 distinct users
book_filter = counters[counters['user_count'] >= 40]['ISBN']

# Step 3: Boolean Series indexed by user_id, True for users with at least 30 ratings
user_filter = Ratings.groupby('user_id').size() >= 30

# Step 4: Keep only ratings whose user AND book both pass their filter
Ratings = Ratings[Ratings['user_id'].isin(user_filter[user_filter].index) & Ratings['ISBN'].isin(book_filter)]

# Drop the helper columns from the catalogue. errors='ignore' keeps the cell
# re-runnable: the previous in-place drop raised KeyError on a second run
# because the columns were already gone.
Books = Books.drop(columns=['Unnamed: 0', 'user_id', 'rating'], errors='ignore')

In [8]:
# Fix the seeds of both random number generators so reruns give the same split.
my_seed = 0
np.random.seed(my_seed)
random.seed(my_seed)

# Wrap the ratings frame in a Surprise dataset; ratings are on a 1-10 scale.
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(Ratings, reader)

# Shuffle the raw rating list in place before cutting it.
all_ratings = data.raw_ratings
random.shuffle(all_ratings)

# First 70% of the shuffled ratings -> train, remaining 30% -> test.
threshold = int(len(all_ratings) * 0.7)
train_ratings, test_ratings = all_ratings[:threshold], all_ratings[threshold:]

def book_read(user_id):
    '''Return the catalogue ISBNs and the ISBNs already rated by user_id.

    The result is a 2-tuple of plain lists:
    every value of Books['ISBN'], followed by the Ratings['ISBN'] values of
    the rows whose user_id column equals the argument.
    '''
    catalogue = list(Books['ISBN'])
    rated_mask = Ratings['user_id'] == user_id
    already_rated = list(Ratings.loc[rated_mask, 'ISBN'])
    return catalogue, already_rated

In [9]:
# Number of ratings per user (Series indexed by user_id).
user_rating_counts = Ratings.groupby('user_id').size()

# No extra threshold is applied here; just order users from most to fewest ratings.
filtered_users_sorted = user_rating_counts.sort_values(ascending=False)

# Flatten into a two-column frame (user_id, count_ratings) and display it.
df_filtered_users = filtered_users_sorted.reset_index(name='count_ratings')
df_filtered_users

Unnamed: 0,user_id,count_ratings
0,11676,459
1,16795,130
2,95359,120
3,60244,87
4,104636,83
...,...,...
2117,204946,1
2118,218241,1
2119,99227,1
2120,31008,1


In [16]:
%%time

# Evaluate on the training split only: point the dataset at the train ratings.
data.raw_ratings = train_ratings

# Algorithm under evaluation: Surprise's NormalPredictor.
npred = NormalPredictor()

# 5-fold cross-validation (RMSE only) inside the training split.
np_result = cross_validate(npred, data, measures=['RMSE'], cv=5, verbose=True, n_jobs=1)

# Fit once on the whole training split.
trainset = data.build_full_trainset()
npred.fit(trainset)
print('\n')

# Error on the same ratings the model was fitted on (no folds).
# accuracy.rmse / accuracy.mae print the metric themselves; the prints only label it.
np_train_pred = npred.test(trainset.build_testset())
print('Train RMSE:')
train_rmse = accuracy.rmse(np_train_pred)
print('Train MAE:')
train_mae = accuracy.mae(np_train_pred)
print('\n')

# Error on the held-out test split.
testset = data.construct_testset(test_ratings)
np_test_pred = npred.test(testset)
print('Test RMSE:')
test_rmse = accuracy.rmse(np_test_pred)
print('Test MAE:')
test_mae = accuracy.mae(np_test_pred)

# Restore the complete rating list and refit, so that get_recommendation_npred
# predicts with a model fitted on every rating.
data.raw_ratings = all_ratings
npred.fit(data.build_full_trainset())

def get_recommendation_npred(user_id, n=5):
    '''Return the n unrated books with the highest NormalPredictor estimate.

    Candidates are the ISBNs from book_read(user_id) that the user has not
    rated yet, each ISBN considered once. Every candidate is scored with
    npred.predict(user_id, isbn).est and the best n are looked up in Books.

    Parameters:
        user_id: raw user id, as stored in Ratings['user_id'].
        n: number of books to recommend (default 5).

    Returns:
        DataFrame with the columns ISBN, title, author, average_rating and
        count_ratings, ordered from the highest to the lowest estimate.
        None (after printing a notice) when fewer than n candidates exist.
    '''
    all_books, user_books = book_read(user_id)

    # A set makes the "already rated" test constant-time; dict.fromkeys drops
    # repeated catalogue ISBNs while keeping their first-seen order.
    already_read = set(user_books)
    next_books = [book for book in dict.fromkeys(all_books) if book not in already_read]

    if n > len(next_books):
        print('Please reduce your recommendation request')
        return

    ratings = [(book, npred.predict(user_id, book).est) for book in next_books]
    # list.sort is stable, so equal estimates keep catalogue order.
    ratings.sort(key=lambda pair: pair[1], reverse=True)
    rank = {book: position for position, (book, _) in enumerate(ratings[:n])}

    columns = ['ISBN', 'title', 'author', 'average_rating', 'count_ratings']
    top = Books[Books.ISBN.isin(list(rank))][columns]
    # isin() yields rows in catalogue order; put them back in ranked order.
    order = top['ISBN'].map(rank).to_numpy().argsort(kind='stable')
    return top.iloc[order]

Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    2.3765  2.3590  2.3261  2.3316  2.3277  2.3442  0.0201  
Fit time          0.02    0.03    0.03    0.03    0.03    0.03    0.00    
Test time         0.02    0.02    0.02    0.02    0.03    0.02    0.00    


Train RMSE:
RMSE: 2.3291
Train MAE:
MAE:  1.8381


Test RMSE:
RMSE: 2.3710
Test MAE:
MAE:  1.8625
CPU times: user 814 ms, sys: 12.8 ms, total: 827 ms
Wall time: 833 ms


In [18]:
# Top-5 NormalPredictor recommendations for user 11676.
get_recommendation_npred(11676, 5)

Unnamed: 0,ISBN,title,author,average_rating,count_ratings
3,60809833,Brave New World,Aldous Huxley,9.1,100
55,312983271,Full House (Janet Evanovich's Full Series),Janet Evanovich,7.72,1600
64,316690619,Cradle and All,James Patterson,6.07,225
86,345351525,The Queen of the Damned (Vampire Chronicles (P...,Anne Rice,7.61,2116
118,380807343,Coraline,Neil Gaiman,8.31,169


In [10]:
%%time
# Evaluate on the training split only: point the dataset at the train ratings.
data.raw_ratings = train_ratings

# Algorithm under evaluation: KNNBasic with user-user cosine similarity.
sim_options = {"name": "cosine",
              "user_based": True}
knn = KNNBasic(sim_options=sim_options)

# 5-fold cross-validation (RMSE only) inside the training split.
knn_result = cross_validate(knn, data, measures=['RMSE'], cv=5, verbose=True, n_jobs = 1)

# Fit once on the whole training split.
trainset = data.build_full_trainset()
knn.fit(trainset)

# Error on the same ratings the model was fitted on (no folds).
# accuracy.rmse / accuracy.mae print the metric themselves; the prints only label it.
knn_train_pred = knn.test(trainset.build_testset())
print('Train RMSE:')
train_rmse = accuracy.rmse(knn_train_pred)
print('Train MAE:')
train_mae = accuracy.mae(knn_train_pred)
print('\n')

# Error on the held-out test split.
testset = data.construct_testset(test_ratings)
knn_test_pred = knn.test(testset)
print('Test RMSE:')
test_rmse = accuracy.rmse(knn_test_pred)
print('Test MAE:')
test_mae = accuracy.mae(knn_test_pred)
print('\n')

# Restore the complete rating list and refit, so that get_recommendation_knn
# predicts with a model fitted on every rating.
data.raw_ratings = all_ratings
knn.fit(data.build_full_trainset())

def get_recommendation_knn(user_id, n=5):
    '''Return the n unrated books with the highest KNN estimate for user_id.

    Candidates are the ISBNs from book_read(user_id) that the user has not
    rated yet, each ISBN considered once. Every candidate is scored with
    knn.predict(user_id, isbn).est and the best n are looked up in Books.

    Parameters:
        user_id: raw user id, as stored in Ratings['user_id'].
        n: number of books to recommend (default 5).

    Returns:
        DataFrame with the columns ISBN, title, author, average_rating and
        count_ratings, ordered from the highest to the lowest estimate.
        None (after printing a notice) when fewer than n candidates exist.
    '''
    all_books, user_books = book_read(user_id)

    # A set makes the "already rated" test constant-time; dict.fromkeys drops
    # repeated catalogue ISBNs while keeping their first-seen order.
    already_read = set(user_books)
    next_books = [book for book in dict.fromkeys(all_books) if book not in already_read]

    if n > len(next_books):
        print('Please reduce your recommendation request')
        return

    ratings = [(book, knn.predict(user_id, book).est) for book in next_books]
    # list.sort is stable, so equal estimates keep catalogue order.
    ratings.sort(key=lambda pair: pair[1], reverse=True)
    rank = {book: position for position, (book, _) in enumerate(ratings[:n])}

    columns = ['ISBN', 'title', 'author', 'average_rating', 'count_ratings']
    top = Books[Books.ISBN.isin(list(rank))][columns]
    # isin() yields rows in catalogue order; put them back in ranked order.
    order = top['ISBN'].map(rank).to_numpy().argsort(kind='stable')
    return top.iloc[order]

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.9030  1.9338  1.8719  1.8589  1.8951  1.8925  0.0260  
Fit time          0.37    0.35    0.30    0.26    0.16    0.29    0.08    
Test time         0.10    0.15    0.15    0.12    0.14    0.13    0.02    
Computing the cosine similarity matrix...
Done computing similarity matrix.
Train RMSE:
RMSE: 1.6055
Train MAE:
MAE:  1.2609


Test RMSE:
RMSE: 1.8271
Test MAE:
MAE:  1.4134


Computing the cosine similarity matrix...
Done computing similarity matrix.
CPU times: use

In [11]:
# Top-5 KNN recommendations for user 11676.
get_recommendation_knn(11676, 5)

Unnamed: 0,ISBN,title,author,average_rating,count_ratings
82,0345339711,"The Two Towers (The Lord of the Rings, Part 2)",J.R.R. TOLKIEN,9.18,1444
83,0345339738,"The Return of the King (The Lord of the Rings,...",J.R.R. TOLKIEN,9.41,1369
173,0439136350,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,9.2,6561
177,043935806X,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,9.03,12544
333,076790592X,"Tuesdays with Morrie: An Old Man, a Young Man,...",Mitch Albom,9.11,81


In [12]:
%%time
# Evaluate on the training split only: point the dataset at the train ratings.
data.raw_ratings = train_ratings

# Algorithm under evaluation: SVD, with random_state=0 passed for repeatable runs.
svd = SVD(random_state=0)

# Fit once on the whole training split.
trainset = data.build_full_trainset()
svd.fit(trainset)

# Error on the same ratings the model was fitted on (no folds).
# accuracy.rmse / accuracy.mae print the metric themselves; the prints only label it.
svd_train_pred = svd.test(trainset.build_testset())
print('Train RMSE:')
train_rmse = accuracy.rmse(svd_train_pred)
print('Train MAE:')
train_mae = accuracy.mae(svd_train_pred)
print('\n')

# Error on the held-out test split.
testset = data.construct_testset(test_ratings)
svd_test_pred = svd.test(testset)
print('Test RMSE:')
test_rmse = accuracy.rmse(svd_test_pred)
print('Test MAE:')
# Keep the held-out MAE in test_mae. It used to be assigned to train_mae, which
# overwrote the training value and left test_mae holding a previous model's result.
test_mae = accuracy.mae(svd_test_pred)
print('\n')

# Restore the complete rating list and refit, so that get_recommendation_svd
# predicts with a model fitted on every rating.
data.raw_ratings = all_ratings
svd.fit(data.build_full_trainset())

def get_recommendation_svd(user_id, n=5):
    '''Return the n unrated books with the highest SVD estimate for user_id.

    Candidates are the ISBNs from book_read(user_id) that the user has not
    rated yet, each ISBN considered once. Every candidate is scored with
    svd.predict(user_id, isbn).est and the best n are looked up in Books.

    Parameters:
        user_id: raw user id, as stored in Ratings['user_id'].
        n: number of books to recommend (default 5).

    Returns:
        DataFrame with the columns ISBN, title, author, average_rating and
        count_ratings, ordered from the highest to the lowest estimate.
        None (after printing a notice) when fewer than n candidates exist.
    '''
    all_books, user_books = book_read(user_id)

    # A set makes the "already rated" test constant-time; dict.fromkeys drops
    # repeated catalogue ISBNs while keeping their first-seen order.
    already_read = set(user_books)
    next_books = [book for book in dict.fromkeys(all_books) if book not in already_read]

    if n > len(next_books):
        print('Please reduce your recommendation request')
        return

    ratings = [(book, svd.predict(user_id, book).est) for book in next_books]
    # list.sort is stable, so equal estimates keep catalogue order.
    ratings.sort(key=lambda pair: pair[1], reverse=True)
    rank = {book: position for position, (book, _) in enumerate(ratings[:n])}

    columns = ['ISBN', 'title', 'author', 'average_rating', 'count_ratings']
    top = Books[Books.ISBN.isin(list(rank))][columns]
    # isin() yields rows in catalogue order; put them back in ranked order.
    order = top['ISBN'].map(rank).to_numpy().argsort(kind='stable')
    return top.iloc[order]

Train RMSE:
RMSE: 1.0205
Train MAE:
MAE:  0.7905


Test RMSE:
RMSE: 1.5652
Test MAE:
MAE:  1.1933


CPU times: user 951 ms, sys: 32.4 ms, total: 983 ms
Wall time: 966 ms


In [20]:
# Top-5 SVD recommendations for user 11676.
get_recommendation_svd(11676, 5)

Unnamed: 0,ISBN,title,author,average_rating,count_ratings
52,312971346,High Five (A Stephanie Plum Novel),Janet Evanovich,8.73,2304
80,345339681,The Hobbit : The Enchanting Prelude to The Lor...,J.R.R. TOLKIEN,8.76,3844
81,345339703,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,9.0,2025
138,385492081,Into Thin Air : A Personal Account of the Mt. ...,JON KRAKAUER,8.57,1764
184,440212561,Outlander,DIANA GABALDON,8.38,1521


In [14]:
%%time
# Evaluate on the training split only: point the dataset at the train ratings.
data.raw_ratings = train_ratings

# Algorithm under evaluation: SVD++, with random_state=0 passed for repeatable runs.
svdpp = SVDpp(random_state=0)

# Fit once on the whole training split.
trainset = data.build_full_trainset()
svdpp.fit(trainset)

# Error on the same ratings the model was fitted on (no folds).
# accuracy.rmse / accuracy.mae print the metric themselves; the prints only label it.
svdpp_train_pred = svdpp.test(trainset.build_testset())
print('Train RMSE:')
train_rmse = accuracy.rmse(svdpp_train_pred)
print('Train MAE:')
train_mae = accuracy.mae(svdpp_train_pred)
print('\n')


# Error on the held-out test split.
testset = data.construct_testset(test_ratings)
svdpp_test_pred = svdpp.test(testset)
print('Test RMSE:')
test_rmse = accuracy.rmse(svdpp_test_pred)
print('Test MAE:')
# Keep the held-out MAE in test_mae. It used to be assigned to train_mae, which
# overwrote the training value and left test_mae holding a previous model's result.
test_mae = accuracy.mae(svdpp_test_pred)
print('\n')


# Restore the complete rating list and refit, so that get_recommendation_svdpp
# predicts with a model fitted on every rating.
data.raw_ratings = all_ratings
svdpp.fit(data.build_full_trainset())

def get_recommendation_svdpp(user_id, n=5):
    '''Return the n unrated books with the highest SVD++ estimate for user_id.

    Candidates are the ISBNs from book_read(user_id) that the user has not
    rated yet, each ISBN considered once. Every candidate is scored with
    svdpp.predict(user_id, isbn).est and the best n are looked up in Books.

    Parameters:
        user_id: raw user id, as stored in Ratings['user_id'].
        n: number of books to recommend (default 5).

    Returns:
        DataFrame with the columns ISBN, title, author, average_rating and
        count_ratings, ordered from the highest to the lowest estimate.
        None (after printing a notice) when fewer than n candidates exist.
    '''
    all_books, user_books = book_read(user_id)

    # A set makes the "already rated" test constant-time; dict.fromkeys drops
    # repeated catalogue ISBNs while keeping their first-seen order.
    already_read = set(user_books)
    next_books = [book for book in dict.fromkeys(all_books) if book not in already_read]

    if n > len(next_books):
        print('Please reduce your recommendation request')
        return

    ratings = [(book, svdpp.predict(user_id, book).est) for book in next_books]
    # list.sort is stable, so equal estimates keep catalogue order.
    ratings.sort(key=lambda pair: pair[1], reverse=True)
    rank = {book: position for position, (book, _) in enumerate(ratings[:n])}

    columns = ['ISBN', 'title', 'author', 'average_rating', 'count_ratings']
    top = Books[Books.ISBN.isin(list(rank))][columns]
    # isin() yields rows in catalogue order; put them back in ranked order.
    order = top['ISBN'].map(rank).to_numpy().argsort(kind='stable')
    return top.iloc[order]

Train RMSE:
RMSE: 1.0143
Train MAE:
MAE:  0.7803


Test RMSE:
RMSE: 1.5729
Test MAE:
MAE:  1.1891


CPU times: user 3.59 s, sys: 93.8 ms, total: 3.69 s
Wall time: 3.64 s


In [21]:
# Top-5 SVD++ recommendations for user 11676.
get_recommendation_svdpp(11676, 5)

Unnamed: 0,ISBN,title,author,average_rating,count_ratings
34,142001740,The Secret Life of Bees,Sue Monk Kidd,8.81,9409
54,312980140,Seven Up (A Stephanie Plum Novel),Janet Evanovich,8.56,2916
69,316769487,The Catcher in the Rye,J.D. Salinger,7.6,7396
98,345413369,"The Subtle Knife (His Dark Materials, Book 2)",PHILIP PULLMAN,8.6,225
183,440211727,A Time to Kill,JOHN GRISHAM,8.05,6724


In [19]:
# Report the installed Surprise version.
import surprise
print(surprise.__version__)

1.1.4
