In [None]:
pip install surprise



In [1]:
import pandas as pd
from plotly.graph_objects import *
import numpy as np

from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans, KNNWithZScore, SVD
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

In [2]:
ratings = pd.read_csv('Ratings.csv', sep=';', on_bad_lines='skip')
ratings.head()

Unnamed: 0,User-ID,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [3]:
books = pd.read_csv('Books.csv', on_bad_lines='skip', sep=';')
books.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company


In [4]:
booksRatings = pd.merge(books, ratings, on='ISBN', how='inner')
booksRatings.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher,User-ID,Rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,2,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11400,0
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676,8
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,41385,0


**EDA**

In [5]:
booksRatings.isnull().sum()/len(booksRatings)

ISBN         0.000000
Title        0.000000
Author       0.000002
Year         0.000000
Publisher    0.000002
User-ID      0.000000
Rating       0.000000
dtype: float64

In [6]:
booksRatings.dropna(inplace=True)
booksRatings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1031183 entries, 0 to 1031186
Data columns (total 7 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   ISBN       1031183 non-null  object
 1   Title      1031183 non-null  object
 2   Author     1031183 non-null  object
 3   Year       1031183 non-null  int64 
 4   Publisher  1031183 non-null  object
 5   User-ID    1031183 non-null  int64 
 6   Rating     1031183 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 62.9+ MB


In [7]:
notZeroBooksRatings = booksRatings[booksRatings['Rating'] != 0]
notZeroBooksRatings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 383852 entries, 1 to 1031183
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   ISBN       383852 non-null  object
 1   Title      383852 non-null  object
 2   Author     383852 non-null  object
 3   Year       383852 non-null  int64 
 4   Publisher  383852 non-null  object
 5   User-ID    383852 non-null  int64 
 6   Rating     383852 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 23.4+ MB


In [8]:
notZeroBooksRatings = notZeroBooksRatings.drop_duplicates()
notZeroBooksRatings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 383848 entries, 1 to 1031183
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   ISBN       383848 non-null  object
 1   Title      383848 non-null  object
 2   Author     383848 non-null  object
 3   Year       383848 non-null  int64 
 4   Publisher  383848 non-null  object
 5   User-ID    383848 non-null  int64 
 6   Rating     383848 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 23.4+ MB


In [9]:
# кількість оцінок, виставлених читачами кожній із книг, не менше 15
countRatingsForBook = notZeroBooksRatings.groupby('ISBN')['Rating'].count().reset_index()
countRatingsForBook = countRatingsForBook[countRatingsForBook["Rating"] > 15]
countRatingsForBook.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2844 entries, 391 to 148830
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ISBN    2844 non-null   object
 1   Rating  2844 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 66.7+ KB


In [10]:
# кількість оцінок, виставлених книгам кожним із читачів, не менше 10
countRatingsByUser = notZeroBooksRatings.groupby('User-ID')['Rating'].count().reset_index()
countRatingsByUser = countRatingsByUser[countRatingsByUser["Rating"] > 10]
countRatingsByUser.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5981 entries, 60 to 68089
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   User-ID  5981 non-null   int64
 1   Rating   5981 non-null   int64
dtypes: int64(2)
memory usage: 140.2 KB


In [11]:
# відкидаємо неактивних читачів та книги, які отримали мало оцінок
notZeroBooksRatings = notZeroBooksRatings[notZeroBooksRatings["ISBN"].isin(countRatingsForBook["ISBN"]) & notZeroBooksRatings["User-ID"].isin(countRatingsByUser["User-ID"])]
notZeroBooksRatings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64534 entries, 31 to 790978
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ISBN       64534 non-null  object
 1   Title      64534 non-null  object
 2   Author     64534 non-null  object
 3   Year       64534 non-null  int64 
 4   Publisher  64534 non-null  object
 5   User-ID    64534 non-null  int64 
 6   Rating     64534 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 3.9+ MB


**пошук найкращих параметрів алгоритму SVD**

In [12]:
reader = Reader(rating_scale=(1,10))

In [13]:
data = Dataset.load_from_df(notZeroBooksRatings[['User-ID','ISBN', 'Rating']], reader)

In [14]:
param_grid = {"n_factors": range(10,100,20),
              "n_epochs" : [5, 10, 20],
              "lr_all"   : [0.002, 0.005],
              "reg_all"  : [0.2, 0.5]}

gridsearchSVD = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=3)

gridsearchSVD.fit(data)

In [15]:
print(f'MAE Best Parameters:  {gridsearchSVD.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchSVD.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchSVD.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchSVD.best_score["rmse"]}\n')

MAE Best Parameters:  {'n_factors': 10, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}
MAE Best Score:       1.2123052414248585

RMSE Best Parameters: {'n_factors': 10, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}
RMSE Best Score:      1.5675830815551397



In [16]:
trainset, testset = train_test_split(data, test_size=.2)

In [17]:
# Model fit & prediction - SVD
final_model = SVD(n_factors=10, n_epochs=20, lr_all=0.005, reg_all= 0.2)

# Fitting the model on trainset & predicting on testset
final_model.fit(trainset).test(testset)

[Prediction(uid=115572, iid='0064471063', r_ui=9.0, est=8.582954154546991, details={'was_impossible': False}),
 Prediction(uid=77809, iid='0425128164', r_ui=7.0, est=6.926904662337659, details={'was_impossible': False}),
 Prediction(uid=241980, iid='0671039741', r_ui=10.0, est=9.493050370569943, details={'was_impossible': False}),
 Prediction(uid=27647, iid='0140276904', r_ui=5.0, est=7.064732916688376, details={'was_impossible': False}),
 Prediction(uid=37644, iid='014100018X', r_ui=8.0, est=8.273575677309324, details={'was_impossible': False}),
 Prediction(uid=104636, iid='0671568175', r_ui=9.0, est=8.146255404983709, details={'was_impossible': False}),
 Prediction(uid=241666, iid='0425151867', r_ui=5.0, est=7.067187565987054, details={'was_impossible': False}),
 Prediction(uid=10314, iid='0060976845', r_ui=9.0, est=7.206112514591777, details={'was_impossible': False}),
 Prediction(uid=205383, iid='1857022424', r_ui=10.0, est=7.958357418843982, details={'was_impossible': False}),
 Pr

In [18]:
trainset = data.build_full_trainset()

**рекомендації для читача**

In [52]:
# допоміжна функція
def getBookByISBN(books, isbns):
  book = books[books["ISBN"] == isbns]
  return str(book.values[0][0]) + ' \'' + str(book.values[0][1]) + '\' ' + str(book.values[0][2]) + ' ' + str(book.values[0][3])

In [54]:
def getRecommendations(books, userID):
    model = SVD(n_factors=10, n_epochs=20, lr_all=0.005, reg_all= 0.2)
    model.fit(trainset)

    # predict rating for all pairs of users & items that are not in the trainset
    testset = trainset.build_anti_testset()
    predictions = model.test(testset)
    predictions_df = pd.DataFrame(predictions)

    # get the top get_recommend predictions for userID
    predictions_userID = predictions_df[predictions_df['uid'] == userID].sort_values(by="est", ascending = False).head(10)

    itemIds = []
    itemIds.append(list(predictions_userID['iid']))
    itemIds = itemIds[0]

    recommendations = []
    for id in itemIds:
        recommendations.append(getBookByISBN(books, id))

    return(recommendations)

In [55]:
recommendations = getRecommendations(books, 238065)
recommendations

["0439425220 'Harry Potter and the Chamber of Secrets Postcard Book' J. K. Rowling 2002",
 "0743454529 'My Sister's Keeper : A Novel (Picoult, Jodi)' Jodi Picoult 2004",
 "0618002235 'The Two Towers (The Lord of the Rings, Part 2)' J. R. R. Tolkien 1999",
 "0140143505 '84 Charing Cross Road' Helene Hanff 1990",
 "0836218620 'Weirdos From Another Planet!' Bill Watterson 1990",
 "193156146X 'The Time Traveler's Wife' Audrey Niffenegger 2003",
 "0060256672 'Where the Sidewalk Ends : Poems and Drawings' Shel Silverstein 1974",
 "0345339738 'The Return of the King (The Lord of the Rings, Part 3)' J.R.R. TOLKIEN 1986",
 "0060256656 'The Giving Tree' Shel Silverstein 1964",
 "006092988X 'A Tree Grows in Brooklyn' Betty Smith 1998"]