In [1]:
!pip install surprise



In [0]:
#load libraries
import pandas as pd
import numpy as np
import surprise
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from collections import defaultdict

ratings = pd.read_csv('https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/ratings.csv')
books = pd.read_csv('https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/books.csv')
book_tags = pd.read_csv('https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/book_tags.csv')
tag_names = pd.read_csv('https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/tags.csv')

In [0]:
book_tags_m = book_tags.merge(tag_names, on = 'tag_id' )

#removing some of the most common tags that do not indicate anything about the book itself (to-read, audio-book, wish-list)
stk_list = ['to-read','favorites','owned', 'books-i-own', 'currently-reading', 'have', 
            'library', 'to-buy', 'kindle', 'owned-books', 'my-books', 'audiobook',
            'my-library', 'audio', 'own-it', 'books', 'ebooks', 'audible', 'audio-book',
            'default', 'ebook', 'audiobooks', 'ebooks', 'wish-list', 'audiobooks', 'i-own', 
            'borrowed', 'e-books', 'e-book', 'maybe', 'audio-book', 'audio-books']
#clean dataset
book_tags_m = book_tags_m[~book_tags_m['tag_name'].isin(stk_list)]
book_tags_m = book_tags_m[~book_tags_m.tag_name.str.contains("read-in-")]
#merge tags to one value
books_grouped_tags = book_tags_m.groupby('goodreads_book_id')['tag_name'].apply(' '.join).reset_index()
books_grouped_tags['tag_name'] = books_grouped_tags['tag_name'].str.replace('-',' ')
#merge dataframes
tagged_books = pd.merge(books, books_grouped_tags, left_on='book_id', right_on='goodreads_book_id', how='inner')
books_final = books_grouped_tags.merge( books[['goodreads_book_id', 'title', 'authors']], on = 'goodreads_book_id' )

In [0]:
#Perform Content Filtering Analysis using sklearn TfidfVectorizer on book-tags
tf1 = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0)
matrix = tf1.fit_transform(books_final['tag_name'].head(10000))
cosine_sim_tag = linear_kernel(matrix, matrix)

#Perform Content Filtering Analysis using sklearn TfidfVectorizer on author
tf2 = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0)
matrix2 = tf2.fit_transform(books_final['title'].head(10000))
cosine_sim_title = linear_kernel(matrix2, matrix2)

#Perform Content Filtering Analysis using sklearn TfidfVectorizer on title
tf3 = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0)
matrix3 = tf3.fit_transform(books_final['authors'].head(10000))
cosine_sim_author = linear_kernel(matrix3, matrix3)

In [6]:
#Use Surprise to get SVD similarity
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'book_id', 'rating']], reader)

# sample random trainset and testset
trainset, testset = train_test_split(data, test_size=.25)

# We'll use the famous SVD algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.8347


0.8346888335494874

In [0]:
def get_recs(predictions):
     
    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[iid].append((uid, est))
     
    return top_recs

predictons_list_svd = get_recs(predictions)

In [0]:
def get_final_rec(book_title, predictons_list_svd = predictons_list_svd, cosine_sim_tag = cosine_sim_tag, cosine_sim_title = cosine_sim_title, cosine_sim_author = cosine_sim_author):

    book_id = books[books['title'] == book_title].book_id.iloc[0]

    book = {k: v for k, v in predictons_list_svd.items() if k == book_id}
    pred = book.get(book_id)

    lst_books = [item[0] for item in pred]
    lst_pred = [item[1] for item in pred]
    svd_ratings_svd = pd.DataFrame(list(zip(lst_books, lst_pred)), columns =['books', 'pred_svd'])

    sim_scores = list(enumerate(cosine_sim_tag[book_id]))
    lst_books_tag = [item[0] for item in sim_scores]
    lst_pred_tag = [item[1] for item in sim_scores]
    svd_ratings_sim_tag = pd.DataFrame(list(zip(lst_books_tag, lst_pred_tag)), columns =['books', 'pred_sim_tag'])

    sim_scores = list(enumerate(cosine_sim_title[book_id]))
    lst_books_tag = [item[0] for item in sim_scores]
    lst_pred_tag = [item[1] for item in sim_scores]
    svd_ratings_sim_title = pd.DataFrame(list(zip(lst_books_tag, lst_pred_tag)), columns =['books', 'pred_sim_title'])

    sim_scores = list(enumerate(cosine_sim_author[book_id]))
    lst_books_tag = [item[0] for item in sim_scores]
    lst_pred_tag = [item[1] for item in sim_scores]
    svd_ratings_sim_auth = pd.DataFrame(list(zip(lst_books_tag, lst_pred_tag)), columns =['books', 'pred_sim_auth'])

    merged_prediction1 = svd_ratings_svd.merge(svd_ratings_sim_tag, on = 'books')
    merged_prediction2 = merged_prediction1.merge(svd_ratings_sim_title, on = 'books')
    merged_prediction = merged_prediction2.merge(svd_ratings_sim_auth, on = 'books')
    merged_prediction['final_prediction'] = merged_prediction['pred_svd'] * merged_prediction['pred_sim_tag'] * merged_prediction['pred_sim_title'] * merged_prediction['pred_sim_auth']
    book_rec = merged_prediction[merged_prediction['final_prediction']==merged_prediction['final_prediction'].max()].books.iloc[0]

    return print ("If you like " + book_title + " then you may also like: " + books[books['book_id'] == book_rec].title.iloc[0])

In [36]:
get_final_rec("Slaughterhouse-Five")

If you like Slaughterhouse-Five then you may also like: Where Eagles Dare 
