In [149]:
#Preview list of data files so can grab names in future cells
print("Main Directory:")
!ls
print("\n\GoodReadsData:")
!ls GoodReadsData

Main Directory:
BookSearchbyKeyword.ipynb  Recommender_advanced.ipynb
[34mGoodReadsData[m[m              books_titles.json
Recommender.ipynb          liked_books.csv

\GoodReadsData:
book_id_map.csv            goodreads_interactions.csv
goodreads_books.json.gz


In [150]:
#Read in query data containing liked books

import pandas as pd

query_books = pd.read_csv("liked_books.csv", index_col = 0)
query_books["bookRecord_bookId"] = query_books["bookRecord_bookId"].astype(str)
print(query_books.head())
query_bookRecord_bookIds = set(query_books["bookRecord_bookId"]) #Set of unique books in query

   user_id bookRecord_bookId  rating                title
0       -1            854757       5               Bet Me
1       -1          33571217       5  Reincarnation Blues


In [151]:
#Preview book_id_map data to see how to import in next cell
!head GoodReadsData/book_id_map.csv

book_id_csv,book_id
0,34684622
1,34536488
2,34017076
3,71730
4,30422361
5,33503613
6,33517540
7,34467031
8,6383669


In [152]:
#Import BookId mapping between user interactions data and book record data

bookId_mapping_interaction_to_bookRecord = {}
#stream file line by line
with open("GoodReadsData/book_id_map.csv") as f:
    while True:
        line = f.readline()
        if not line:
            break
        interactions_bookId, bookRecord_bookId = line.strip().split(",") #strip necessary to remove a hidden new line character that otherwise interferes with matching to ids in other locations
        bookId_mapping_interaction_to_bookRecord[interactions_bookId] = bookRecord_bookId

In [153]:
#Preview interactions data to be imported in next cell
previewData =!head GoodReadsData/goodreads_interactions.csv
nLines = !wc -l GoodReadsData/goodreads_interactions.csv

print(f"INTERACTIONS FILE CONTENTS PREVIEW:\n{previewData}\n")
print(f"NUMBER OF LINES INFO: {nLines}")

INTERACTIONS FILE CONTENTS PREVIEW:
['user_id,book_id,is_read,rating,is_reviewed', '0,948,1,5,0', '0,947,1,5,1', '0,946,1,5,0', '0,945,1,5,0', '0,944,1,5,0', '0,943,1,5,0', '0,942,1,5,0', '0,941,1,5,0', '0,940,1,5,0']

NUMBER OF LINES INFO: [' 228648343 GoodReadsData/goodreads_interactions.csv']


In [154]:
#Stream Interactions to find users who read books in query and count how many query books they reviewed (do not need to be positively reviewed)
overlap_users = {}

with open("GoodReadsData/goodreads_interactions.csv") as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id,interactions_bookId,_,rating,_ = line.strip().split(",") #strip to get rid of any hidden characters like new line
        
        bookRecord_bookId = bookId_mapping_interaction_to_bookRecord.get(interactions_bookId) #.get used rather than directly querying with [] because otherwise some issue with header results in an error. With .get returns none rather than an error so code continues on
        if bookRecord_bookId in query_bookRecord_bookIds:
            if user_id not in overlap_users:
                overlap_users[user_id] = 1
            else:
                overlap_users[user_id] += 1
    

In [155]:
#Filter overlap_users to only include users who have reviewed at least some minimum percent of the books in the query
minPct = .5
print(f"{len(overlap_users)} users have read at least one queried book.")

filtered_overlap_users = set([k for k in overlap_users if overlap_users[k] > minPct * len(query_bookRecord_bookIds)])

print(f"{len(filtered_overlap_users)} users have read at least {minPct * 100}% of the queried books.")

13248 users have read at least one queried book.
89 users have read at least 50.0% of the queried books.


In [156]:
#Get records of all books read by users who read more than threshold amount of books in query (determined above)

overlap_users_interactions = []
with open("GoodReadsData/goodreads_interactions.csv") as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id,interactions_bookId,_,rating,_ = line.strip().split(",") #strip to get rid of any hidden characters like new line
        
        if user_id in filtered_overlap_users:
            bookRecord_bookId = bookId_mapping_interaction_to_bookRecord[interactions_bookId]
            overlap_users_interactions.append([user_id, bookRecord_bookId, rating])
            
print(len(overlap_users_interactions))

899428


In [157]:
#Create User/Book Ratings Matrix
#rows= overlap users, cols = books recommended by any overlap user

from scipy.sparse import coo_matrix
#compile query and user ratings in one dataframe
interactions = pd.DataFrame(overlap_users_interactions, columns = ["user_id", "bookRecord_bookId", "rating"])
interactions = pd.concat([query_books[["user_id", "bookRecord_bookId", "rating"]],interactions]) #add query books as rows of interactions dataframe
#Make sure all columns are desired datatypes
interactions["bookRecord_bookId"] = interactions["bookRecord_bookId"].astype(str)                                      
interactions["user_id"] = interactions["user_id"].astype(str) 
interactions["rating"] = pd.to_numeric(interactions["rating"]) 
#Assign row and column index destinations to users and books
interactions["user_index"] = interactions["user_id"].astype("category").cat.codes #this assigns sequential values to each unique id
interactions["book_index"] = interactions["bookRecord_bookId"].astype("category").cat.codes #this assigns sequential values to each unique id
#Create sparse matrix of ratings with row for each user and col for each book
ratings_mat = coo_matrix((interactions["rating"],(interactions["user_index"],interactions["book_index"]))) #coo is coordinate format for a sparse matri
ratings_mat = ratings_mat.tocsr() #csr (Compressed Sparse Row) mysparse matrix format is just easier to work with programatically, while coo is a little easier to create in first place... so create then convert

In [158]:
#Find cosine similarity between query user and overlap users
from sklearn.metrics.pairwise import cosine_similarity

query_row_idx = interactions[interactions["user_id"] == "-1"]["user_index"][0] #get row index corresponding to query user (who should have been assigned user_id -1 when assembling query data)
similarity = cosine_similarity(ratings_mat[query_row_idx,:], ratings_mat).flatten()

In [159]:
#Get count and rating of books rated by nMostSimilar users
import numpy as np
nMostSimilar = 5
indices = np.argpartition(similarity, -nMostSimilar) [-nMostSimilar:] #finds nMostSimilar users row indices
similar_users = interactions[interactions["user_index"].isin(indices)].copy() #get user_ids of nMostSimilar
similar_users = similar_users[similar_users != "-1"] #remove query user from most similar users list
book_recommendations = similar_users.groupby("bookRecord_bookId").rating.agg(["count","mean","median"])
book_recommendations.head()

Unnamed: 0_level_0,count,mean,median
bookRecord_bookId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3,4.666667,5.0
10025305,1,4.0,4.0
10049436,1,0.0,0.0
100915,1,3.0,3.0
10176,1,0.0,0.0


In [160]:
#Get Book Record for nMostSimilar rated books
book_titles = pd.read_json("books_titles.json") #import book records file, cleaned in a previous notebook
book_titles["bookRecord_bookId"] = book_titles["bookRecord_bookId"].astype(str) #make sure in right format

book_recommendations = book_recommendations.merge(book_titles, how = "inner", on="bookRecord_bookId")


In [161]:
# Filter for top recommended books by nMostSimilar users
book_recs = book_recommendations.copy() #copy so don't have to rerun above cell if mess with definition of book_recs here

#Filter out books already in query
query_books["mod_title"] = query_books["title"].str.replace("[^a-zA-Z0-9 ]","",regex=True).str.lower()#standardize appearance of titles in query so easier to find duplicates (book_titles file has already had titles standardized in same way)
query_books["mod_title"] = query_books["mod_title"] .str.replace("\s+", " ", regex=True) #continue standardizing appearance of title to easier to find duplicates (here replace any multiple spaces with a single space)
book_recs = book_recs[~book_recs["mod_title"].isin(query_books["mod_title"])]

#Keep only books that occur at least n times in nMostSimilar users reviews
min_reviews_by_nMostSimilarUsers = (int)(nMostSimilar * .1)
book_recs = book_recs[book_recs["count"]>=min_reviews_by_nMostSimilarUsers]

#Keep only books above a certain mean rating
min_rating = 4
book_recs = book_recs[book_recs["mean"] >= min_rating]

#Score remaining books
countAllGoodReadsRecs = book_recs["ratings"] #total number of ratings in goodreads for each book
book_recs["adjusted_count"] = book_recs["count"]/countAllGoodReadsRecs #Changed this from orig to more strongly emphasize uniqueness of these reviews #book_recs["count"] * (book_recs["count"]/countAllGoodReadsRecs) #normalizes count by how its magnitude compares to all goodReads reviewers (we want books esepcially liked by similar users not just broadly liked by everyone)
book_recs["adjusted_count"]  = book_recs["adjusted_count"] /book_recs["adjusted_count"].max()
max_median = book_recs["median"].max()
book_recs["normalized_rating"] = book_recs["median"]/max_median
book_recs["score"] = book_recs["normalized_rating"] * (book_recs["adjusted_count"]) #create score based on average rating and relative number of ratings by nMostSimilar users

#Get top scoring books of what remains
top_recs = book_recs.sort_values("score", ascending = False)


In [162]:
#Display Recs
def make_clickable(URL):
    return f'<a target ="_blank" href="{URL}">Goodreads Listing</a>' #creates html to link to title, which can be used when diplaying book results

def show_image(imageSource):
    return f'<img src="{imageSource}" width=50></img>' #creates html to display image associated with title, which can be used when diplaying book results

top_recs.head(15).style.format({'url': make_clickable, 'cover_image': show_image}) #adds formating to results so url's are clickable and image of book cover shows

Unnamed: 0,bookRecord_bookId,count,mean,median,title,ratings,url,cover_image,mod_title,adjusted_count,normalized_rating,score
1460,33521986,1,5.0,5.0,January Buzz Books Monthly,17,Goodreads Listing,,january buzz books monthly,1.0,1.0,1.0
1342,31934673,1,5.0,5.0,"Art on the Rocks: More than 35 colorful & contemporary rock-painting projects, tips, and techniques to inspire your creativity!",21,Goodreads Listing,,art on the rocks more than 35 colorful contemporary rockpainting projects tips and techniques to inspire your creativity,0.809524,1.0,0.809524
2038,8845918,1,4.0,4.0,The Witchy Worries of Abbie Adams,18,Goodreads Listing,,the witchy worries of abbie adams,0.944444,0.8,0.755556
1179,30121789,1,5.0,5.0,The Bee Charmer (The Chancellor Fairy Tales Book 3),23,Goodreads Listing,,the bee charmer the chancellor fairy tales book 3,0.73913,1.0,0.73913
1332,31741758,1,5.0,5.0,Diary of an AssCan: A Mark Watney Short Story,26,Goodreads Listing,,diary of an asscan a mark watney short story,0.653846,1.0,0.653846
1563,35138828,1,5.0,5.0,"Buzz Books 2017: Fall/Winter: Exclusive Excerpts from Forthcoming Titles by Louise Erdrich, Bill McKibben, Celeste Ng, Robin Sloan, Amy Tan and 35 More",30,Goodreads Listing,,buzz books 2017 fallwinter exclusive excerpts from forthcoming titles by louise erdrich bill mckibben celeste ng robin sloan amy tan and 35 more,0.566667,1.0,0.566667
948,27352634,1,5.0,5.0,Marked by Fortune,46,Goodreads Listing,,marked by fortune,0.369565,1.0,0.369565
860,26025631,1,4.0,4.0,"Bossa Novas, Bikinis, and Bad Ends",46,Goodreads Listing,,bossa novas bikinis and bad ends,0.369565,0.8,0.295652
1311,31450418,1,4.0,4.0,Christmas Angels: A Novella,48,Goodreads Listing,,christmas angels a novella,0.354167,0.8,0.283333
1559,34998729,1,4.0,4.0,The Genesis of Evangeline (Lost Royals Saga #1),52,Goodreads Listing,,the genesis of evangeline lost royals saga 1,0.326923,0.8,0.261538


IDEAS FOR IMPROVEMENT

A) Play with parameters
1) requirements on overlap user selection
2) include more most similar users
3) weighting of number of reviews vs rating in score
4) weight reviews by how similar a user is to me

B) This highlights books liked by users most like me
But maybe instead should find books most liked by people who like same individual books (kind of like vector fitting pants and shirt separately so can benefit from information about indpendent body parts. Here independent elements of taste... although that's often the problem I have with recommendation systems that recommend books that are only similar to one other book I like, when you should be able to tell from the other books I like that I WOULDNT like some of those reccs). Maybe could for each books find most liked AND most disliked books by other users and use both those directions of information?

B) Work on getting more similar users
- No user has reviewed more than 2 of the books on my list and only 15 have read 2
- Could expand number of query titles to get more similar users (keep in mind that more popular books in query will have many more reviews, which doesnt mean theyre the ones querier cares most about. might want to put in some criteria for books reviewers MUST have read versus ones they should have read some fraction of. Maybe use a similar weighting system where the less frequent a query book appears in all of goodreads the more important users who read it are. Or, building on this, make every book in query have same amount of weight in recommendations by weighting users with reviews of less reviewed books higher) 
- Could scrape more goodreads data