# Collaborative Filtering

Using goodreads book list exported from my personal goodreads account. (goodread_library_export)


In [1]:
import pandas as pd
#only care about Book_Id, User_Id, and My_Rating
#goodreads_library_export is my export of data
my_books = pd.read_csv("data/goodreads_library_export.csv", skiprows = 1)

#filter for only Exclusive Shelf == read
my_books.columns = my_books.columns.str.replace(' ', '_')
my_books = my_books[my_books['Exclusive_Shelf'] == 'read']
my_books['Book_Id'] = my_books['Book_Id'].astype(str)

my_books

Unnamed: 0,Book_Id,User_Id,Title,Author,Author_l-f,Additional_Authors,ISBN,ISBN13,My_Rating,Average_Rating,...,Date_Read,Date_Added,Bookshelves,Bookshelves_with_positions,Exclusive_Shelf,My_Review,Spoiler,Private_Notes,Read_Count,Owned_Copies
0,7896527,-1,"Throne of Glass (Throne of Glass, #1)",Sarah J. Maas,"Maas, Sarah J.",,,,4,4.18,...,,2022/09/08,,,read,,,,1,0
1,35504431,-1,Turtles All the Way Down,John Green,"Green, John",,0525555366,9.780526e+12,4,3.88,...,,2022/07/18,,,read,,,,1,0
2,18774964,-1,A Man Called Ove,Fredrik Backman,"Backman, Fredrik",Henning Koch,1476738017,9.781477e+12,3,4.38,...,,2019/10/16,,,read,,,,1,0
3,50659467,-1,A Court of Thorns and Roses (A Court of Thorns...,Sarah J. Maas,"Maas, Sarah J.",,1635575567,9.781636e+12,4,4.18,...,,2022/08/12,,,read,,,,1,0
4,17788401,-1,Ugly Love,Colleen Hoover,"Hoover, Colleen",,,,3,4.03,...,,2022/07/22,,,read,,,,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,39122774,-1,Children of Virtue and Vengeance (Legacy of Or...,Tomi Adeyemi,"Adeyemi, Tomi",,1250170990,9.781250e+12,5,3.89,...,2021/01/11,2020/12/22,,,read,,,,1,0
224,40597810,-1,Daisy Jones & The Six,Taylor Jenkins Reid,"Reid, Taylor Jenkins",,1524798622,9.781525e+12,5,4.20,...,2020/07/13,2020/07/07,,,read,,,,1,0
234,34313931,-1,A Woman Is No Man,Etaf Rum,"Rum, Etaf",,0062699768,9.780063e+12,5,4.26,...,2020/06/24,2020/06/21,,,read,,,,1,0
236,32051912,-1,The Alice Network,Kate Quinn,"Quinn, Kate",,,,5,4.32,...,2020/05/01,2020/05/29,,,read,,,,1,0


Load the mapping file to be able to link data together

In [2]:
#load in mapping file to link data
csv_book_mapping = {}

with open("data/book_id_map.csv", "r") as f:
    while True: 
        line = f.readline()
        if not line: 
            break
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id

Make a set of my_books so if there are any duplicates, they are handled

In [4]:
#set of books with all unique books we have read
#if you have duplicates, this would get rid of them
book_set = set(my_books["Book_Id"])

In [5]:
#looking at users that overlap with our books, 
#keys are user_ids and values are num of times user has read a book that we have in our list too

overlap_users = {}

with open("data/goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        book_id = csv_book_mapping.get(csv_id)
        
        if book_id in book_set:
            if user_id not in overlap_users:
                overlap_users[user_id] = 1
            else:
                overlap_users[user_id] += 1

In [6]:
len(overlap_users)

532737

Filtering for people we have books in common with from the goodreads interaction data

In [19]:
#filter for people who have at least 20% books in common with us
filtered_overlap_users = set([k for k in overlap_users if overlap_users[k] > my_books.shape[0]/6])

In [20]:
len(filtered_overlap_users)

384

In [21]:
#add the filtered overlap users interactions into a list to look at
interactions_list = []

with open("data/goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        if user_id in filtered_overlap_users:
            book_id = csv_book_mapping[csv_id]
            interactions_list.append([user_id, book_id, rating])

In [22]:
len(interactions_list)

3209729

In [23]:
interactions_list[0] #user_id, book_id, rating

['520', '13609836', '2']

In [24]:
interactions = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])
interactions

Unnamed: 0,user_id,book_id,rating
0,520,13609836,2
1,520,11521040,5
2,520,301082,4
3,520,19501,0
4,520,7654769,4
...,...,...,...
3209724,442043,31931941,0
3209725,442043,34130000,0
3209726,442043,25387393,0
3209727,442043,30237404,0


In [25]:
#add my ratings to interactions df 
# rename Book_Id, User_Id, and My_Rating to match interactions col names
my_books = my_books.rename(columns={
    "User_Id": "user_id",
    "Book_Id": "book_id",
    "My_Rating": "rating"
})

interactions = pd.concat([my_books[["user_id", "book_id", "rating"]], interactions])
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,7896527,4
1,-1,35504431,4
2,-1,18774964,3
3,-1,50659467,4
4,-1,17788401,3
...,...,...,...
3209724,442043,31931941,0
3209725,442043,34130000,0
3209726,442043,25387393,0
3209727,442043,30237404,0


In [26]:
#make sure everything is in correct form
interactions["book_id"] = interactions["book_id"].astype(str)
interactions["user_id"] = interactions["user_id"].astype(str)
interactions["rating"] = pd.to_numeric(interactions["rating"])

In [None]:
#build a collaborative filtering matrix
#row is different user
#col is different book
#cell is rating

In [27]:
interactions["user_index"] = interactions["user_id"].astype("category").cat.codes

In [28]:
len(interactions["user_index"].unique())

385

In [29]:
interactions["book_index"] = interactions["book_id"].astype("category").cat.codes

In [30]:
len(interactions["book_index"].unique())

600077

In [31]:
#sparse matrix
from scipy.sparse import coo_matrix

ratings_mat_coo = coo_matrix((interactions["rating"], (interactions["user_index"], interactions["book_index"])))

In [32]:
ratings_mat_coo

<385x600077 sparse matrix of type '<class 'numpy.int64'>'
	with 3209865 stored elements in COOrdinate format>

In [33]:
ratings_mat_coo.shape

(385, 600077)

In [34]:
ratings_mat = ratings_mat_coo.tocsr()

In [35]:
interactions[interactions["user_id"] == "-1"]

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,7896527,4,0,552790
1,-1,35504431,4,0,446534
2,-1,18774964,3,0,196321
3,-1,50659467,4,0,478789
4,-1,17788401,3,0,157692
...,...,...,...,...,...
206,-1,39122774,5,0,457699
224,-1,40597810,5,0,460820
234,-1,34313931,5,0,435884
236,-1,32051912,5,0,414040


In [36]:
my_index = 0

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(ratings_mat[my_index,:], ratings_mat).flatten()

In [38]:
similarity[2]

0.006187881761852927

In [39]:
#take the top 15 users who are similar
import numpy as np
indices = np.argpartition(similarity, -15)[-15:]

In [40]:
indices

array([ 52, 293, 286, 214, 341,  39,  73,  90,  44, 124, 257, 369, 189,
       281,   0])

In [41]:
similar_users = interactions[interactions["user_index"].isin(indices)].copy()

In [42]:
len(similar_users)

24499

In [43]:
similar_users = similar_users[similar_users["user_id"]!="-1"]

In [44]:
len(similar_users)

24363

In [45]:
book_recs = similar_users.groupby("book_id").rating.agg(['count', 'mean'])

In [46]:
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,11,4.090909
10000191,1,0.000000
10000600,1,0.000000
100020,1,0.000000
1000751,1,0.000000
...,...,...
99894,1,0.000000
99944,1,5.000000
9996645,2,0.000000
9998,1,0.000000


In [47]:
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
340648,56471,10210,5,341,3869
340649,56471,43641,4,341,466454
340650,56471,17245,4,341,139905
340651,56471,128029,5,341,53566
340652,56471,1934,0,341,207347
...,...,...,...,...,...
2970845,402650,25852735,3,293,332369
2970846,402650,35035160,0,293,442636
2970847,402650,72854,0,293,537923
2970848,402650,33641244,0,293,429502


In [49]:
books_titles = pd.read_json("data/books_titles.json")
books_titles["book_id"] = books_titles["book_id"].astype(str)

In [50]:
book_recs = book_recs.merge(books_titles, how="inner", on="book_id")


In [51]:
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,modified_title
0,1,11,4.090909,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...
1,10000191,1,0.000000,Yellow Crocus,17787,https://www.goodreads.com/book/show/10000191-y...,https://s.gr-assets.com/assets/nophoto/book/11...,yellow crocus
2,10000600,1,0.000000,"Old Habits (Wicked Lovely, #2.6)",1628,https://www.goodreads.com/book/show/10000600-o...,https://images.gr-assets.com/books/1327881013m...,old habits wicked lovely 26
3,100020,1,0.000000,Philosophy of Law: A Very Short Introduction,187,https://www.goodreads.com/book/show/100020.Phi...,https://s.gr-assets.com/assets/nophoto/book/11...,philosophy of law a very short introduction
4,1000751,1,0.000000,"Pollyanna (Pollyanna, #1)",58040,https://www.goodreads.com/book/show/1000751.Po...,https://s.gr-assets.com/assets/nophoto/book/11...,pollyanna pollyanna 1
...,...,...,...,...,...,...,...,...
14794,99894,1,0.000000,The Boys from Brazil,25833,https://www.goodreads.com/book/show/99894.The_...,https://images.gr-assets.com/books/1328882615m...,the boys from brazil
14795,99944,1,5.000000,The Bhagavad Gita,33855,https://www.goodreads.com/book/show/99944.The_...,https://images.gr-assets.com/books/1383059639m...,the bhagavad gita
14796,9996645,2,0.000000,"Truly, Madly, Deeply",1268,https://www.goodreads.com/book/show/9996645-tr...,https://images.gr-assets.com/books/1293090850m...,truly madly deeply
14797,9998,1,0.000000,The Woman in the Dunes,11841,https://www.goodreads.com/book/show/9998.The_W...,https://images.gr-assets.com/books/1361254930m...,the woman in the dunes


In [52]:
book_recs["adjusted_count"] = book_recs["count"] * (book_recs["count"] / book_recs["ratings"])

In [53]:
book_recs["score"] = book_recs["mean"] * book_recs["adjusted_count"]

In [54]:
#take out books we have read
book_recs = book_recs[~book_recs["book_id"].isin(my_books["book_id"])]

In [60]:
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,modified_title,adjusted_count,score
0,1,11,4.090909,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...,0.000071,0.000289
1,10000191,1,0.000000,Yellow Crocus,17787,https://www.goodreads.com/book/show/10000191-y...,https://s.gr-assets.com/assets/nophoto/book/11...,yellow crocus,0.000056,0.000000
2,10000600,1,0.000000,"Old Habits (Wicked Lovely, #2.6)",1628,https://www.goodreads.com/book/show/10000600-o...,https://images.gr-assets.com/books/1327881013m...,old habits wicked lovely 26,0.000614,0.000000
3,100020,1,0.000000,Philosophy of Law: A Very Short Introduction,187,https://www.goodreads.com/book/show/100020.Phi...,https://s.gr-assets.com/assets/nophoto/book/11...,philosophy of law a very short introduction,0.005348,0.000000
4,1000751,1,0.000000,"Pollyanna (Pollyanna, #1)",58040,https://www.goodreads.com/book/show/1000751.Po...,https://s.gr-assets.com/assets/nophoto/book/11...,pollyanna pollyanna 1,0.000017,0.000000
...,...,...,...,...,...,...,...,...,...,...
14794,99894,1,0.000000,The Boys from Brazil,25833,https://www.goodreads.com/book/show/99894.The_...,https://images.gr-assets.com/books/1328882615m...,the boys from brazil,0.000039,0.000000
14795,99944,1,5.000000,The Bhagavad Gita,33855,https://www.goodreads.com/book/show/99944.The_...,https://images.gr-assets.com/books/1383059639m...,the bhagavad gita,0.000030,0.000148
14796,9996645,2,0.000000,"Truly, Madly, Deeply",1268,https://www.goodreads.com/book/show/9996645-tr...,https://images.gr-assets.com/books/1293090850m...,truly madly deeply,0.003155,0.000000
14797,9998,1,0.000000,The Woman in the Dunes,11841,https://www.goodreads.com/book/show/9998.The_W...,https://images.gr-assets.com/books/1361254930m...,the woman in the dunes,0.000084,0.000000


In [61]:
#take out books we have read
my_books["mod_title"] = my_books["Title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()

In [62]:
#remove if we already read it
# book_recs = book_recs[~book_recs["mod_title"].isin(my_books["mod_title"])]

In [63]:
book_recs = book_recs[book_recs["mean"] >=3]

In [64]:
book_recs = book_recs[book_recs["count"]>2]

In [65]:
top_recs = book_recs.sort_values("mean", ascending=False)

In [66]:
top_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,modified_title,adjusted_count,score
747,114345,3,5.000000,"The Little House Collection (Little House, #1-9)",125070,https://www.goodreads.com/book/show/114345.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the little house collection little house 19,0.000072,0.000360
13853,8306857,3,5.000000,"Divergent (Divergent, #1)",213680,https://www.goodreads.com/book/show/8306857-di...,https://images.gr-assets.com/books/1327873996m...,divergent divergent 1,0.000042,0.000211
11468,5093,3,4.333333,Song of Susannah,87313,https://www.goodreads.com/book/show/5093.Song_...,https://images.gr-assets.com/books/1372296326m...,song of susannah,0.000103,0.000447
14216,8949352,3,4.333333,Night Road,59265,https://www.goodreads.com/book/show/8949352-ni...,https://images.gr-assets.com/books/1282369257m...,night road,0.000152,0.000658
9229,30253864,3,4.333333,Talking as Fast as I Can: From Gilmore Girls t...,35052,https://www.goodreads.com/book/show/30253864-t...,https://images.gr-assets.com/books/1492797634m...,talking as fast as i can from gilmore girls to...,0.000257,0.001113
...,...,...,...,...,...,...,...,...,...,...
5357,20727654,4,3.000000,The Paper Magician (The Paper Magician Trilogy...,26937,https://www.goodreads.com/book/show/20727654-t...,https://images.gr-assets.com/books/1405618531m...,the paper magician the paper magician trilogy 1,0.000594,0.001782
5734,21853621,14,3.000000,The Nightingale,271556,https://www.goodreads.com/book/show/21853621-t...,https://images.gr-assets.com/books/1451446316m...,the nightingale,0.000722,0.002165
8075,26245850,3,3.000000,Before the Fall,54557,https://www.goodreads.com/book/show/26245850-b...,https://images.gr-assets.com/books/1462515889m...,before the fall,0.000165,0.000495
7224,248483,3,3.000000,"Austenland (Austenland, #1)",66992,https://www.goodreads.com/book/show/248483.Aus...,https://s.gr-assets.com/assets/nophoto/book/11...,austenland austenland 1,0.000134,0.000403


In [67]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val, val)

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

top_recs.style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,modified_title,adjusted_count,score
747,114345,3,5.0,"The Little House Collection (Little House, #1-9)",125070,Goodreads,,the little house collection little house 19,7.2e-05,0.00036
13853,8306857,3,5.0,"Divergent (Divergent, #1)",213680,Goodreads,,divergent divergent 1,4.2e-05,0.000211
11468,5093,3,4.333333,Song of Susannah,87313,Goodreads,,song of susannah,0.000103,0.000447
14216,8949352,3,4.333333,Night Road,59265,Goodreads,,night road,0.000152,0.000658
9229,30253864,3,4.333333,"Talking as Fast as I Can: From Gilmore Girls to Gilmore Girls, and Everything in Between",35052,Goodreads,,talking as fast as i can from gilmore girls to gilmore girls and everything in between,0.000257,0.001113
1689,132541,3,4.333333,The Secret of Platform 13,9467,Goodreads,,the secret of platform 13,0.000951,0.00412
8406,2767052,14,4.285714,"The Hunger Games (The Hunger Games, #1)",4899965,Goodreads,,the hunger games the hunger games 1,4e-05,0.000171
11062,43641,11,4.272727,Water for Elephants,1082952,Goodreads,,water for elephants,0.000112,0.000477
2074,136251,13,4.230769,"Harry Potter and the Deathly Hallows (Harry Potter, #7)",1784684,Goodreads,,harry potter and the deathly hallows harry potter 7,9.5e-05,0.000401
11424,5,13,4.230769,"Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)",1876252,Goodreads,,harry potter and the prisoner of azkaban harry potter 3,9e-05,0.000381
