# Collaborative Filtering

Using goodreads book list exported from my personal goodreads account. (goodread_library_export)


In [35]:
import pandas as pd
#only care about Book_Id, User_Id, and My_Rating
#goodreads_library_export is my export of data
my_books = pd.read_csv("data/goodreads_library_export.csv", skiprows = 1)

#filter for only Exclusive Shelf == read
my_books.columns = my_books.columns.str.replace(' ', '_')
my_books = my_books[my_books['Exclusive_Shelf'] == 'read']

my_books

Unnamed: 0,Book_Id,User_Id,Title,Author,Author_l-f,Additional_Authors,ISBN,ISBN13,My_Rating,Average_Rating,...,Date_Read,Date_Added,Bookshelves,Bookshelves_with_positions,Exclusive_Shelf,My_Review,Spoiler,Private_Notes,Read_Count,Owned_Copies
0,7896527,-1,"Throne of Glass (Throne of Glass, #1)",Sarah J. Maas,"Maas, Sarah J.",,,,4,4.18,...,,2022/09/08,,,read,,,,1,0
1,35504431,-1,Turtles All the Way Down,John Green,"Green, John",,0525555366,9.780526e+12,4,3.88,...,,2022/07/18,,,read,,,,1,0
2,18774964,-1,A Man Called Ove,Fredrik Backman,"Backman, Fredrik",Henning Koch,1476738017,9.781477e+12,3,4.38,...,,2019/10/16,,,read,,,,1,0
3,50659467,-1,A Court of Thorns and Roses (A Court of Thorns...,Sarah J. Maas,"Maas, Sarah J.",,1635575567,9.781636e+12,4,4.18,...,,2022/08/12,,,read,,,,1,0
4,17788401,-1,Ugly Love,Colleen Hoover,"Hoover, Colleen",,,,3,4.03,...,,2022/07/22,,,read,,,,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,39122774,-1,Children of Virtue and Vengeance (Legacy of Or...,Tomi Adeyemi,"Adeyemi, Tomi",,1250170990,9.781250e+12,5,3.89,...,2021/01/11,2020/12/22,,,read,,,,1,0
224,40597810,-1,Daisy Jones & The Six,Taylor Jenkins Reid,"Reid, Taylor Jenkins",,1524798622,9.781525e+12,5,4.20,...,2020/07/13,2020/07/07,,,read,,,,1,0
234,34313931,-1,A Woman Is No Man,Etaf Rum,"Rum, Etaf",,0062699768,9.780063e+12,5,4.26,...,2020/06/24,2020/06/21,,,read,,,,1,0
236,32051912,-1,The Alice Network,Kate Quinn,"Quinn, Kate",,,,5,4.32,...,2020/05/01,2020/05/29,,,read,,,,1,0


In [37]:
my_books['Book_Id'] = my_books['Book_Id'].astype(str)

In [41]:
#load in mapping file to link data
csv_book_mapping = {}

with open("data/book_id_map.csv", "r") as f:
    while True: 
        line = f.readline()
        if not line: 
            break
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id

In [47]:
#set of books with all unique books we have read
book_set = set(my_books["Book_Id"])

In [49]:
#looking at users that overlap with our books, 
#keys are user_ids and values are num of times user has read a book that we have in our list too

overlap_users = {}

with open("data/goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        book_id = csv_book_mapping.get(csv_id)
        
        if book_id in book_set:
            if user_id not in overlap_users:
                overlap_users[user_id] = 1
            else:
                overlap_users[user_id] += 1

In [50]:
len(overlap_users)

532737

In [53]:
#filter for people who have at least 20% books in common with us
filtered_overlap_users = set([k for k in overlap_users if overlap_users[k] > my_books.shape[0]/5])

In [57]:
len(filtered_overlap_users)

70

In [59]:
#add the filtered overlap users interactions into a list to look at
interactions_list = []

with open("data/goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        if user_id in filtered_overlap_users:
            book_id = csv_book_mapping[csv_id]
            interactions_list.append([user_id, book_id, rating])

In [60]:
len(interactions_list)

1472254

In [63]:
interactions_list[0] #user_id, book_id, rating

['2794', '682745', '5']

In [71]:
interactions = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])
interactions

Unnamed: 0,user_id,book_id,rating
0,2794,682745,5
1,2794,11466,5
2,2794,50275,0
3,2794,79090,0
4,2794,4980,0
...,...,...,...
1472249,439355,33574122,0
1472250,439355,32446437,0
1472251,439355,35906271,0
1472252,439355,760309,0


In [73]:
#add my ratings to interactions df 
# rename Book_Id, User_Id, and My_Rating to match interactions col names
my_books = my_books.rename(columns={
    "User_Id": "user_id",
    "Book_Id": "book_id",
    "My_Rating": "rating"
})

interactions = pd.concat([my_books[["user_id", "book_id", "rating"]], interactions])
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,7896527,4
1,-1,35504431,4
2,-1,18774964,3
3,-1,50659467,4
4,-1,17788401,3
...,...,...,...
1472249,439355,33574122,0
1472250,439355,32446437,0
1472251,439355,35906271,0
1472252,439355,760309,0


In [75]:
#make sure everything is in correct form
interactions["book_id"] = interactions["book_id"].astype(str)
interactions["user_id"] = interactions["user_id"].astype(str)
interactions["rating"] = pd.to_numeric(interactions["rating"])

In [None]:
#build a collaborative filtering matrix
#row is different user
#col is different book
#cell is rating

In [77]:
interactions["user_index"] = interactions["user_id"].astype("category").cat.codes

In [85]:
len(interactions["user_index"].unique())

71

In [81]:
interactions["book_index"] = interactions["book_id"].astype("category").cat.codes

In [83]:
len(interactions["book_index"].unique())

466993

In [87]:
#sparse matrix
from scipy.sparse import coo_matrix

ratings_mat_coo = coo_matrix((interactions["rating"], (interactions["user_index"], interactions["book_index"])))

In [89]:
ratings_mat_coo

<71x466993 sparse matrix of type '<class 'numpy.int64'>'
	with 1472390 stored elements in COOrdinate format>

In [91]:
ratings_mat_coo.shape

(71, 466993)

In [93]:
ratings_mat = ratings_mat_coo.tocsr()

In [95]:
interactions[interactions["user_id"] == "-1"]

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,7896527,4,0,432192
1,-1,35504431,4,0,352429
2,-1,18774964,3,0,153833
3,-1,50659467,4,0,377721
4,-1,17788401,3,0,122501
...,...,...,...,...,...
206,-1,39122774,5,0,361072
224,-1,40597810,5,0,363516
234,-1,34313931,5,0,344422
236,-1,32051912,5,0,327516


In [97]:
my_index = 0

In [99]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(ratings_mat[my_index,:], ratings_mat).flatten()

In [105]:
similarity[2]

0.0339632314336798

In [109]:
#take the top 15 users who are similar
import numpy as np
indices = np.argpartition(similarity, -15)[-15:]

In [111]:
indices

array([36, 48, 51,  2, 34, 61, 27, 38, 19, 17, 57,  6,  4, 56,  0])

In [115]:
similar_users = interactions[interactions["user_index"].isin(indices)].copy()

In [117]:
len(similar_users)

68703

In [119]:
similar_users = similar_users[similar_users["user_id"]!="-1"]

In [129]:
len(similar_users)

68567

In [125]:
book_recs = similar_users.groupby("book_id").rating.agg(['count', 'mean'])

In [127]:
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,9,4.111111
1000012,1,0.000000
10000191,2,1.500000
10000269,2,0.000000
100003,1,0.000000
...,...,...
9998705,1,0.000000
9998825,1,0.000000
9999033,1,0.000000
9999107,5,0.000000


In [131]:
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
8554,11961,562660,3,4,385099
8555,11961,9681098,4,4,462074
8556,11961,5043,5,4,377455
8557,11961,930,5,4,455536
8558,11961,4989,5,4,376811
...,...,...,...,...,...
1420113,430359,35604204,0,57,352957
1420114,430359,33515066,0,57,338435
1420115,430359,31737890,2,57,325148
1420116,430359,34723185,0,57,347467


In [133]:
books_titles = pd.read_json("books_titles.json")
books_titles["book_id"] = books_titles["book_id"].astype(str)

In [134]:
book_recs = book_recs.merge(books_titles, how="inner", on="book_id")


In [137]:
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,modified_title
0,1,9,4.111111,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...
1,1000012,1,0.000000,A Wild Yearning,298,https://www.goodreads.com/book/show/1000012.A_...,https://s.gr-assets.com/assets/nophoto/book/11...,a wild yearning
2,10000191,2,1.500000,Yellow Crocus,17787,https://www.goodreads.com/book/show/10000191-y...,https://s.gr-assets.com/assets/nophoto/book/11...,yellow crocus
3,10000269,2,0.000000,Prey,5976,https://www.goodreads.com/book/show/10000269-prey,https://images.gr-assets.com/books/1333576631m...,prey
4,100003,1,0.000000,Stephen Hawking's Life Works: The Cambridge Le...,51,https://www.goodreads.com/book/show/100003.Ste...,https://s.gr-assets.com/assets/nophoto/book/11...,stephen hawkings life works the cambridge lect...
...,...,...,...,...,...,...,...,...
39869,9998705,1,0.000000,"Flash and Bones (Temperance Brennan, #14)",14249,https://www.goodreads.com/book/show/9998705-fl...,https://images.gr-assets.com/books/1306253347m...,flash and bones temperance brennan 14
39870,9998825,1,0.000000,"Northwest Angle (Cork O'Connor, #11)",3357,https://www.goodreads.com/book/show/9998825-no...,https://s.gr-assets.com/assets/nophoto/book/11...,northwest angle cork oconnor 11
39871,9999033,1,0.000000,"Wear This, Toss That!: Hundreds of Fashion and...",166,https://www.goodreads.com/book/show/9999033-we...,https://s.gr-assets.com/assets/nophoto/book/11...,wear this toss that hundreds of fashion and be...
39872,9999107,5,0.000000,The American Heiress,24522,https://www.goodreads.com/book/show/9999107-th...,https://images.gr-assets.com/books/1307342832m...,the american heiress


In [139]:
book_recs["adjusted_count"] = book_recs["count"] * (book_recs["count"] / book_recs["ratings"])

In [141]:
book_recs["score"] = book_recs["mean"] * book_recs["adjusted_count"]

In [143]:
#take out books we have read
book_recs = book_recs[~book_recs["book_id"].isin(my_books["book_id"])]

In [145]:
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,modified_title,adjusted_count,score
0,1,9,4.111111,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...,0.000047,0.000194
1,1000012,1,0.000000,A Wild Yearning,298,https://www.goodreads.com/book/show/1000012.A_...,https://s.gr-assets.com/assets/nophoto/book/11...,a wild yearning,0.003356,0.000000
2,10000191,2,1.500000,Yellow Crocus,17787,https://www.goodreads.com/book/show/10000191-y...,https://s.gr-assets.com/assets/nophoto/book/11...,yellow crocus,0.000225,0.000337
3,10000269,2,0.000000,Prey,5976,https://www.goodreads.com/book/show/10000269-prey,https://images.gr-assets.com/books/1333576631m...,prey,0.000669,0.000000
4,100003,1,0.000000,Stephen Hawking's Life Works: The Cambridge Le...,51,https://www.goodreads.com/book/show/100003.Ste...,https://s.gr-assets.com/assets/nophoto/book/11...,stephen hawkings life works the cambridge lect...,0.019608,0.000000
...,...,...,...,...,...,...,...,...,...,...
39869,9998705,1,0.000000,"Flash and Bones (Temperance Brennan, #14)",14249,https://www.goodreads.com/book/show/9998705-fl...,https://images.gr-assets.com/books/1306253347m...,flash and bones temperance brennan 14,0.000070,0.000000
39870,9998825,1,0.000000,"Northwest Angle (Cork O'Connor, #11)",3357,https://www.goodreads.com/book/show/9998825-no...,https://s.gr-assets.com/assets/nophoto/book/11...,northwest angle cork oconnor 11,0.000298,0.000000
39871,9999033,1,0.000000,"Wear This, Toss That!: Hundreds of Fashion and...",166,https://www.goodreads.com/book/show/9999033-we...,https://s.gr-assets.com/assets/nophoto/book/11...,wear this toss that hundreds of fashion and be...,0.006024,0.000000
39872,9999107,5,0.000000,The American Heiress,24522,https://www.goodreads.com/book/show/9999107-th...,https://images.gr-assets.com/books/1307342832m...,the american heiress,0.001019,0.000000


In [149]:
#take out books we have read
my_books["mod_title"] = my_books["Title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()

In [157]:
#remove if we already read it
# book_recs = book_recs[~book_recs["mod_title"].isin(my_books["mod_title"])]

In [173]:
book_recs = book_recs[book_recs["mean"] >=3]

In [163]:
book_recs = book_recs[book_recs["count"]>2]

In [165]:
top_recs = book_recs.sort_values("mean", ascending=False)

In [175]:
top_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,modified_title,adjusted_count,score
16067,2,12,4.5,Harry Potter and the Order of the Phoenix (Har...,1766895,https://www.goodreads.com/book/show/2.Harry_Po...,https://images.gr-assets.com/books/1507396732m...,harry potter and the order of the phoenix harr...,8.1e-05,0.000367
37410,8306857,4,4.5,"Divergent (Divergent, #1)",213680,https://www.goodreads.com/book/show/8306857-di...,https://images.gr-assets.com/books/1327873996m...,divergent divergent 1,7.5e-05,0.000337
2587,11870085,14,4.428571,The Fault in Our Stars,2429317,https://www.goodreads.com/book/show/11870085-t...,https://images.gr-assets.com/books/1360206420m...,the fault in our stars,8.1e-05,0.000357
31903,5,11,4.272727,Harry Potter and the Prisoner of Azkaban (Harr...,1876252,https://www.goodreads.com/book/show/5.Harry_Po...,https://images.gr-assets.com/books/1499277281m...,harry potter and the prisoner of azkaban harry...,6.4e-05,0.000276
6531,136251,13,4.230769,Harry Potter and the Deathly Hallows (Harry Po...,1784684,https://www.goodreads.com/book/show/136251.Har...,https://images.gr-assets.com/books/1474171184m...,harry potter and the deathly hallows harry pot...,9.5e-05,0.000401
24959,2767052,14,4.142857,"The Hunger Games (The Hunger Games, #1)",4899965,https://www.goodreads.com/book/show/2767052-th...,https://images.gr-assets.com/books/1447303603m...,the hunger games the hunger games 1,4e-05,0.000166
0,1,9,4.111111,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...,4.7e-05,0.000194
5220,13335037,11,4.090909,"Divergent (Divergent, #1)",1962813,https://www.goodreads.com/book/show/13335037-d...,https://images.gr-assets.com/books/1328559506m...,divergent divergent 1,6.2e-05,0.000252
33081,6,12,4.083333,Harry Potter and the Goblet of Fire (Harry Pot...,1792561,https://www.goodreads.com/book/show/6.Harry_Po...,https://images.gr-assets.com/books/1361482611m...,harry potter and the goblet of fire harry pott...,8e-05,0.000328
19900,23164983,3,4.0,Hollow City (Miss Peregrine’s Peculiar Childre...,77865,https://www.goodreads.com/book/show/23164983-h...,https://images.gr-assets.com/books/1429175859m...,hollow city miss peregrines peculiar children 2,0.000116,0.000462


In [177]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val, val)

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

top_recs.style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,modified_title,adjusted_count,score
16067,2,12,4.5,"Harry Potter and the Order of the Phoenix (Harry Potter, #5)",1766895,Goodreads,,harry potter and the order of the phoenix harry potter 5,8.1e-05,0.000367
37410,8306857,4,4.5,"Divergent (Divergent, #1)",213680,Goodreads,,divergent divergent 1,7.5e-05,0.000337
2587,11870085,14,4.428571,The Fault in Our Stars,2429317,Goodreads,,the fault in our stars,8.1e-05,0.000357
31903,5,11,4.272727,"Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)",1876252,Goodreads,,harry potter and the prisoner of azkaban harry potter 3,6.4e-05,0.000276
6531,136251,13,4.230769,"Harry Potter and the Deathly Hallows (Harry Potter, #7)",1784684,Goodreads,,harry potter and the deathly hallows harry potter 7,9.5e-05,0.000401
24959,2767052,14,4.142857,"The Hunger Games (The Hunger Games, #1)",4899965,Goodreads,,the hunger games the hunger games 1,4e-05,0.000166
0,1,9,4.111111,"Harry Potter and the Half-Blood Prince (Harry Potter, #6)",1713866,Goodreads,,harry potter and the halfblood prince harry potter 6,4.7e-05,0.000194
5220,13335037,11,4.090909,"Divergent (Divergent, #1)",1962813,Goodreads,,divergent divergent 1,6.2e-05,0.000252
33081,6,12,4.083333,"Harry Potter and the Goblet of Fire (Harry Potter, #4)",1792561,Goodreads,,harry potter and the goblet of fire harry potter 4,8e-05,0.000328
19900,23164983,3,4.0,"Hollow City (Miss Peregrine’s Peculiar Children, #2)",77865,Goodreads,,hollow city miss peregrines peculiar children 2,0.000116,0.000462
