In [1]:
# Download these files

# books_titles.json
# https://drive.google.com/file/d/1Iqv9TROqNgYbUDijSaDegv4EPpxO97t3/view?usp=sharing

# goodreads_interactions.csv
# https://drive.google.com/open?id=1zmylV7XW2dfQVCLeg1LbllfQtHD2KUon

# book_id_map.csv
# https://drive.google.com/uc?id=1CHTAaNwyzvbi1TR08MJrJ03BxA266Yxr

# liked_books.csv
# https://drive.google.com/file/d/1dhPhfD5hAOJjrdf8JhvbOPxDpF4qWYnb/view?usp=sharing

# Full code is at https://github.com/dataquestio/project-walkthroughs/tree/master/books

import pandas as pd

my_books = pd.read_csv("karl_good_reads.csv", index_col=0)
my_books["book_id"] = my_books["book_id"].astype(str)

In [3]:
my_books

Unnamed: 0,user_id,book_id,rating,title
0,-1,58684140,0,The Picture of Dorian Gray (Collins Classics)
1,-1,35721089,5,Beautiful Boy: A Father's Journey Through His ...
2,-1,15196,0,Maus I: A Survivor's Tale: My Father Bleeds Hi...
3,-1,60224365,0,Before Your Memory Fades (Before the Coffee Ge...
4,-1,7853133,0,"The Talented Mr Ripley (Ripley, #1)"
5,-1,54373691,4,Tales from the Café (Before the Coffee Gets Co...
6,-1,50269327,0,Earthlings
7,-1,31213519,4,The Peculiar Life of a Lonely Postman
8,-1,102927,5,Never Let Me Go
9,-1,33532,0,Dracula


In [4]:
csv_book_mapping = {}

with open("book_id_map.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id

In [5]:
book_set = set(my_books["book_id"])

In [7]:
overlap_users = {}

with open("chunk0.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
            
        user_id, csv_id, _, rating, _ = line.split(",")

        book_id = csv_book_mapping.get(csv_id)

        if book_id in book_set:
            if user_id not in overlap_users:
                overlap_users[user_id] = 1
            else:
                overlap_users[user_id] += 1

In [8]:
len(overlap_users)

512

In [11]:
filtered_overlap_users = set([k for k in overlap_users if overlap_users[k] > my_books.shape[0]/10])

In [12]:
len(filtered_overlap_users)

61

In [13]:
interactions_list = []

with open("chunk0.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        if user_id in filtered_overlap_users:
            book_id = csv_book_mapping[csv_id]
            interactions_list.append([user_id, book_id, rating])

In [14]:
len(interactions_list)

65813

In [15]:
interactions_list[0]

['13', '9516', '0']

In [16]:
interactions = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])

In [17]:
interactions = pd.concat([my_books[["user_id", "book_id", "rating"]], interactions])

In [18]:
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,58684140,0
1,-1,35721089,5
2,-1,15196,0
3,-1,60224365,0
4,-1,7853133,0
...,...,...,...
65808,1033,34735173,0
65809,1033,33840295,0
65810,1033,33816139,0
65811,1033,34321642,0


In [19]:
interactions["book_id"] = interactions["book_id"].astype(str)
interactions["user_id"] = interactions["user_id"].astype(str)
interactions["rating"] = pd.to_numeric(interactions["rating"])

In [20]:
interactions["user_index"] = interactions["user_id"].astype("category").cat.codes

In [21]:
interactions["book_index"] = interactions["book_id"].astype("category").cat.codes

In [22]:
from scipy.sparse import coo_matrix

ratings_mat_coo = coo_matrix((interactions["rating"], (interactions["user_index"], interactions["book_index"])))

In [23]:
ratings_mat_coo.shape

(62, 37182)

In [24]:
ratings_mat = ratings_mat_coo.tocsr()

In [25]:
interactions[interactions["user_id"] == "-1"]

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,58684140,0,0,29401
1,-1,35721089,5,0,25714
2,-1,15196,0,0,6051
3,-1,60224365,0,0,29737
4,-1,7853133,0,0,33729
5,-1,54373691,4,0,28784
6,-1,50269327,0,0,28116
7,-1,31213519,4,0,23212
8,-1,102927,5,0,347
9,-1,33532,0,0,24609


In [26]:
my_index = 0

In [27]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(ratings_mat[my_index,:], ratings_mat).flatten()

In [28]:
similarity[0]

1.0

In [29]:
import numpy as np

indices = np.argpartition(similarity, -15)[-15:]

In [30]:
indices

array([10, 45, 17, 60, 52, 48, 20, 54, 27, 43, 57,  4, 39, 13,  0],
      dtype=int64)

In [31]:
similar_users = interactions[interactions["user_index"].isin(indices)].copy()

In [32]:
similar_users = similar_users[similar_users["user_id"]!="-1"]

In [33]:
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
2141,78,2657,4,45,19735
2142,78,7613,4,45,33025
2143,78,5470,0,45,28841
2144,78,3,4,45,22301
2145,78,2120932,3,45,13397
...,...,...,...,...,...
58904,982,1233859,5,60,2894
58905,982,3425041,4,60,25068
58906,982,20821284,0,60,13114
58907,982,28116847,0,60,20757


In [34]:
book_recs = similar_users.groupby("book_id").rating.agg(['count', 'mean'])

In [35]:
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,7,3.857143
10000191,2,1.500000
10009,1,0.000000
1001309,1,0.000000
10025305,1,0.000000
...,...,...
99794,1,0.000000
999513,1,0.000000
99955,1,1.000000
9997650,1,0.000000


In [39]:
books_titles = pd.read_json("books_titles.json")
books_titles["book_id"] = books_titles["book_id"].astype(str)

In [40]:
book_recs = book_recs.merge(books_titles, how="inner", on="book_id")

In [41]:
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
0,1,7,3.857143,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...
1,10000191,2,1.500000,Yellow Crocus,17787,https://www.goodreads.com/book/show/10000191-y...,https://s.gr-assets.com/assets/nophoto/book/11...,yellow crocus
2,10009,1,0.000000,Homo Faber,9652,https://www.goodreads.com/book/show/10009.Homo...,https://s.gr-assets.com/assets/nophoto/book/11...,homo faber
3,1001309,1,0.000000,Islands,239,https://www.goodreads.com/book/show/1001309.Is...,https://images.gr-assets.com/books/1328876739m...,islands
4,10025305,1,0.000000,"Clockwork Prince (The Infernal Devices, #2)",318750,https://www.goodreads.com/book/show/10025305-c...,https://images.gr-assets.com/books/1460477747m...,clockwork prince the infernal devices 2
...,...,...,...,...,...,...,...,...
6453,99794,1,0.000000,The Golem,2954,https://www.goodreads.com/book/show/99794.The_...,https://s.gr-assets.com/assets/nophoto/book/11...,the golem
6454,999513,1,0.000000,Simplicissimus,1000,https://www.goodreads.com/book/show/999513.Sim...,https://images.gr-assets.com/books/1349140020m...,simplicissimus
6455,99955,1,1.000000,"Common Sense, The Rights of Man and Other Esse...",13207,https://www.goodreads.com/book/show/99955.Comm...,https://images.gr-assets.com/books/1309203355m...,common sense the rights of man and other essen...
6456,9997650,1,0.000000,برما يقابل ريا وسكينه,2763,https://www.goodreads.com/book/show/9997650,https://images.gr-assets.com/books/1293107490m...,


In [42]:
book_recs["adjusted_count"] = book_recs["count"] * (book_recs["count"] / book_recs["ratings"])

In [43]:
book_recs["score"] = book_recs["mean"] * book_recs["adjusted_count"]

In [44]:
book_recs = book_recs[~book_recs["book_id"].isin(my_books["book_id"])]

In [45]:
my_books["mod_title"] = my_books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()

In [46]:
my_books["mod_title"] = my_books["mod_title"].str.replace("\s+", " ", regex=True)

In [47]:
book_recs = book_recs[~book_recs["mod_title"].isin(my_books["mod_title"])]

In [48]:
book_recs = book_recs[book_recs["mean"] >=4]

In [49]:
book_recs = book_recs[book_recs["count"]>2]

In [50]:
top_recs = book_recs.sort_values("mean", ascending=False)

In [51]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val, val)

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

top_recs.style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
1133,15753740,3,5.0,The Storyteller,111927,Goodreads,,the storyteller,8e-05,0.000402
603,128029,7,4.714286,A Thousand Splendid Suns,835172,Goodreads,,a thousand splendid suns,5.9e-05,0.000277
1319,1617,10,4.7,Night (The Night Trilogy #1),708754,Goodreads,,night the night trilogy 1,0.000141,0.000663
2610,23129410,3,4.666667,"Welcome to Night Vale (Night Vale, #1)",18935,Goodreads,,welcome to night vale night vale 1,0.000475,0.002218
1289,16130549,3,4.666667,Doctor Sleep,98088,Goodreads,,doctor sleep,9.2e-05,0.000428
3086,25489625,3,4.666667,Between the World and Me,80886,Goodreads,,between the world and me,0.000111,0.000519
4911,5113,3,4.666667,Franny and Zooey,154527,Goodreads,,franny and zooey,5.8e-05,0.000272
1948,18774964,3,4.666667,A Man Called Ove,207038,Goodreads,,a man called ove,4.3e-05,0.000203
4514,391729,3,4.666667,The Tell-Tale Heart and Other Writings,196577,Goodreads,,the telltale heart and other writings,4.6e-05,0.000214
206,10917,5,4.6,My Sister's Keeper,876319,Goodreads,,my sisters keeper,2.9e-05,0.000131
