In [1]:
import pandas as pd


In [11]:
# Load the books I liked; the first CSV column holds the saved row labels and becomes the index.
liked_books_path = 'liked_books.csv'
my_books = pd.read_csv(liked_books_path, index_col=0)
my_books.head()


Unnamed: 0,user_id,book_id,rating,title
0,-1,2517439,5,"The Forever War (The Forever War, #1)"
1,-1,113576,5,The Smartest Guys in the Room: The Amazing Ris...
2,-1,35100,5,Battle Cry of Freedom
3,-1,228221,5,The Mask of Command
5,-1,17662739,5,"2001: A Space Odyssey (Space Odyssey, #1)"


In [12]:
# Inspect column dtypes and non-null counts of my liked books.
my_books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27 entries, 0 to 532
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  27 non-null     int64 
 1   book_id  27 non-null     int64 
 2   rating   27 non-null     int64 
 3   title    27 non-null     object
dtypes: int64(3), object(1)
memory usage: 1.1+ KB


In [13]:
# Book ids are compared as strings from here on (the id map below is read as text).
my_books = my_books.astype({'book_id': 'str'})

In [14]:
# Build a lookup from the first column of book_id_map.csv to its second column.
# Both sides are kept as strings; a header row, if present, is stored like any other entry.
book_id_map = {}
with open("book_id_map.csv", "r") as f:
    for line in f:  # stream the file one line at a time
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing empty line)
        # `csv_id` rather than `id`, so the builtin id() is not shadowed
        csv_id, book_id = line.rstrip('\n').split(',')
        book_id_map[csv_id] = book_id

In [15]:
# Set of my liked book ids (strings) for fast membership tests.
my_books_set = set(my_books["book_id"].unique())

In [20]:
# For every user, count how many of the books I liked appear in their interactions.
similar_users = {}
with open("goodreads_interactions.csv", 'r') as f:
    for line in f:  # stream the file one line at a time
        if not line.strip():
            continue  # tolerate blank lines
        # five comma-separated fields; only the user id and the csv book id are needed here
        user_id, csv_id, _, _, _ = line.split(",")

        # ids missing from the map come back as None, which is never in my_books_set
        book_id = book_id_map.get(csv_id)

        if book_id in my_books_set:
            similar_users[user_id] = similar_users.get(user_id, 0) + 1
            

In [21]:
# Number of users who share at least one of my liked books.
len(similar_users)

316341

In [22]:
# keeping only the users who have read more than 20% of the books I liked
min_shared_books = my_books.shape[0] / 5
filtered_similar_users = {
    user
    for user, shared_count in similar_users.items()
    if shared_count > min_shared_books
}
len(filtered_similar_users)


1258

In [23]:
# Second pass over the interactions file: collect every interaction of the retained similar users
# as [user_id, book_id, rating] rows (all still strings at this point).
similar_users_interactions = []
with open("goodreads_interactions.csv", 'r') as f:
    for line in f:  # stream the file one line at a time
        if not line.strip():
            continue  # tolerate blank lines
        # `csv_id` rather than `id`, so the builtin id() is not shadowed
        user_id, csv_id, _, rating, _ = line.split(",")

        if user_id in filtered_similar_users:
            # translate the csv id into the mapped book id; raises KeyError for an unmapped id
            book_id = book_id_map[csv_id]
            similar_users_interactions.append([user_id, book_id, rating])
            

In [24]:
# Total number of interactions collected for the retained similar users.
len(similar_users_interactions)

5638701

In [25]:
# Turn the collected [user_id, book_id, rating] rows into a DataFrame.
interaction_columns = ['user_id', 'book_id', 'rating']
interactions = pd.DataFrame(data=similar_users_interactions, columns=interaction_columns)

In [26]:
# Put my own ratings (user_id -1) in front of the similar users' interactions.
# Rows already belonging to user -1 are dropped first, so re-running this cell
# does not stack a second copy of my ratings on top of the first.
other_users_rows = interactions[interactions['user_id'].astype('str') != '-1']
interactions = pd.concat([my_books[['user_id', 'book_id', 'rating']], other_users_rows])

In [29]:
# Normalise dtypes: ids as strings (my rows carried ints, the file rows strings), ratings as numbers.
interactions = interactions.astype({'user_id': 'str', 'book_id': 'str'})
interactions['rating'] = pd.to_numeric(interactions['rating'])


In [30]:
# Sanity check: my own id '-1' should show up next to the similar users' ids.
interactions['user_id'].unique()

array(['-1', '282', '874', ..., '442043', '712588', '804100'],
      dtype=object)

In [37]:
# Encode user and book ids as integer category codes, usable as matrix row/column positions.
for id_column, index_column in (('user_id', 'user_index'), ('book_id', 'book_index')):
    interactions[index_column] = interactions[id_column].astype("category").cat.codes
print(interactions.head())
print(interactions.tail())

  user_id   book_id  rating  user_index  book_index
0      -1   2517439       5           0      414880
1      -1    113576       5           0       38971
2      -1     35100       5           0      575858
3      -1    228221       5           0      356004
5      -1  17662739       5           0      214285
        user_id  book_id  rating  user_index  book_index
5638696  804100   475178       0        1183      617107
5638697  804100   186074       0        1183      258768
5638698  804100   153008       0        1183      141428
5638699  804100    45107       0        1183      611284
5638700  804100  1762198       5        1183      213203


We will create a matrix with user_index as rows, book_index as columns, and the corresponding user's rating of the book as the cell value.

This will be a *len(interactions['user_index'].unique())* × *len(interactions['book_index'].unique())* matrix and will be sparsely populated. To reduce memory usage, we will create a sparse matrix instead of a dense matrix. In a sparse matrix, a cell that has no value assigned is simply not stored, thereby using up no space in memory.

In [38]:
from scipy.sparse import coo_matrix # a type of sparse matrix

# rows = user_index, columns = book_index, cell values = ratings
matrix_coords = (interactions.user_index, interactions.book_index)
ratings_mat_coo = coo_matrix((interactions.rating, matrix_coords))

In [39]:
# Show the matrix shape and the number of stored elements.
ratings_mat_coo

<1259x802870 sparse matrix of type '<class 'numpy.int64'>'
	with 5638728 stored elements in COOrdinate format>

If it weren't created as a sparse matrix, it would have taken up 1259 x 802870 = 1010813330 cells (roughly 8 GB at 8 bytes per int64 value).

In [40]:
# convert to CSR: a csr matrix is required for the further computations,
# while coo matrices are easier to create
ratings_mat = ratings_mat_coo.tocsr()
ratings_mat


<1259x802870 sparse matrix of type '<class 'numpy.int64'>'
	with 5638728 stored elements in Compressed Sparse Row format>

In [44]:
# identifying my row in the matrix (= my user_index)
# All of my rows share the same user_index, so take the first one by position;
# this does not depend on one of my rows carrying the index label 0.
my_index = interactions.loc[interactions.user_id == "-1", 'user_index'].iloc[0]

Now, we will use cosine similarity to identify the most similar users.

In [46]:
from sklearn.metrics.pairwise import cosine_similarity

# Compare my ratings row against every user's row; flatten to one similarity score per user.
my_ratings_row = ratings_mat[my_index, :]
similarity = cosine_similarity(my_ratings_row, ratings_mat).flatten()
similarity


array([1.        , 0.04579826, 0.06143443, ..., 0.        , 0.00393254,
       0.02317069])

The similarity value 1 in the first position indicates that I am fully similar to myself. Obviously :P

In [61]:
import numpy as np

# argsort is ascending, so the last entries are the most similar users
# (my own row, with similarity 1, is normally among them and is excluded later)
top_n = 15
indices = np.argsort(similarity)[-top_n:]
indices

array([1188,  942,  129, 1208, 1213,  435,  218,  795,  496, 1210,  294,
        321,  862, 1143,    0], dtype=int64)

In [62]:
# Keep the interactions of the most similar users, excluding my own row.
# The two conditions are built separately because `&` binds tighter than `!=`:
# written inline without parentheses, `a & b != c` is evaluated as `(a & b) != c`.
is_top_similar = interactions.user_index.isin(indices)
is_not_me = interactions.user_index != my_index
users = interactions[is_top_similar & is_not_me].copy()
users.head()

Unnamed: 0,user_id,book_id,rating,user_index,book_index
472439,36497,257845,5,795,435943
472440,36497,2767052,5,795,472031
472441,36497,6148028,4,795,656087
472442,36497,7260188,3,795,709494
472443,36497,7170627,5,795,706571


In [63]:
# (rows, columns) of the similar users' interactions.
users.shape

(1237, 5)

In [65]:
# per book: how many similar users have it, and the average rating they gave it
# NOTE(review): ratings of 0 occur in the data; if 0 means "not rated" they drag the mean down - confirm
book_recs = users.groupby('book_id')['rating'].agg(count='count', mean='mean')
book_recs.head()

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2,4.5
100365,1,0.0
1005,1,0.0
100629,1,0.0
100834,1,0.0


In [66]:
# fetching book metadata; book_id is cast to str so it matches the string ids used in book_recs
books_data = pd.read_json('books.json').astype({'book_id': 'str'})

In [67]:
# attach metadata to each candidate; the inner join drops books without a metadata entry
book_recs = pd.merge(book_recs, books_data, on='book_id', how='inner')
book_recs.head()

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,processed_title
0,1,2,4.5,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...
1,100365,1,0.0,The Mote in God's Eye,48736,https://www.goodreads.com/book/show/100365.The...,https://images.gr-assets.com/books/1399490037m...,the mote in gods eye
2,1005,1,0.0,Think and Grow Rich,87634,https://www.goodreads.com/book/show/1005.Think...,https://s.gr-assets.com/assets/nophoto/book/11...,think and grow rich
3,100629,1,0.0,The Universe in a Single Atom: The Convergence...,6310,https://www.goodreads.com/book/show/100629.The...,https://images.gr-assets.com/books/1320558690m...,the universe in a single atom the convergence ...
4,100834,1,0.0,"A World Undone: The Story of the Great War, 19...",2959,https://www.goodreads.com/book/show/100834.A_W...,https://s.gr-assets.com/assets/nophoto/book/11...,a world undone the story of the great war 1914...


In [68]:
# adjusting or penalising for popularity:
# favour books that are really popular among users like me but not as popular among users unlike me
# (squared reader count among similar users, divided by the book's `ratings` value from the metadata)
book_recs['adjusted_count'] = book_recs['count'] ** 2 / book_recs['ratings']

In [69]:
# final score: mean rating among similar users, weighted by the popularity-adjusted count
book_recs['score'] = book_recs['mean'].mul(book_recs['adjusted_count'])

In [70]:
# removing books that I already read
book_recs = book_recs[~book_recs.book_id.isin(my_books.book_id)]

# processing my book titles so that the same title in a different format still matches:
# lowercase, drop everything except a-z, 0-9 and spaces, collapse whitespace runs, trim the ends
my_books['processed_title'] = (
    my_books['title']
    .str.lower()
    .str.replace('[^a-z0-9 ]', '', regex=True)
    # raw string: '\s' in a normal string literal is an invalid escape sequence
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

book_recs = book_recs[~book_recs.processed_title.isin(my_books.processed_title)]

In [74]:
# at least three users should have read it & mean rating > 4
read_by_enough_users = book_recs['count'] > 2
rated_highly = book_recs['mean'] > 4
book_recs = book_recs[read_by_enough_users & rated_highly]

In [75]:
# (rows, columns) of the recommendations left after filtering.
book_recs.shape

(4, 10)

In [76]:
# best-scoring recommendations first
top_recs = book_recs.sort_values(by="score", ascending=False)
top_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,processed_title,adjusted_count,score
5,100915,3,4.666667,"The Lion, the Witch, and the Wardrobe (Chronic...",1575387,https://www.goodreads.com/book/show/100915.The...,https://images.gr-assets.com/books/1353029077m...,the lion the witch and the wardrobe chronicles...,6e-06,2.7e-05
420,2429135,3,4.666667,"The Girl with the Dragon Tattoo (Millennium, #1)",1858152,https://www.goodreads.com/book/show/2429135.Th...,https://images.gr-assets.com/books/1327868566m...,the girl with the dragon tattoo millennium 1,5e-06,2.3e-05
489,2767052,4,4.75,"The Hunger Games (The Hunger Games, #1)",4899965,https://www.goodreads.com/book/show/2767052-th...,https://images.gr-assets.com/books/1447303603m...,the hunger games the hunger games 1,3e-06,1.6e-05
469,2657,3,4.666667,To Kill a Mockingbird,3255518,https://www.goodreads.com/book/show/2657.To_Ki...,https://images.gr-assets.com/books/1361975680m...,to kill a mockingbird,3e-06,1.3e-05


In [78]:
def url_format(url):
    """Return an HTML anchor labelled "Goodreads" that opens ``url`` in a new tab."""
    anchor_template = '<a target="_blank" href="{}">Goodreads</a>'
    return anchor_template.format(url)

def display_image(image):
    """Return an HTML ``<img>`` tag of width 50 whose source is ``image``."""
    img_template = '<img src="{}" width=50></img>'
    return img_template.format(image)

# Show the top 5 recommendations with a clickable Goodreads link and a cover thumbnail.
display_columns = ['title', 'ratings', 'url', 'cover_image']
column_formatters = {'url': url_format, 'cover_image': display_image}
top_recs[display_columns].head(5).style.format(column_formatters)

Unnamed: 0,title,ratings,url,cover_image
5,"The Lion, the Witch, and the Wardrobe (Chronicles of Narnia, #1)",1575387,Goodreads,
420,"The Girl with the Dragon Tattoo (Millennium, #1)",1858152,Goodreads,
489,"The Hunger Games (The Hunger Games, #1)",4899965,Goodreads,
469,To Kill a Mockingbird,3255518,Goodreads,
