In [89]:
#create dataframe from liked_books.csv
import pandas as pd
my_books = pd.read_csv("liked_books.csv", index_col=0)
my_books["book_id"] = my_books["book_id"].astype(str)

In [90]:
my_books # dataframe of my liked books

Unnamed: 0,user_id,book_id,rating,title
0,-1,2517439,5,"The Forever War (The Forever War, #1)"
1,-1,113576,5,The Smartest Guys in the Room: The Amazing Ris...
2,-1,35100,5,Battle Cry of Freedom
3,-1,228221,5,The Mask of Command
5,-1,17662739,5,"2001: A Space Odyssey (Space Odyssey, #1)"
6,-1,356824,5,India After Gandhi: The History of the World's...
7,-1,12125412,5,The Lady or the Tiger?: and Other Logic Puzzles
8,-1,139069,5,Endurance: Shackleton's Incredible Voyage
10,-1,76680,5,"Foundation (Foundation, #1)"
11,-1,1898,5,Into Thin Air: A Personal Account of the Mount...


In [91]:
#convert book_id_map.csv file into dictionary
csv_book_mapping = {}

with open("book_id_map.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id

In [92]:
book_set = set(my_books["book_id"])

In [9]:
book_set

{'113576',
 '12125412',
 '1215032',
 '128029',
 '139069',
 '1685995',
 '17662739',
 '18949861',
 '1898',
 '228221',
 '228665',
 '2517439',
 '25659450',
 '28187',
 '2913377',
 '35100',
 '356824',
 '437143',
 '5096865',
 '5439',
 '5578108',
 '6448772',
 '76680',
 '77203',
 '8161140',
 '82599',
 '883438'}

In [93]:
#creat user_id: count dictinory from goodreads_interactions.csv based on the same book_id that listed in book_set

overlap_users = {}
with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.strip().split(",")
        
        book_id = csv_book_mapping.get(csv_id)
        
        if book_id in book_set:
            if user_id not in overlap_users:
                overlap_users[user_id] = 1
            else:
                overlap_users[user_id] += 1

In [94]:
len(overlap_users)

316341

In [95]:
my_books.shape[0]

27

In [96]:
my_books.shape[1]

4

In [97]:
#filter 20% of the overlap_users
filtered_overlap_users = set([key for key in overlap_users if overlap_users[key] > my_books.shape[0]/5])

In [98]:
len(filtered_overlap_users)

1258

In [99]:
#create final list of user_id with its respective book_id and ratings
interactions_list = []

with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        if user_id in filtered_overlap_users:
            book_id = csv_book_mapping[csv_id]
            interactions_list.append([user_id, book_id, rating])

In [100]:
len(interactions_list)

5638701

In [101]:
interactions_list[0]

['282', '627206', '4']

In [102]:
#create dataframe
interactions = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])

In [103]:
# concatenate my_books with interactions (user_books)
interactions = pd.concat([my_books[["user_id", "book_id", "rating"]], interactions])

In [104]:
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,2517439,5
1,-1,113576,5
2,-1,35100,5
3,-1,228221,5
5,-1,17662739,5
...,...,...,...
5638696,804100,475178,0
5638697,804100,186074,0
5638698,804100,153008,0
5638699,804100,45107,0


In [105]:
interactions["book_id"] = interactions["book_id"].astype(str)
interactions["user_id"] = interactions["user_id"].astype(str)
interactions["rating"] = pd.to_numeric(interactions["rating"])

In [106]:
# create unique index for user_id
interactions["user_index"] = interactions["user_id"].astype("category").cat.codes

In [107]:
interactions

Unnamed: 0,user_id,book_id,rating,user_index
0,-1,2517439,5,0
1,-1,113576,5,0
2,-1,35100,5,0
3,-1,228221,5,0
5,-1,17662739,5,0
...,...,...,...,...
5638696,804100,475178,0,1183
5638697,804100,186074,0,1183
5638698,804100,153008,0,1183
5638699,804100,45107,0,1183


In [108]:
# create unique index for book_id
interactions["book_index"] = interactions["book_id"].astype("category").cat.codes

In [109]:
interactions

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,2517439,5,0,414880
1,-1,113576,5,0,38971
2,-1,35100,5,0,575858
3,-1,228221,5,0,356004
5,-1,17662739,5,0,214285
...,...,...,...,...,...
5638696,804100,475178,0,1183,617107
5638697,804100,186074,0,1183,258768
5638698,804100,153008,0,1183,141428
5638699,804100,45107,0,1183,611284


In [110]:
# create sparse matrix
from scipy.sparse import coo_matrix

ratings_mat_coo = coo_matrix((interactions["rating"], (interactions["user_index"], interactions["book_index"])))

In [None]:
print(ratings_mat_coo)

In [112]:
ratings_mat_coo.shape

(1259, 802870)

In [113]:
#convert a sparse matrix (in COO format) into a compressed sparse row (CSR) format
ratings_mat = ratings_mat_coo.tocsr()

In [None]:
print(ratings_mat)

In [None]:
interactions[interactions["user_id"] == "-1"] # returns my liked book_id details

In [152]:
my_index = 0

In [153]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(ratings_mat[my_index:], ratings_mat).flatten()

In [154]:
similarity

array([1.        , 0.04579826, 0.06143443, ..., 0.07970932, 0.0791374 ,
       1.        ])

In [155]:
import numpy as np

indices = np.argpartition(similarity, -20)[-20:]

In [156]:
indices

array([1219680,  893340,  609840,  616140, 1367100, 1456560, 1108800,
        567000,  323820,  221760, 1204560, 1009260,  840420, 1254960,
        152460, 1205820,  987840,  254520,  989100, 1064700], dtype=int64)

In [157]:
similar_users = interactions[interactions["book_index"].isin(indices)].copy()

In [158]:
similar_users = similar_users[similar_users["user_id"]!="-1"]

In [159]:
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
24178,2001,18490591,0,314,254520
134734,13715,15789399,0,112,152460
422546,32978,34331816,0,696,567000
430879,33413,18490591,3,705,254520
442756,34085,17834830,0,724,221760
688417,58545,17834830,0,1102,221760
819826,66153,15789399,1,1123,152460
934921,77376,15789399,0,1168,152460
966020,78417,15789399,0,1171,152460
986327,78854,17834830,0,1175,221760


In [160]:
book_recs = similar_users.groupby("book_id")

In [161]:
print(book_recs.head(10))

        user_id   book_id  rating  user_index  book_index
24178      2001  18490591       0         314      254520
134734    13715  15789399       0         112      152460
422546    32978  34331816       0         696      567000
430879    33413  18490591       3         705      254520
442756    34085  17834830       0         724      221760
688417    58545  17834830       0        1102      221760
819826    66153  15789399       1        1123      152460
934921    77376  15789399       0        1168      152460
966020    78417  15789399       0        1171      152460
986327    78854  17834830       0        1175      221760
1066829   83250  15789399       0        1193      152460
1404192  108045  17834830       4          24      221760
1510478  117798  17834830       0          44      221760
1521379  118262  15789399       0          46      152460
1576246  119660  18490591       0          56      254520
1787828  134419    471031       0         105      616140
1864603  14131

In [162]:
book_recs = similar_users.groupby("book_id").rating.agg(['count', 'mean'])

In [163]:
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
15789399,24,0.25
17834830,11,0.363636
18490591,7,0.428571
2156782,1,3.0
34331816,2,0.0
445321,2,0.0
471031,3,0.0


In [164]:
books_titles = pd.read_json("books_titles.json")
books_titles["book_id"] = books_titles["book_id"].astype(str)

In [165]:
book_recs = book_recs.merge(books_titles, how="inner", on="book_id")

In [166]:
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
0,15789399,24,0.25,Cross Roads,7125,https://www.goodreads.com/book/show/15789399-c...,https://images.gr-assets.com/books/1347989200m...,cross roads
1,17834830,11,0.363636,Monsters: The 1985 Chicago Bears and the Wild ...,713,https://www.goodreads.com/book/show/17834830-m...,https://images.gr-assets.com/books/1370797175m...,monsters the 1985 chicago bears and the wild h...
2,18490591,7,0.428571,Labor Day: Birth Stories for the Twenty-first ...,213,https://www.goodreads.com/book/show/18490591-l...,https://images.gr-assets.com/books/1393065823m...,labor day birth stories for the twentyfirst ce...
3,2156782,1,3.0,Mallory and the Trouble With Twins (The Baby-S...,77,https://www.goodreads.com/book/show/2156782.Ma...,https://s.gr-assets.com/assets/nophoto/book/11...,mallory and the trouble with twins the babysit...
4,34331816,2,0.0,"The Slave Boy (The Orfeo Saga, #6)",6,https://www.goodreads.com/book/show/34331816-t...,https://images.gr-assets.com/books/1487513205m...,the slave boy the orfeo saga 6
5,445321,2,0.0,Amy's Answering Machine: Messages from Mom,195,https://www.goodreads.com/book/show/445321.Amy...,https://s.gr-assets.com/assets/nophoto/book/11...,amys answering machine messages from mom
6,471031,3,0.0,The Day We Met,104,https://www.goodreads.com/book/show/471031.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the day we met


In [167]:
my_books["mod_title"] = my_books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()
my_books["mod_title"] = my_books["mod_title"].str.replace("\s+", " ", regex=True)
#book_recs = book_recs[~book_recs["mod_title"].isin(my_books["mod_title"])]

In [168]:
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
0,15789399,24,0.25,Cross Roads,7125,https://www.goodreads.com/book/show/15789399-c...,https://images.gr-assets.com/books/1347989200m...,cross roads
1,17834830,11,0.363636,Monsters: The 1985 Chicago Bears and the Wild ...,713,https://www.goodreads.com/book/show/17834830-m...,https://images.gr-assets.com/books/1370797175m...,monsters the 1985 chicago bears and the wild h...
2,18490591,7,0.428571,Labor Day: Birth Stories for the Twenty-first ...,213,https://www.goodreads.com/book/show/18490591-l...,https://images.gr-assets.com/books/1393065823m...,labor day birth stories for the twentyfirst ce...
3,2156782,1,3.0,Mallory and the Trouble With Twins (The Baby-S...,77,https://www.goodreads.com/book/show/2156782.Ma...,https://s.gr-assets.com/assets/nophoto/book/11...,mallory and the trouble with twins the babysit...
4,34331816,2,0.0,"The Slave Boy (The Orfeo Saga, #6)",6,https://www.goodreads.com/book/show/34331816-t...,https://images.gr-assets.com/books/1487513205m...,the slave boy the orfeo saga 6
5,445321,2,0.0,Amy's Answering Machine: Messages from Mom,195,https://www.goodreads.com/book/show/445321.Amy...,https://s.gr-assets.com/assets/nophoto/book/11...,amys answering machine messages from mom
6,471031,3,0.0,The Day We Met,104,https://www.goodreads.com/book/show/471031.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the day we met


In [171]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Go to website</a>'.format(val)

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

book_recs.style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
0,15789399,24,0.25,Cross Roads,7125,Go to website,,cross roads
1,17834830,11,0.363636,Monsters: The 1985 Chicago Bears and the Wild Heart of Football,713,Go to website,,monsters the 1985 chicago bears and the wild heart of football
2,18490591,7,0.428571,"Labor Day: Birth Stories for the Twenty-first Century: Thirty Artful, Unvarnished, Hilarious, Harrowing, Totally True Tales",213,Go to website,,labor day birth stories for the twentyfirst century thirty artful unvarnished hilarious harrowing totally true tales
3,2156782,1,3.0,"Mallory and the Trouble With Twins (The Baby-Sitters Club, #21)",77,Go to website,,mallory and the trouble with twins the babysitters club 21
4,34331816,2,0.0,"The Slave Boy (The Orfeo Saga, #6)",6,Go to website,,the slave boy the orfeo saga 6
5,445321,2,0.0,Amy's Answering Machine: Messages from Mom,195,Go to website,,amys answering machine messages from mom
6,471031,3,0.0,The Day We Met,104,Go to website,,the day we met
