# Reading in books we like 

In [4]:
import pandas as pd

my_books = pd.read_csv("liked_books.csv", index_col=0)  # If you take a look at our file, our first column is an index column 
# so now we actually have 2 index colums, our real index and the index from within our file itself. 
# To fix this we set our file index column, which is column 0 to be our index column using "index_col = 0"

In [5]:
my_books

Unnamed: 0,user_id,book_id,rating,title
0,-1,2517439,5,"The Forever War (The Forever War, #1)"
1,-1,113576,5,The Smartest Guys in the Room: The Amazing Ris...
2,-1,35100,5,Battle Cry of Freedom
3,-1,228221,5,The Mask of Command
5,-1,17662739,5,"2001: A Space Odyssey (Space Odyssey, #1)"
6,-1,356824,5,India After Gandhi: The History of the World's...
7,-1,12125412,5,The Lady or the Tiger?: and Other Logic Puzzles
8,-1,139069,5,Endurance: Shackleton's Incredible Voyage
10,-1,76680,5,"Foundation (Foundation, #1)"
11,-1,1898,5,Into Thin Air: A Personal Account of the Mount...


In [9]:
# Control if our book_id values are string/text data types 

if my_books['book_id'].dtypes == 'object':
    print('yes, they are')
else: 
    print('no, they are not')

no, they are not


In [10]:
# converting our book_id's to string values 

my_books['book_id'] = my_books['book_id'].astype(str)

# Finding similar users

In [11]:
!head book_id_map.csv

book_id_csv,book_id
0,34684622
1,34536488
2,34017076
3,71730
4,30422361
5,33503613
6,33517540
7,34467031
8,6383669


In [12]:
# As in the recommendation file, we first make a dictionary which maps (key->value) 
# the file containing the id's of the books with file that contains information about the id's/books

csv_book_mapping = {}

with open("book_id_map.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        csv_id, book_id = line.strip().split(",") 
        csv_book_mapping[csv_id] = book_id

In [13]:
# Making a unique list containing our liked books

book_set = set(my_books["book_id"])

In [15]:
!head goodreads_interactions.csv

user_id,book_id,is_read,rating,is_reviewed
0,948,1,5,0
0,947,1,5,1
0,946,1,5,0
0,945,1,5,0
0,944,1,5,0
0,943,1,5,0
0,942,1,5,0
0,941,1,5,0
0,940,1,5,0


# Creating a dictionary

In [16]:
overlap_users = {}

with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")  # For every line we are splitting every comma as you can see above and we only take the "user_id", "book_id" and the "rating", I don't care about the other 2 
        
        book_id = csv_book_mapping.get(csv_id) # Using .get method returns "None" if there is a specific key not available in stead of raising a Keyerror

        
        if book_id in book_set:
            if user_id not in overlap_users: # Keeping count of how many times a user has liked a book tht we have liked 
                overlap_users[user_id] = 1
            else:
                overlap_users[user_id] += 1 # Returns a dictionary with other users as key and their values the count described above 

In [23]:
len(overlap_users)

316341

In [22]:
# Only finding users that have liked the same books as us 
# Only showing the users who have read at least 20% of the books that we have read 

filtered_overlap_users = set([u for u in overlap_users if overlap_users[u] > my_books.shape[0]/5])

In [24]:
len(filtered_overlap_users)

1258

# Finding similar users book ratings 

In [29]:
# For every user that has read a specific amount of books that we have read (cell above) 
# we will create a list with all the books they have read 

interactions_list = []

with open ("goodreads_interactions.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")

        if user_id in filtered_overlap_users:
            book_id = csv_book_mapping[csv_id]
            interactions_list.append([user_id, csv_id, rating])

# Creating a user/book matrix 

In [30]:
len(interactions_list)

5638701

In [31]:
interactions_list[0]

['282', '1064', '4']

In [33]:
# Turning our list into a dataframe and naming the columns

interactions = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])

In [34]:
interactions

Unnamed: 0,user_id,book_id,rating
0,282,1064,4
1,282,739,4
2,282,7220,4
3,282,1041,3
4,282,787,4
...,...,...,...
5638696,804100,145214,0
5638697,804100,126,0
5638698,804100,8250,0
5638699,804100,17400,0


In [36]:
# Putting our ratings into the list with the ratings of everybody else 

interactions = pd.concat([my_books[["user_id", "book_id", "rating"]], interactions])

# pd.concat is a function in the pandas library that concatenates
# (combines) multiple DataFrames along a specified axis.

In [37]:
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,2517439,5
1,-1,113576,5
2,-1,35100,5
3,-1,228221,5
5,-1,17662739,5
...,...,...,...
5638696,804100,145214,0
5638697,804100,126,0
5638698,804100,8250,0
5638699,804100,17400,0


In [38]:
# Making sure we have the right data types 

# Our book_id's and user_id's are strings in our json file so we want to make sure we have the same dtypes (str)
# to be able to allign properly between files, lists, ... (our ratings are numbers) 

interactions["book_id"] = interactions["book_id"].astype(str)
interactions["user_id"] = interactions["user_id"].astype(str)
interactions["rating"] = pd.to_numeric(interactions["rating"])

In [39]:
# Giving each "user_id" a unique value

interactions["user_id"].unique()

array(['-1', '282', '874', ..., '442043', '712588', '804100'],
      dtype=object)

In [40]:
# Converting al the values/user_id's that are the same to the same category 
# (creates a number that corresponds to a specific user_id) 

interactions["user_index"] = interactions["user_id"].astype("category").cat.codes

In [42]:
interactions["user_index"].unique()

array([   0,  555, 1216, ..., 1054, 1143, 1183], dtype=int16)

In [44]:
interactions["book_index"] = interactions["book_id"].astype("category").cat.codes

# .cat.codes converts categorical values into numerical values to make the data more compact en better to act on 

In [45]:
len(interactions["book_index"].unique())

802886

In [48]:
len(interactions["user_index"].unique())

1259

# Creating a sparse matrix 

Because of the fact that we are working with a lot of data were not every position/value is filled we are using a sparse matrix in stead of a dense matrix 

In [49]:
from scipy.sparse import coo_matrix

# Arguments coo_matrix((wanted data in the cells, row positions, column positions))

ratings_mat_coo = coo_matrix((interactions["rating"], (interactions["user_index"], interactions["book_index"])))

In [50]:
ratings_mat_coo

<1259x802886 sparse matrix of type '<class 'numpy.int64'>'
	with 5638728 stored elements in COOrdinate format>

In [51]:
ratings_mat = ratings_mat_coo.tocsr()

# Is used to convert a sparse matrix to a compressed sparse row format ('csr'),
# in order to optimize memory usage and enable better/efficient matrix operations.
# the csr format provides faster acces to row-based operations -> faster (numerical) computations and algorithms

# Finding users similar to us

In [53]:
interactions[interactions["user_id"] == "-1"].head()

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,2517439,5,0,337556
1,-1,113576,5,0,57909
2,-1,35100,5,0,413689
3,-1,228221,5,0,317541
5,-1,17662739,5,0,242200


In [55]:
# Setting our own user_id index to a seperate index 
# row 0 in the matrix will be our book ratings

my_index = 0

In [56]:
# cosine similarity finds the similarity between 2 rows in our matrix, 
# i.e. how similar their(other readers) readings are compared to ours

from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(ratings_mat[my_index,:], ratings_mat).flatten()

In [60]:
similarity[0]

0.9999999999999999

In [62]:
# Finding the indices/users who are the most similar to us 

import numpy as np 

indices = np.argpartition(similarity, -15)[-15:] # argpartitions finds the positions of the highest values

In [63]:
indices

array([ 845,  655,  656,  894, 1243,  957,  167, 1100,  588,  951,  320,
         32,  161,  274,    0])

In [64]:
# Searching for their user id's 

similar_users = interactions[interactions["user_index"].isin(indices)].copy() # finds all the rows where the "user_index" is in our indices

In [65]:
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,2517439,5,0,337556
1,-1,113576,5,0,57909
2,-1,35100,5,0,413689
3,-1,228221,5,0,317541
5,-1,17662739,5,0,242200
...,...,...,...,...,...
5230521,416683,599649,0,957,587368
5230522,416683,67654,0,957,633909
5230523,416683,599651,0,957,587370
5230524,416683,921757,5,957,768366


In [66]:
# Taking ourselves out of the list 

similar_users = similar_users[similar_users["user_id"]!="-1"] # Only proceeding with the user_id's that aren't ours ("-1")

In [67]:
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
521343,41530,706,3,951,650295
521344,41530,317819,3,951,387638
521345,41530,30694,3,951,381295
521346,41530,7390,4,951,669590
521347,41530,7332,4,951,666071
...,...,...,...,...,...
5230521,416683,599649,0,957,587368
5230522,416683,67654,0,957,633909
5230523,416683,599651,0,957,587370
5230524,416683,921757,5,957,768366


# Creating book recommendations 

In [68]:
# Figuring out how many times each book appeared in the recommendations 

book_recs = similar_users.groupby("book_id").rating.agg(['count', 'mean']) # .agg here takes the average rating out of all the ratings it has received from users/readers 

In [69]:
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,0.000000
10,7,1.428571
100,7,1.285714
1000,9,2.111111
1000290,1,0.000000
...,...,...
99976,1,0.000000
99983,1,4.000000
99987,1,0.000000
99993,2,0.000000


In [70]:
# Reading in our book_titles again and making sure they are of the right dtype 

books_titles = pd.read_json("books_titles.json")
books_titles["book_id"] = books_titles["book_id"].astype(str)

In [76]:
# Merging our 2 datasets to get our book_titles, ... into our recommendations 

book_recs_merged = book_recs.merge(books_titles, how="inner", on="book_id")

In [77]:
book_recs_merged

Unnamed: 0,book_id,count,mean,title_x,ratings_x,url_x,cover_image_x,mod_title_x,title_y,ratings_y,url_y,cover_image_y,mod_title_y
0,10,7,1.428571,"Harry Potter Collection (Harry Potter, #1-6)",25245,https://www.goodreads.com/book/show/10.Harry_P...,https://images.gr-assets.com/books/1328867351m...,harry potter collection harry potter 16,"Harry Potter Collection (Harry Potter, #1-6)",25245,https://www.goodreads.com/book/show/10.Harry_P...,https://images.gr-assets.com/books/1328867351m...,harry potter collection harry potter 16
1,100,7,1.285714,Simply Beautiful Beading,75,https://www.goodreads.com/book/show/100.Simply...,https://s.gr-assets.com/assets/nophoto/book/11...,simply beautiful beading,Simply Beautiful Beading,75,https://www.goodreads.com/book/show/100.Simply...,https://s.gr-assets.com/assets/nophoto/book/11...,simply beautiful beading
2,1000,9,2.111111,Millionaire Women Next Door: The Many Journeys...,460,https://www.goodreads.com/book/show/1000.Milli...,https://s.gr-assets.com/assets/nophoto/book/11...,millionaire women next door the many journeys ...,Millionaire Women Next Door: The Many Journeys...,460,https://www.goodreads.com/book/show/1000.Milli...,https://s.gr-assets.com/assets/nophoto/book/11...,millionaire women next door the many journeys ...
3,100052,1,0.000000,Austerlitz,137,https://www.goodreads.com/book/show/100052.Aus...,https://s.gr-assets.com/assets/nophoto/book/11...,austerlitz,Austerlitz,137,https://www.goodreads.com/book/show/100052.Aus...,https://s.gr-assets.com/assets/nophoto/book/11...,austerlitz
4,100060,1,0.000000,When Godly People Do Ungodly Things: Finding A...,1201,https://www.goodreads.com/book/show/100060.Whe...,https://s.gr-assets.com/assets/nophoto/book/11...,when godly people do ungodly things finding au...,When Godly People Do Ungodly Things: Finding A...,1201,https://www.goodreads.com/book/show/100060.Whe...,https://s.gr-assets.com/assets/nophoto/book/11...,when godly people do ungodly things finding au...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14323,99943,1,4.000000,The Bhagavad-Gita,16,https://www.goodreads.com/book/show/99943.The_...,https://s.gr-assets.com/assets/nophoto/book/11...,the bhagavadgita,The Bhagavad-Gita,16,https://www.goodreads.com/book/show/99943.The_...,https://s.gr-assets.com/assets/nophoto/book/11...,the bhagavadgita
14324,99945,1,0.000000,Bhagavad Gita: A New Translation,965,https://www.goodreads.com/book/show/99945.Bhag...,https://images.gr-assets.com/books/1320542459m...,bhagavad gita a new translation,Bhagavad Gita: A New Translation,965,https://www.goodreads.com/book/show/99945.Bhag...,https://images.gr-assets.com/books/1320542459m...,bhagavad gita a new translation
14325,999660,1,0.000000,The Drowned World,58,https://www.goodreads.com/book/show/999660.The...,https://images.gr-assets.com/books/1330786609m...,the drowned world,The Drowned World,58,https://www.goodreads.com/book/show/999660.The...,https://images.gr-assets.com/books/1330786609m...,the drowned world
14326,99983,1,4.000000,The Best American Science and Nature Writing 2001,68,https://www.goodreads.com/book/show/99983.The_...,https://s.gr-assets.com/assets/nophoto/book/11...,the best american science and nature writing 2001,The Best American Science and Nature Writing 2001,68,https://www.goodreads.com/book/show/99983.The_...,https://s.gr-assets.com/assets/nophoto/book/11...,the best american science and nature writing 2001


# Ranking our book recommendations 

In [78]:
# Creating a count for how many times the book appeared to people like us (same interests), 
# relative to how many times it had appeared to other people. 
# We want only the books that were populair recommended to people like us, but not to other users with other interests

book_recs["adjusted_count"] = book_recs["count"] * (book_recs["count"] / book_recs["ratings"]) # trying to find the books that are specific to us

In [79]:
# Creating a score, how much we might like a specific book 

book_recs["score"] = book_recs["mean"] * book_recs["adjusted_count"]

In [80]:
# Taking out any books we have already read (method 1) 

book_recs = book_recs[~book_recs["book_id"].isin(my_books["book_id"])]

In [87]:
# Beacuse some books have duplicates -> data problem (method 2)
# Converting the titles of the books into the same (our) format 

my_books["mod_title"] = my_books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()

In [88]:
# Replacing any sequences of spaces by just 1 space 

my_books["mod_title"] = my_books["mod_title"].str.replace("\s+", " ", regex=True)

In [90]:
# Method 2, now deleting these duplicates out of our recommendations 

book_recs = book_recs[~book_recs["mod_title"].isin(my_books["mod_title"])]

In [91]:
# removing anything that appeared less then 2 

book_recs = book_recs[book_recs["count"]>2] # Only proceeding with the books that appeared more then 2x

In [92]:
# only finding/showing books where our mean(average) rating is 4 or higher 

book_recs = book_recs[book_recs["mean"]>=4] 

In [93]:
top_recs = book_recs.sort_values("mean", ascending=False)

# Improving the display of the book as we did in the "Search" notebook 

In [94]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val, val) # Gives us clickable links to the books

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val) # Renders/shows the cover image 

top_recs.style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
5624,24929,3,4.666667,Lost,12607,Goodreads,,lost,0.000714,0.003331
12486,74909,3,4.666667,How Wal-Mart Is Destroying America (And the World): And What You Can Do about It,139,Goodreads,,how walmart is destroying america and the world and what you can do about it,0.064748,0.302158
7548,35727,3,4.666667,The Laws of Spirit: A Tale of Transformation,753,Goodreads,,the laws of spirit a tale of transformation,0.011952,0.055777
4125,18950,3,4.666667,The Best of Cordwainer Smith,242,Goodreads,,the best of cordwainer smith,0.03719,0.173554
5658,25032,3,4.333333,Classical Drawing Atelier: A Contemporary Guide to Traditional Studio Practice,3148,Goodreads,,classical drawing atelier a contemporary guide to traditional studio practice,0.002859,0.012389
13561,87992,3,4.333333,Lucky Girls,800,Goodreads,,lucky girls,0.01125,0.04875
2600,14948,3,4.333333,A Writer's Diary,2966,Goodreads,,a writers diary,0.003034,0.013149
10409,56522,3,4.333333,Sacred Pathways: Discover Your Soul's Path to God,944,Goodreads,,sacred pathways discover your souls path to god,0.009534,0.041314
3116,16276,3,4.333333,John Henry Days,1605,Goodreads,,john henry days,0.005607,0.024299
4735,21321,6,4.333333,"Ultimate Spider-Man, Volume 7",230,Goodreads,,ultimate spiderman volume 7,0.156522,0.678261


# Tweakable parameters 

In [None]:
# Filtered_overlap_users 
# Screaping goodreas for more data 
# indices -> Taking more recoms in stead of 15 
# Adjusting how to 