In [None]:
import pandas as pd
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Load the current user's Goodreads export into the my_books dataframe,
# using the first CSV column as the row index.
# book_id is cast to str in the same chain so it compares cleanly with the
# string ids used in the rest of the notebook.
my_books = pd.read_csv("karl_good_reads.csv", index_col=0).astype({"book_id": str})

In [None]:
# Display the current user's books as a quick sanity check of the load above
my_books

In [None]:
# Finding users similar to the current user, step 1:
# load the mapping file that translates the ids used in the interactions csv
# (csv_id) into book ids (book_id).
csv_book_mapping = {}
with open("book_id_map.csv", "r") as f:
    # Iterating the file handle yields one line at a time and stops at EOF
    for line in f:
        line = line.strip()  # Drop the trailing newline / surrounding whitespace
        if not line:
            # A blank line cannot be unpacked into two fields, so skip it
            continue

        csv_id, book_id = line.split(",")  # Each row is "csv_id,book_id"
        csv_book_mapping[csv_id] = book_id

In [None]:
# Unique book ids the current user has read, held in a set for fast membership tests
book_set = {book_id for book_id in my_books["book_id"]}

In [None]:
# Key   = user_id
# Value = number of books that user has in common with the current user
overlap_users = {}

# First pass over the 10 million row chunk of user ratings
with open("chunk0.csv", 'r') as f:
    for line in f:
        if not line.strip():
            # A blank line cannot be unpacked into five fields, so skip it
            continue

        # Only the user and the csv-level book id are needed in this pass
        user_id, csv_id, _, _, _ = line.split(",")
        # Translate the csv id via csv_book_mapping; unknown ids give None,
        # which is never in book_set, so such rows are ignored.
        book_id = csv_book_mapping.get(csv_id)
        if book_id in book_set:
            # The current user has this book too: start the user's count at 1
            # on first sight, otherwise increase it by 1.
            overlap_users[user_id] = overlap_users.get(user_id, 0) + 1

In [None]:
# Keep only the users who share more than 10% of the current user's books;
# users with a smaller overlap are not useful for the recommendation.
min_shared_books = my_books.shape[0] / 10  # 10%
filtered_overlap_users = {
    user for user, shared in overlap_users.items() if shared > min_shared_books
}

In [None]:
# Rows of [user_id, book_id, rating] for every user kept in filtered_overlap_users
interactions_list = []

# Second pass over the 10 million row user interaction csv file
with open("chunk0.csv", 'r') as f:
    for line in f:
        if not line.strip():
            # A blank line cannot be unpacked into five fields, so skip it
            continue

        user_id, csv_id, _, rating, _ = line.split(",")
        # Only users selected for the recommendation contribute their ratings
        if user_id not in filtered_overlap_users:
            continue

        # Use .get, as in the first pass, so a csv id that is missing from the
        # mapping is skipped instead of raising KeyError.
        book_id = csv_book_mapping.get(csv_id)
        if book_id is None:
            continue
        interactions_list.append([user_id, book_id, rating])

In [None]:
# Turn the collected interaction rows into a pandas dataframe with the
# columns user_id, book_id and rating
interaction_columns = ["user_id", "book_id", "rating"]
interactions = pd.DataFrame(data=interactions_list, columns=interaction_columns)

In [None]:
# Prepend the current user's own ratings to the similar users' interactions.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of my_books and interactions are both kept and can repeat.
# NOTE: re-running this cell prepends my_books again, so run it once per kernel.
interactions = pd.concat(
    [my_books[["user_id", "book_id", "rating"]], interactions],
    ignore_index=True,
)
interactions

In [None]:
# Data preprocessing
# Both id columns become strings (the same type used for book_id in
# books_titles.json) and rating becomes numeric.
for id_column in ("book_id", "user_id"):
    interactions[id_column] = interactions[id_column].astype(str)
interactions["rating"] = pd.to_numeric(interactions["rating"])

In [None]:
# Add user_index / book_index columns: each id column is converted to a
# categorical and its integer codes are stored, so identical ids always
# share the same code.
for id_column, index_column in (("user_id", "user_index"), ("book_id", "book_index")):
    interactions[index_column] = interactions[id_column].astype("category").cat.codes

In [None]:
# user_index is an integer code for each user_id, so it can be used directly
# as a row position in the ratings matrix.
# e.g. row 0 is expected to be the current user's id (assumes "-1" receives code 0 — TODO confirm)

# Spot-check the row at position 100 to see the generated index columns
interactions.iloc[100]

In [None]:
# Sparse user x book ratings matrix, built with scipy's coo_matrix.
# Only the supplied (row, column, value) triples are stored, so cells without
# a rating take no space.
# Rows come from user_index, columns from book_index, values from rating.
row_positions = interactions["user_index"]
col_positions = interactions["book_index"]
ratings_mat_coo = coo_matrix(
    (interactions["rating"], (row_positions, col_positions))
)

In [None]:
# Convert the COO matrix to CSR format; the later cells slice a single row out of it
ratings_mat = ratings_mat_coo.tocsr()

In [None]:
# The dataset is large, so it is hard to pick an id that is guaranteed not to
# belong to an existing user; the current user is therefore stored under the
# user_id "-1".
# Display the current user's rows to confirm they are present in interactions.
interactions[interactions["user_id"] == "-1"]

In [None]:
# Row of the current user (user_id "-1") in the ratings matrix.
# Look the value up from the user_index column instead of hard-coding 0, so the
# row stays correct whatever category code "-1" was given.
current_user_rows = interactions.loc[interactions["user_id"] == "-1", "user_index"]
if current_user_rows.empty:
    raise ValueError('current user (user_id "-1") not found in interactions')
my_index = int(current_user_rows.iloc[0])

In [None]:
# Use cosine similarity to find the users most similar to the current user;
# their ratings are then used to build the recommendation.
# cosine_similarity is already imported in the notebook's first cell, so it is
# not imported again here.

# Compare the current user's row with every row of the ratings matrix
# (the current user's own row included) and flatten the result into a
# 1-D numpy array.
# Cosine similarity: 1 = best (highest) similarity.
similarity = cosine_similarity(ratings_mat[my_index,:], ratings_mat).flatten()

print(similarity)
print(len(similarity))

In [None]:
# Select the most similar users: at most MAX_SIMILAR_USERS of them, or every
# user when the similarity array is shorter than that.
# np.argpartition moves the positions of the k largest values into the last k
# slots (in no particular order), which is all that is needed here.
MAX_SIMILAR_USERS = 10
n_similar_users = min(MAX_SIMILAR_USERS, len(similarity))
indices = np.argpartition(similarity, -n_similar_users)[-n_similar_users:]

In [None]:
# Display the matrix row positions (user_index values) of the selected users
indices

In [None]:
# Copy every interaction row whose user_index is one of the selected indices.
# similar_users keeps user_id, book_id, rating, user_index and book_index.
is_selected_user = interactions["user_index"].isin(indices)
similar_users = interactions.loc[is_selected_user].copy()
print(similar_users)

In [None]:
# Drop the current user's own rows (user_id "-1") from the similar users
similar_users = similar_users.loc[similar_users["user_id"].ne("-1")]

In [None]:
# Display the similar users together with their book ids and the corresponding ratings
similar_users

In [None]:
# Group the similar users' ratings by book_id and compute, per book:
#   count - how many ratings the book has among the similar users
#   mean  - the average of those ratings
ratings_by_book = similar_users.groupby("book_id")["rating"]
book_recs = ratings_by_book.agg(["count", "mean"])
book_recs

In [None]:
# Load the book titles, used to map each book_id to its title;
# book_id is cast to str so it matches the book_id type used in book_recs.
books_titles = pd.read_json("books_titles.json").astype({"book_id": str})

In [None]:
# Inner-join the two dataframes on book_id to attach the title data to each
# recommended book; books without a match in books_titles are dropped.
book_recs = pd.merge(book_recs, books_titles, how="inner", on="book_id")
book_recs

In [None]:
# Normalise how often a book appears among the similar users by the book's
# value in the "ratings" column, so the result favours books that matter to
# similar users and not just books that are popular in the entire Goodreads dataset.
share_of_ratings = book_recs["count"] / book_recs["ratings"]
book_recs["adjusted_count"] = book_recs["count"] * share_of_ratings

In [None]:
# Score = adjusted appearance count among similar users x their mean rating
book_recs["score"] = book_recs["adjusted_count"] * book_recs["mean"]

In [None]:
# Drop recommendations for books that already appear in my_books,
# i.e. books the current user has already read and rated.
already_read = book_recs["book_id"].isin(my_books["book_id"])
book_recs = book_recs.loc[~already_read]

In [None]:
# Keep books with a mean rating of at least 3.5. A higher cut-off leaves too
# few recommendations, because the 10 million entries only cover just over
# 3k unique users.
book_recs = book_recs.loc[book_recs["mean"].ge(3.5)]

In [None]:
# Keep only books with more than 2 ratings among the similar users
book_recs = book_recs.loc[book_recs["count"].gt(2)]

In [None]:
# Rank the recommendations, best first. The mean rating stays the primary key;
# the score computed earlier breaks ties between books with the same mean,
# which would otherwise end up in arbitrary order.
top_recs = book_recs.sort_values(["mean", "score"], ascending=False)

In [None]:
# Show the top recommendations, keeping only the book_id and title columns
top_recs[["book_id", "title"]]