# Enable the model to read my Goodreads data

## Imports

In [50]:
import os

import joblib
import numpy as np
import pandas as pd
from scipy import sparse

## Define Get Score

In [47]:
def get_score(interactions_df):
    """
    Collapse a user's interaction signals into a single additive score.

    Reads the ``is_read``, ``rating`` and ``is_reviewed`` entries of
    ``interactions_df`` and returns ``is_read + rating + is_reviewed * 2``,
    so a review weighs twice as much as the read flag.
    """
    # TODO: Should this score be modified?
    read_plus_rating = interactions_df["is_read"] + interactions_df["rating"]
    return read_plus_rating + interactions_df["is_reviewed"] * 2

## Load Datasets

In [29]:
# Book catalogue; later cells read its `book_id`, `work_id` and `title` columns.
book_titles = pd.read_csv("../raw_data/book_titles.csv")
# Personal Goodreads library export (one row per shelved book, original
# Goodreads column names such as "Book Id" and "My Rating").
user_raw = pd.read_csv("../raw_data/goodreads_library_export.csv")

## Explore User Data

In [28]:
# Show two randomly sampled rows of the Goodreads export (unseeded, so the
# rows differ between runs).
print("Sample of user data")
display(user_raw.sample(2))

# See columns
print("\nColumns in User Data")
display(user_raw.columns)

Sample of user data


Unnamed: 0,Book Id,Title,Author,Author l-f,Additional Authors,ISBN,ISBN13,My Rating,Average Rating,Publisher,...,Date Read,Date Added,Bookshelves,Bookshelves with positions,Exclusive Shelf,My Review,Spoiler,Private Notes,Read Count,Owned Copies
40,34812868,And Then I Gave Up: Essays About Faith and Spi...,Umm Zakiyyah,"Zakiyyah, Umm",,"=""""","=""""",0,4.44,Al-Walaa Publications,...,2021/05/20,2021/05/10,,,read,,,,1,0
100,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling,"Rowling, J.K.",Mary GrandPré,"=""043965548X""","=""9780439655484""",5,4.58,Scholastic Inc.,...,,2014/08/11,,,read,,,,1,0



Columns in User Data


Index(['Book Id', 'Title', 'Author', 'Author l-f', 'Additional Authors',
       'ISBN', 'ISBN13', 'My Rating', 'Average Rating', 'Publisher', 'Binding',
       'Number of Pages', 'Year Published', 'Original Publication Year',
       'Date Read', 'Date Added', 'Bookshelves', 'Bookshelves with positions',
       'Exclusive Shelf', 'My Review', 'Spoiler', 'Private Notes',
       'Read Count', 'Owned Copies'],
      dtype='object')

In [None]:
# Width of the single-row user matrix. Book ids are used directly as column
# indices, so every id kept below must satisfy 0 <= book_id < N_ITEMS.
# NOTE(review): presumably this equals the trained model's item count — confirm.
N_ITEMS = 374976

# Keep only needed columns and normalise their names to snake_case
# ("Book Id" -> "book_id"). `rename` returns a new frame, so `user_raw` is
# left untouched and no SettingWithCopyWarning is triggered further down.
user = user_raw[
    ["Book Id",
     "My Rating",
     "Exclusive Shelf",
     "My Review"]
].rename(columns=lambda col: "_".join(col.lower().split()))

# Keep only book ids present in our book titles dataset.
# Strictly less than N_ITEMS: column index N_ITEMS would be out of bounds.
my_books = set(user["book_id"][user["book_id"] < N_ITEMS])
all_books = set(book_titles["book_id"])
common_books = pd.Series(sorted(my_books.intersection(all_books)))

user = (
    user[user["book_id"].isin(common_books)]
    # One row per book so each matrix column receives exactly one score.
    .drop_duplicates(subset="book_id")
    # Binary is_read and is_reviewed flags
    .assign(
        is_read=lambda df: (df["exclusive_shelf"] == "read").astype(int),
        is_reviewed=lambda df: df["my_review"].notna().astype(int),
    )
    # Drop my review and exclusive shelf
    .drop(columns=["exclusive_shelf", "my_review"])
    # Rename my rating column
    .rename(columns={"my_rating": "rating"})
)

# Get score
user = user.assign(score=get_score(user))

# Create CSR Matrix.
# Column ids and scores are both taken from `user`, row by row, so each score
# lands in the column of its own book. (Taking the columns from the set
# intersection instead would pair scores with ids in an unrelated order.)
cols = user["book_id"].to_numpy(dtype=np.int32)
data = user["score"].to_numpy(dtype=np.int32)
rows = np.zeros(len(user), dtype=np.int32)

user_mat = sparse.csr_matrix(
    (data, (rows, cols)), shape=(1, N_ITEMS)
)
user_mat


<Compressed Sparse Row sparse matrix of dtype 'int32'
	with 48 stored elements and shape (1, 374976)>

In [9]:
# Build the slim profile frame directly from the raw Goodreads export so the
# cell works on a fresh kernel instead of depending on a leftover `profile`
# variable that no cell in this notebook defines.
# NOTE(review): assumes `profile` was originally the same export as `user_raw` — confirm.
profile = user_raw.rename(
    # clean columns: "Exclusive Shelf" -> "exclusive_shelf"
    columns=lambda col: "_".join(col.lower().strip().split())
)[["book_id",
   "my_rating",
   "exclusive_shelf",
   "date_read",
   "my_review"]]


Unnamed: 0,book_id,isbn,title,my_rating,exclusive_shelf,date_read,my_review
98,3,"=""0439554934""",Harry Potter and the Sorcerer's Stone (Harry P...,5,read,,
41,16793,"=""0061142026""",Stardust,5,read,,"I read this over a few weeks, slowly and in bi..."


## Load Book Titles

### Check work ids and book ids in the whole dataset

In [24]:
# NOTE(review): re-reads the same path as the "Load Datasets" cell above and
# rebinds `book_titles`; the second read looks redundant — confirm before removing.
book_titles = pd.read_csv("../raw_data/book_titles.csv")

In [28]:
# Count missing values per id column of the catalogue.
book_titles[["work_id", "book_id"]].isna().sum()

work_id    524
book_id      0
dtype: int64

In [None]:
class ALSRecommender():
    """
    Wraps the implicit ALS model to:
    - Load pickled model
    - Pass a user's Goodread's library download
    - Get back recommendations
    """

    def __init__(
        self,
        model_path: str = os.environ["MODEL_PATH"],
        book_titles_path: str = "/Users/krahmed96/code/KRA96/The_Book_Thrift/raw_data/book_titles.csv",
        book_id_col: str = "book_id"
    ) -> None:
        artifact = joblib.load(model_path)
        self.model = artifact["model"]
        self.n_items = artifact["n_items"]

        # Read book titles file and store it
        self.book_titles: pd.Dataframe | None = None
        titles = pd.read_csv(book_titles_path)
        titles = titles.set_index(book_id_col)
        self.book_titles = titles


    def _get_user_profile(self,
                          profile_csv="/Users/krahmed96/code/KRA96/The_Book_Thrift/raw_data/goodreads_library_export.csv"
        ) -> sparse.csr_matrix:
        """
        Takes a user's csv upload and creates a user csr matrix to use within
        the ALS model
        """
        user = pd.read_csv(profile_csv)
        print(f"Goodreads columns: {user.columns}\n")
        print(f"Self.book_titles columns: {self.book_titles.columns}")
        user_books = self.book_titles[self.book_titles.index.intersection(user["Book Id"])]

        if user_books.empty:
            raise ValueError("No overlapping book_ids between user CSV and catalogue")

        user_scores = get_score(user)
        cols = user_books.astype(np.int32)
        rows = np.zeros_like(user_books, dtype=np.int32)

        return sparse.csr_matrix((user_scores, (rows, cols)), shape=(1, self.n_items))

    def recommend_books(self,
                        n_recs: int = 20):
        user_items = self._get_user_profile()

        rec_ids, scores = self.model.recommend(
            userid=0,
            user_items=user_items,
            N=n_recs
        )

        rec_ids = rec_ids.astype(int)

        titles = self.book_titles.loc[self.book_titles.index.intersection(rec_ids)].copy()
        titles = titles.reindex(rec_ids)

        res = []
        for bid, score in zip(rec_ids, scores):
            row = titles.loc[bid] if bid in titles.index else {}
            res.append(
                {"Book": row.get("title")}
            )

        return res

In [3]:
# NOTE(review): machine-specific absolute path; earlier cells read what looks
# like the same file via "../raw_data/book_titles.csv" — confirm and unify.
book_titles_path: str = "/Users/krahmed96/code/KRA96/The_Book_Thrift/raw_data/book_titles.csv"

# Load the catalogue and index it by book id.
titles = pd.read_csv(book_titles_path)
titles = titles.set_index("book_id")
# Three random rows (unseeded, so the display changes between runs).
titles.sample(3)

Unnamed: 0_level_0,Unnamed: 0,isbn,isbn13,work_id,title
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
32603800,2268379,,,1205304.0,Alexander Hamilton
27303292,190176,,,47356129.0,Mountain Mirror (Mirrors of Time Book 5)
29862119,191315,,,50228164.0,Sweet Peas in April (Flowers Can Be Fatal #4)


In [4]:
# NOTE(review): machine-specific absolute path; the "Load Datasets" cell reads
# what looks like the same export via a relative path — confirm and unify.
profile_csv = "/Users/krahmed96/code/KRA96/The_Book_Thrift/raw_data/goodreads_library_export.csv"
# NOTE: rebinds `user`, replacing the cleaned frame built in the earlier cell.
user = pd.read_csv(profile_csv)
user_books = user["Book Id"]
# Keep the sample as the last expression so the notebook actually renders it.
user.sample(3)

Unnamed: 0,Book Id,Title,Author,Author l-f,Additional Authors,ISBN,ISBN13,My Rating,Average Rating,Publisher,...,Date Read,Date Added,Bookshelves,Bookshelves with positions,Exclusive Shelf,My Review,Spoiler,Private Notes,Read Count,Owned Copies
67,6043849,"The Ask and the Answer (Chaos Walking, #2)",Patrick Ness,"Ness, Patrick",,"=""1406310263""","=""9781406310269""",0,4.18,Walker Books Ltd.,...,,2016/04/14,,,read,,,,1,0
85,332775,"Incarceron (Incarceron, #1)",Catherine Fisher,"Fisher, Catherine",,"=""0340893605""","=""9780340893609""",0,3.65,Hodder Children's Books,...,,2015/08/02,to-read,to-read (#10),to-read,,,,0,0
41,16793,Stardust,Neil Gaiman,"Gaiman, Neil",,"=""0061142026""","=""9780061142024""",5,4.1,Harper Perennial,...,,2021/05/16,,,read,"I read this over a few weeks, slowly and in bi...",,,1,0
