In [5]:
%load_ext autoreload
%autoreload 2
import sys, os
sys.path.append(os.path.abspath(".."))

from pipelines.bookread_pipeline import build_pipeline, load_raw, make_cluster_labels


In [2]:
import importlib
import pipelines.bookread_pipeline as bookread_pipeline
importlib.reload(bookread_pipeline)
import numpy as np
import pandas as pd

In [6]:
df = load_raw("../raw_data/goodreads_books.json.gz", nrows=50_000)
pipe = build_pipeline()

KeyboardInterrupt: 

In [3]:
df_test = df.sample(5, random_state=42)
df_train = df.drop(df_test.index)


pipe.fit(df_train)

# données clean + features SUR LE TRAIN
df_train_clean = pipe.named_steps["clean"].transform(df_train)
df_train_feat  = pipe.named_steps["build_features"].transform(df_train_clean)

# clusters appris sur le train
df_train_feat["cluster"] = pipe.named_steps["cluster"].labels_

cluster_label_map = make_cluster_labels(df_train_feat, pipe)


pred = pipe.predict(df_test)

print(pred)

[19 14 13 24]


In [16]:
df_train_feat.sample()

Unnamed: 0,text_reviews_count,series,country_code,language_code,popular_shelves,is_ebook,average_rating,similar_books,description,link,...,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series,combined_text,cluster
2678,9,[512153],US,eng,"[{'count': '292', 'name': 'to-read'}, {'count'...",False,4.09,"[26236891, 26071375, 22392531, 12640824, 25287...",When an Iranian nuclear scientist wants to def...,https://www.goodreads.com/book/show/18816525-f...,...,2013,https://www.goodreads.com/book/show/18816525-f...,https://images.gr-assets.com/books/1385965970m...,18816525,33,26757175,"Fog of War (Justin Hall, #3)","Fog of War (Justin Hall, #3)","fog of war (justin hall, #3) when an iranian n...",15


In [9]:
for c in [19, 14, 13, 24]:
    print(c, "->", cluster_label_map[c])


19 -> ebook, romance, kindle-books, kindle, free, books
14 -> short-stories, fiction, books-i-own, short, stories, horror
13 -> fiction, books-i-own, owned-books, fiction, books, literature
24 -> romance, contemporary-romance, contemporary, romance, series, kindle


In [10]:
pipe.named_steps["cluster"].inertia_

16593.69244230073

In [11]:
pipe.named_steps["cluster"].cluster_centers_.shape

(30, 29530)

# Build user pipeline

In [181]:
user = pd.read_csv("/Users/krahmed96/code/KRA96/The_Book_Thrift/raw_data/goodreads_library_export.csv")

In [184]:
user.sample(3)

Unnamed: 0,Book Id,Title,Author,Author l-f,Additional Authors,ISBN,ISBN13,My Rating,Average Rating,Publisher,...,Date Read,Date Added,Bookshelves,Bookshelves with positions,Exclusive Shelf,My Review,Spoiler,Private Notes,Read Count,Owned Copies
120,2657,To Kill a Mockingbird,Harper Lee,"Lee, Harper",,"=""0060935464""","=""9780060935467""",4,4.26,Harper Perennial Modern Classics,...,,2014/08/11,,,read,,,,1,0
74,25479900,"The Awakening (Gifting, #2)",K.E. Ganshert,"Ganshert, K.E.",Katie Ganshert,"=""""","=""""",0,4.32,K.E. Ganshert Books,...,,2016/02/05,,,read,,,,1,0
80,5568007,"Roadside Crosses (Kathryn Dance, #2)",Jeffery Deaver,"Deaver, Jeffery",,"=""1416549994""","=""9781416549994""",2,3.85,Simon & Schuster,...,,2015/08/06,,,read,,,,1,0


In [None]:
# Should have a way to get sparse matrix for collaborative system
# And a way to get books user has read for content system
# And a way to get data that can be used to collect book info from an API for
# books not in the original books dataset

from sklearn.pipeline import FunctionTransformer
## sparse matrix for collaborative system
def downcast(df, cols, dtype: str):
    """
    Downcast datatypes to reduce memory usage.
    """
    int_8_cols = ["is_read", "rating", "is_reviewed"]
    df[int_8_cols] = df[int_8_cols].astype("int8")
    df[["user_id", "book_id"]] = df[["user_id", "book_id"]].astype("int32")
    return df

def get_score(interactions_df): #TODO: Should this score be modified?
    """
    generate score from user interactions data.
    """
    return interactions_df["is_read"] \
        + interactions_df["rating"] \
        + interactions_df["is_reviewed"] * 2


In [None]:
# Keeping necessary columns
df_train_feat.columns

Index(['text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'is_ebook', 'average_rating', 'similar_books',
       'description', 'link', 'authors', 'num_pages', 'isbn13',
       'publication_year', 'url', 'image_url', 'book_id', 'ratings_count',
       'work_id', 'title', 'title_without_series', 'combined_text', 'cluster'],
      dtype='object')

In [25]:
features = ["book_id",
            "combined_text",
            "ratings_count",
            "average_rating",
            "text_reviews_count",
            "publication_year"]

final_features = df_train_feat[features].set_index("book_id")
final_features.head(5)

Unnamed: 0_level_0,combined_text,ratings_count,average_rating,text_reviews_count,publication_year
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7327624,"the unschooled wizard (sun wolf and starhawk, ...",140,4.03,7,1987.0
6066819,best friends forever addie downs and valerie a...,51184,3.49,3282,2009.0
33394837,the house of memory (pluto's snitch #2) netga...,269,4.33,60,
89373,the bonfire of the vanities fiction classics ...,77,3.82,19,
89376,heaven what is heaven really going to be like?...,7345,4.26,566,


In [37]:
# Building feature matrix
X = pipe.fit_transform(df_train_feat)

In [106]:
# Normalize X for easier cosine similarity calculation later
def l2_normalize_rows(X):
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    norms[norms == 0] = 1   # avoid division by zero
    return X / norms

X_normalised = l2_normalize_rows(X)

In [68]:
# Create mapping from book_id to index just to be safe
book_id_to_index = {book_id: i for i, book_id in enumerate(final_features.index.tolist())}

In [39]:
display(X.shape)
display(final_features.shape)

(18617, 30)

(18617, 5)

In [97]:
# Test with psuedo user
user_books = np.random.choice(final_features.index, 33)

In [98]:
# Grab vector by index for user's books
X_user_books = [book_id_to_index[b_id] for b_id in user_books]
print(X_user_books)

[14073, 15375, 11638, 6156, 9876, 11886, 12706, 6068, 5470, 10913, 13011, 4160, 18054, 6696, 4, 3599, 11787, 12554, 11916, 5615, 9689, 12168, 4940, 17125, 782, 6949, 14789, 10916, 1127, 7796, 11143, 17412, 2706]


In [None]:
# build user profile vector
mean_user_vector = np.average([X_normalised[u_book] for u_book in X_user_books], axis=0)

# Normalise user vector - similar to scaling
mean_user_vector

array([0.23397333, 0.17239943, 0.17538741, 0.17802512, 0.18707075,
       0.17632108, 0.17701439, 0.1733116 , 0.17389245, 0.17683548,
       0.18395923, 0.17859502, 0.19345355, 0.17001211, 0.17152487,
       0.18143925, 0.169872  , 0.17378438, 0.18011479, 0.17758589,
       0.18149111, 0.18416009, 0.18080728, 0.2407178 , 0.17330073,
       0.18255816, 0.1799621 , 0.17532109, 0.17808442, 0.17452072])

In [124]:
# Create pseudo book shelf
book_shelf = np.random.choice(final_features.index, 100)
shelf_index = np.array([book_id_to_index[b_id] for b_id in book_shelf])

# select vectors from X_normalised based on shelf index computed above
X_shelf = [X_normalised[s] for s in shelf_index]

In [135]:
# Compute similarity scores
scores = [option.dot(mean_user_vector) for option in X_shelf]
scores = np.array(scores)

# get top-n recs
N = 10
top_idx = np.argpartition(scores, -N)[-N:]        # fast partial sort
top_idx = top_idx[np.argsort(scores[top_idx])[::-1]]

In [139]:
rec_index = [shelf_index[i] for i in top_idx]
recs = final_features.index[rec_index].tolist()

In [143]:
# load book titles
books = pd.read_csv("/Users/krahmed96/code/KRA96/The_Book_Thrift/book_thrift_app/book_titles.csv")

In [146]:
rec_names = books["title"][books["book_id"].isin(recs)]
rec_names

4728                                  Sunset In St. Tropez
6810                        What Remains (What Remains #1)
13862                                    The Misfortunates
15509                                     Ipswich Unzipped
20014                                Doctor Who: Bloodtide
28137                                   Ready, Set, Curate
34411     Blind Man's Bluff (Star Trek: New Frontier, #18)
45203                                           What Am I?
46666                                                Agony
47471    Master of Pemberley: A Pride and Prejudice Var...
Name: title, dtype: object

## Try with my own books

In [174]:
my_profile = pd.read_csv("/Users/krahmed96/code/KRA96/The_Book_Thrift/raw_data/goodreads_library_export.csv")
read_books = my_profile["Book Id"][
    (my_profile["My Rating"] != 0) |
    (my_profile["Exclusive Shelf"] == "read")
]

In [None]:
# Get index for my books to match to vectors array
my_books = [book_id_to_index.get(b_id) for b_id in read_books]
my_books

# build user profile vector
mean_user_vector = np.average([X_normalised[u_book] for u_book in my_books], axis=0)

# Normalise user vector - similar to scaling
mean_user_vector

# Create pseudo book shelf
book_shelf = np.random.choice(final_features.index, 1000)
shelf_index = np.array([
                        [b_id] for b_id in book_shelf])

# select vectors from X_normalised based on shelf index computed above
X_shelf = [X_normalised[s] for s in shelf_index]

# Compute similarity scores
scores = [option.dot(mean_user_vector) for option in X_shelf]
scores = np.array(scores)

# get top-n recs
N = 10
top_idx = np.argpartition(scores, -N)[-N:]        # fast partial sort
top_idx = top_idx[np.argsort(scores[top_idx])[::-1]]

rec_index = [shelf_index[i] for i in top_idx]
recs = final_features.index[rec_index].tolist()

rec_names = books["title"][books["book_id"].isin(recs)]
rec_names

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (67,) + inhomogeneous part.