# Get recommendations using TF-IDF vectorised embeddings and a user's books

In [1]:
import os, sys

# The notebook runs from "notebooks/", so the project root is the parent of the
# current working directory. Put it on sys.path (once) so the package imports.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

import book_thrift_app
from book_thrift_app import ML_logic
import pandas as pd
import numpy as np
from book_thrift_app import ocr
import joblib
import json
import gzip
from book_thrift_app.ML_logic.recommender import ALSRecommender
from sklearn.preprocessing import normalize
from book_thrift_app.ML_logic.collab_model import get_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Instantiate the ALS recommender that the following cells query.
als = ALSRecommender()
# Turn off pandas' chained-assignment warning (the default setting is 'warn').
pd.set_option("mode.chained_assignment", None)

In [15]:
# Build one profile per reader from their Goodreads library export CSV, using
# the recommender's `_get_user_profile` helper (presumably returns the
# structure `recommend_books` expects — confirm against ALSRecommender).
# NOTE(review): absolute, machine-specific paths; these only resolve on the
# original author's machine.
kalindi_books = als._get_user_profile("/Users/krahmed96/code/KRA96/The_Book_Thrift/raw_data/kalindi_goodreads_library_export (1).csv")
my_books = als._get_user_profile("/Users/krahmed96/code/KRA96/The_Book_Thrift/raw_data/goodreads_library_export.csv")

In [None]:
# Load the saved `unique_books` array (presumably the book ids known to the
# model) and draw 60 entries at random to act as a stand-in shelf.
# NOTE(review): np.random.choice samples with replacement by default, so the
# shelf may contain repeated ids, and no seed is set, so every run differs.
unique_books = np.load("/Users/krahmed96/code/KRA96/The_Book_Thrift/book_thrift_app/saved_arrays/unique_books.npy")
shelf = np.random.choice(unique_books, 60)


In [12]:
# Lookup table: book id -> its position in `unique_books`.
b_id_to_idx = dict(zip(unique_books, range(len(unique_books))))
# Translate every shelf entry into its position in `unique_books`.
shelf_to_idx = pd.Series(shelf).map(b_id_to_idx)

In [16]:
# Try a recommendation for my own profile, passing the shelf's index positions
# as `items` (presumably restricts candidates to the shelf — confirm against
# ALSRecommender.recommend_books).
recs = als.recommend_books(my_books, items=shelf_to_idx)

In [18]:
# Same shelf, second reader's profile; the result is not stored, only echoed
# as the cell output.
als.recommend_books(kalindi_books, items=shelf_to_idx)

[{'Recommendations': 'Life of Pi'},
 {'Recommendations': 'The Battle for History: Re-fighting World War II'},
 {'Recommendations': 'The Return Of Sherlock Holmes (Sherlock Holmes #6)'},
 {'Recommendations': 'The King of Torts'},
 {'Recommendations': 'The Master Mind of Mars (Barsoom, #6)'},
 {'Recommendations': 'Why Should Anyone Be Led by You?: What It Takes To Be An Authentic Leader'},
 {'Recommendations': 'The Old Man and the Sea'},
 {'Recommendations': 'Welcome to the Monkey House'},
 {'Recommendations': 'Trainspotting'},
 {'Recommendations': 'Re-Visioning Psychology'}]

In [17]:
# Render my recommendations, followed by the shelf's index positions.
display(recs, shelf_to_idx)

[{'Recommendations': 'Life of Pi'},
 {'Recommendations': 'VALIS (VALIS Trilogy, #1)'},
 {'Recommendations': 'Tales of the Grotesque and Arabesque Volume 1 (Tales of the Grotesque and Arabesque, #1)'},
 {'Recommendations': 'A House Divided (House of Earth, #3)'},
 {'Recommendations': 'The Battle for History: Re-fighting World War II'},
 {'Recommendations': 'The King of Torts'},
 {'Recommendations': 'The Master Mind of Mars (Barsoom, #6)'},
 {'Recommendations': 'The Old Man and the Sea'},
 {'Recommendations': 'Trainspotting'},
 {'Recommendations': 'Re-Visioning Psychology'}]

0      13785
1      23633
2     114418
3      27146
4       6226
5     129316
6      78078
7      44850
8      13223
9      58664
10      5044
11     30632
12    111509
13     87065
14     76423
15     78579
16     84406
17     61907
18     70715
19     56238
20    101239
21     83582
22    130993
23     12271
24     40103
25     89616
26    127067
27     26184
28     78353
29      1491
30    104776
31     42167
32     58459
33     90817
34       352
35     37661
36     89516
37     65869
38      1878
39     58581
40     82663
41     67520
42    128359
43     42082
44     14122
45     60781
46     94542
47     78865
48      9577
49     57488
50     91878
51     61468
52      4027
53     35391
54     88950
55     61312
56    121537
57     18567
58     77032
59    103433
dtype: int64

In [None]:
def get_user_and_shelf_indices(user_books, available_books):
    """
    Locate the user's books and the available books inside ``book_ids``.

    ``book_ids`` is not a parameter: it is read from the enclosing
    (notebook/global) scope and must be defined before this is called.

    Args:
        user_books: book ids belonging to the user.
        available_books: book ids that are available.

    Returns:
        A pair ``(user_books_index, available_books_index)``. Each element is
        the tuple of index arrays marking where ``book_ids`` holds one of the
        given ids.
    """
    user_hits = np.isin(book_ids, user_books)
    shelf_hits = np.isin(book_ids, available_books)
    return np.nonzero(user_hits), np.nonzero(shelf_hits)

In [None]:
def get_content_similarity_rank(user_books, available_books):
    """
    Look up where the user's books and the available books sit in ``book_ids``.

    NOTE(review): despite the name, no similarity scoring or ranking happens
    here; the body only performs the index lookup. ``book_ids`` is read from
    the enclosing (notebook/global) scope rather than passed in.

    Args:
        user_books: book ids the user has read.
        available_books: book ids that are available.

    Returns:
        A pair ``(user_books_index, available_books_index)``, each being the
        ``np.where`` result for the matching entries of ``book_ids``.
    """
    user_books_index, available_books_index = (
        np.where(np.isin(book_ids, wanted))
        for wanted in (user_books, available_books)
    )
    return (user_books_index, available_books_index)

In [None]:
# Load my Goodreads library export and take its "Book Id" column as a NumPy array.
# NOTE(review): this rebinds `my_books`, replacing the profile built by
# `als._get_user_profile` with the raw export DataFrame.
my_books = pd.read_csv("../raw_data/goodreads_library_export.csv")
user_books = my_books["Book Id"].to_numpy()
# Bare expression so the notebook echoes the array as the cell output.
user_books

In [None]:
# Generate pseudo available books: 80 entries drawn at random from `book_ids`.
# NOTE(review): `book_ids` is not defined in the cells shown here, so it must
# already exist in the session — verify.
# NOTE(review): np.random.choice draws with replacement by default, so ids may repeat.
available_books = np.random.choice(book_ids, 80)

In [None]:
# Look up where the user's books and the pseudo shelf sit inside `book_ids`.
# Each result is an `np.where` tuple of index arrays, not a flat array.
user_idx, available_book_idx = get_user_and_shelf_indices(user_books, available_books)

In [None]:
# Load the fitted model (presumably the TF-IDF vectoriser fitted on 500k books,
# judging by the file name — confirm).
# NOTE(review): joblib.load unpickles the file and can therefore execute
# arbitrary code; only load model files from a trusted source. The path is
# absolute and machine-specific.
tfid_fitted = joblib.load("/Users/krahmed96/code/KRA96/The_Book_Thrift/book_thrift_app/models/500k_fitted.pkl")

In [None]:
# Stream the gzipped JSON-lines books dump and keep only the rows whose
# book_id belongs to the user's books or to the pseudo shelf. Streaming avoids
# loading the whole dataset into memory.
target_book_ids = np.concatenate([user_books, available_books], axis=0)
# A set gives constant-time membership tests; `x in ndarray` rescans the whole
# array for every line of the file.
target_id_set = set(target_book_ids.tolist())
row_matches = []
with gzip.open("/Users/krahmed96/code/KRA96/The_Book_Thrift/raw_data/goodreads_books.json.gz",
               mode="rt",
               encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # skip blank lines
            continue
        row = json.loads(line)
        try:
            b_id = int(row.get("book_id"))
        except (TypeError, ValueError):
            # Missing or non-numeric book_id: skip the row. Catching only these
            # two keeps Ctrl-C (KeyboardInterrupt) working during the long scan.
            continue
        if b_id in target_id_set:
            row_matches.append(row)


In [None]:
# Collect the matched rows into a frame, cast book_id to int64 so it compares
# cleanly with the id arrays, and renumber the rows from zero.
df = (
    pd.DataFrame(row_matches)
    .astype({"book_id": "int64"})
    .reset_index(drop=True)
)

# Row labels of the user's books within df
user_idx = df.index[df["book_id"].isin(user_books)]
print(f"user index is \n{user_idx}")

# Row labels of the shelf books within df
available_idx = df.index[df["book_id"].isin(available_books)]
print(f"\nshelf index is \n{available_idx}")

In [None]:
# Score every shelf book against the user's reading history with the saved
# model, then keep the N best-scoring shelf books.

# Row masks over df, computed once and reused below.
user_mask = df["book_id"].isin(user_books)
shelf_mask = df["book_id"].isin(available_books)

# vectorise using saved model
# NOTE(review): whole DataFrame slices are handed to `transform`. That is only
# right if the saved model selects its own text column (e.g. a pipeline); a
# bare TfidfVectorizer iterates a DataFrame by column name, not by row — confirm.
user_mat = tfid_fitted.transform(df[user_mask])
shelf_mat = tfid_fitted.transform(df[shelf_mask])

# normalise each row to unit length so dot products are cosine similarities
user_mat_norm = normalize(user_mat, norm="l2", axis=1)    # each user book
shelf_mat_norm = normalize(shelf_mat, norm="l2", axis=1)

# user profile as mean of vecs
user_profile = user_mat_norm.mean(axis=0)

# normalise mean profile (flatten first: the mean may come back as a 1 x n matrix)
user_profile = np.asarray(user_profile).ravel()
norm = np.linalg.norm(user_profile)
if norm > 0:
    user_profile = user_profile / norm

# Calculate similarity scores
scores = shelf_mat_norm.dot(user_profile)                 # shape (n_shelf_books, )
scores = np.asarray(scores).ravel()

# Get top N recs idx. Clamp N to the number of scored books: np.argpartition
# raises when asked for more elements than the array holds.
N = 10
top_n = min(N, scores.size)
if top_n > 0:
    top_idx = np.argpartition(scores, -top_n)[-top_n:]
    # order the selected entries from best to worst score
    top_idx = top_idx[np.argsort(scores[top_idx])[::-1]]
else:
    top_idx = np.array([], dtype=np.intp)

# Map score positions back to book ids (same row order as shelf_mat)
shelf_df = df[shelf_mask]
shelf_ids = shelf_df["book_id"].to_numpy()
top_recs = shelf_ids[top_idx]
top_recs


In [None]:
# Show title and id for the recommended books (rows come back in df order,
# not in ranking order).
df.loc[df["book_id"].isin(top_recs), ["title", "book_id"]]

## Collate the code above into one cell so ChatGPT can turn it into a pipeline

In [None]:
# End-to-end content-based recommendation in a single cell:
#   1. load the user's Goodreads export,
#   2. load the fitted model,
#   3. stream the books dump for the user's and the shelf's rows,
#   4. vectorise, build a mean user profile, score the shelf,
#   5. keep the titles of the top-N shelf books.
# NOTE(review): `available_books` is not created here; it must already exist
# in the session.

# load my books
my_books = pd.read_csv("../raw_data/goodreads_library_export.csv")
user_books = my_books["Book Id"].to_numpy()

# load fitted model
# NOTE(review): joblib.load unpickles the file — only load trusted model files.
tfid_fitted = joblib.load("/Users/krahmed96/code/KRA96/The_Book_Thrift/book_thrift_app/models/500k_fitted.pkl")

# Try find right rows in books dataset by streaming
target_book_ids = np.concatenate([user_books, available_books], axis=0)
# A set gives constant-time membership tests; `x in ndarray` rescans the whole
# array for every line of the file.
target_id_set = set(target_book_ids.tolist())
row_matches = []
with gzip.open("/Users/krahmed96/code/KRA96/The_Book_Thrift/raw_data/goodreads_books.json.gz",
               mode="rt",
               encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        row = json.loads(line)
        try:
            b_id = int(row.get("book_id"))
        except (TypeError, ValueError):
            # Missing or non-numeric book_id: skip the row. Catching only these
            # two keeps Ctrl-C (KeyboardInterrupt) working during the long scan.
            continue
        if b_id in target_id_set:
            row_matches.append(row)

df = pd.DataFrame(row_matches)

# change book id dtype
df["book_id"] = df["book_id"].astype("int64")
df = df.reset_index(drop=True)

# Row masks over df, computed once and reused below.
user_mask = df["book_id"].isin(user_books)
shelf_mask = df["book_id"].isin(available_books)

# vectorise using saved model
# NOTE(review): whole DataFrame slices are handed to `transform`. That is only
# right if the saved model selects its own text column (e.g. a pipeline); a
# bare TfidfVectorizer iterates a DataFrame by column name, not by row — confirm.
user_mat = tfid_fitted.transform(df[user_mask])
shelf_mat = tfid_fitted.transform(df[shelf_mask])

# normalise each row to unit length so dot products are cosine similarities
user_mat_norm = normalize(user_mat, norm="l2", axis=1)    # each user book
shelf_mat_norm = normalize(shelf_mat, norm="l2", axis=1)

# user profile as mean of vecs
user_profile = user_mat_norm.mean(axis=0)

# normalise mean profile (flatten first: the mean may come back as a 1 x n matrix)
user_profile = np.asarray(user_profile).ravel()
norm = np.linalg.norm(user_profile)
if norm > 0:
    user_profile = user_profile / norm

# Calculate similarity scores
scores = shelf_mat_norm.dot(user_profile)                 # shape (n_shelf_books, )
scores = np.asarray(scores).ravel()

# Get top N recs idx. Clamp N to the number of scored books: np.argpartition
# raises when asked for more elements than the array holds.
N = 10
top_n = min(N, scores.size)
if top_n > 0:
    top_idx = np.argpartition(scores, -top_n)[-top_n:]
    # order the selected entries from best to worst score
    top_idx = top_idx[np.argsort(scores[top_idx])[::-1]]
else:
    top_idx = np.array([], dtype=np.intp)

# Map score positions back to book ids (same row order as shelf_mat)
shelf_df = df[shelf_mask]
shelf_ids = shelf_df["book_id"].to_numpy()
top_recs = shelf_ids[top_idx]

# NOTE(review): the titles come back in df row order, not in ranking order.
recommended_books = df["title"][df["book_id"].isin(top_recs)]