# Collaborative filtering based on interactions dataset 

In [1]:
# Imports
%matplotlib inline
import pandas as pd
import numpy as np
import time
import gzip
import matplotlib.pyplot as plt

In [6]:
# Restore the pandas display options below to their default values.
# NOTE(review): reset_option reverts an option to its default; it does not make
# pandas show every column. If "display all columns" is the goal, set_option
# would be needed instead — confirm the intent.
pd.reset_option('display.max_columns')
pd.reset_option("display.max_seq_items")
pd.reset_option("display.max_colwidth")

In [56]:
# Import user interactions
# With chunksize set, read_csv returns an iterator that yields DataFrames of up
# to 50_000 rows each instead of loading the whole file at once.
interaction_chunks = pd.read_csv("../raw_data/goodreads_interactions.csv", chunksize=50_000)

In [59]:
!wc -l ../raw_data/goodreads_interactions.csv

 228648343 ../raw_data/goodreads_interactions.csv


In [70]:
# Summary of how much of the interactions file has been explored so far.
# NOTE(review): every figure below is a literal typed into the cell, not a value
# read from live variables, so it will not update if the data or the number of
# processed chunks changes.
print(f"Total rows in interaction: 228648343",
      f"Interactions dataset explored: {200000*7} rows",
      f"unique books explored: 374975",
      f"Books from interactions found in book titles: 121782",
      f"As percent: {(121782/374975) * 100}",
      sep="\n")

Total rows in interaction: 228648343
Interactions dataset explored: 1400000 rows
unique books explored: 374975
Books from interactions found in book titles: 121782
As percent: 32.47736515767718


## Define functions to downcast data and get score

In [2]:
# Downcasting columns
def downcast(df):
    """Shrink the integer columns of an interactions frame, in place.

    ``is_read``, ``rating`` and ``is_reviewed`` are cast to ``int8``;
    ``user_id`` and ``book_id`` are cast to ``int32``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five columns named above.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the narrower dtypes.
    """
    target_dtypes = {
        "is_read": "int8",
        "rating": "int8",
        "is_reviewed": "int8",
        "user_id": "int32",
        "book_id": "int32",
    }
    for column, dtype in target_dtypes.items():
        df[column] = df[column].astype(dtype)
    return df

# get score
def get_score(interactions_df): #TODO: think of how to include rating in score
    """Combine the interaction signals into a single score.

    The score is ``is_read + rating + 2 * is_reviewed``. ``interactions_df``
    only needs to support ``["is_read"]``, ``["rating"]`` and
    ``["is_reviewed"]`` lookups, so a DataFrame gives one score per row and a
    single row (Series) gives one scalar score.
    """
    read_part = interactions_df["is_read"]
    rating_part = interactions_df["rating"]
    review_part = interactions_df["is_reviewed"] * 2
    return read_part + rating_part + review_part

## Test functions on one chunk

In [57]:
# See the first chunk to get a sense of data
first_chunk = next(interaction_chunks)
print("Samples")
display(first_chunk.sample(3))

print("Info")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would also render a stray "None".
first_chunk.info()

# Processing interactions data
# For each candidate integer width, check column by column whether casting to
# that width and comparing with the original values loses anything.
# Note: a "yes" only describes this one chunk; later chunks can hold larger
# ids, which is why downcast() uses int32 rather than int8 for the id columns.
print("Do int32 values retain full info? Let's check:", "\n")
for col in first_chunk.columns:
    test = (first_chunk[col].astype("int32") == first_chunk[col]).all()
    print(f"{col}: {'yes' if test else 'no'}")

print("\nWhat about int8?\n")
for col in first_chunk.columns:
    test = (first_chunk[col].astype("int8") == first_chunk[col]).all()
    print(f"{col}: {'yes' if test else 'no'}")

print("\nMemory usage after downcasting")
# downcast() modifies first_chunk in place, so no re-assignment is needed.
downcast(first_chunk)
first_chunk.info()

Samples


Unnamed: 0,user_id,book_id,is_read,rating,is_reviewed
12171,14,10205,1,4,1
39148,91,26113,1,4,0
43369,104,28389,1,2,1


Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   user_id      50000 non-null  int64
 1   book_id      50000 non-null  int64
 2   is_read      50000 non-null  int64
 3   rating       50000 non-null  int64
 4   is_reviewed  50000 non-null  int64
dtypes: int64(5)
memory usage: 1.9 MB


None

Do int32 values retain full info? Let's check: 

user_id: yes
book_id: yes
is_read: yes
rating: yes
is_reviewed: yes

What about int8?

user_id: yes
book_id: no
is_read: yes
rating: yes
is_reviewed: yes

Memory usage after downcasting
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   user_id      50000 non-null  int32
 1   book_id      50000 non-null  int32
 2   is_read      50000 non-null  int8 
 3   rating       50000 non-null  int8 
 4   is_reviewed  50000 non-null  int8 
dtypes: int32(2), int8(3)
memory usage: 537.2 KB


None

In [None]:
# See distribution for is_read, rating, and is_reviewed
# NOTE: the sample(5) result below is not rendered, because only the last
# expression of a cell is displayed automatically.
first_chunk.sample(5)
# One histogram per column (subplots=True), with x ticks at the integers 0-5.
first_chunk[["is_read", "rating", "is_reviewed"]].plot(kind="hist", subplots=True, xticks=[0, 1, 2, 3, 4, 5])


In [None]:
# get score
# Adds a "score" column to first_chunk (the frame is modified in place), then
# previews 3 random rows to eyeball the result.
first_chunk["score"] = get_score(first_chunk)
first_chunk.sample(3)

## Process in chunks to get CSR matrix of scores for 1_400_000 user-item pairs

In [None]:

# Build stable ID maps... chatgpt recommended
# First pass over the whole interactions file: collect every distinct user id
# and book id, then assign each one a 0-based position.
user_ids = set()
book_ids = set()

start = time.time()
for chunk in pd.read_csv("../raw_data/goodreads_interactions.csv", chunksize=200_000):
    chunk = downcast(chunk)

    user_ids.update(chunk["user_id"].unique().tolist())
    book_ids.update(chunk["book_id"].unique().tolist())

# Stable 0-based mappings...
# Sort before enumerating: iteration order of a set is an implementation
# detail, so list(set) does not guarantee the same id -> index assignment
# between runs. Sorting makes the mapping reproducible.
user_ids = sorted(user_ids)
book_ids = sorted(book_ids)

user_map = {u: i for i, u in enumerate(user_ids)}
book_map = {b: i for i, b in enumerate(book_ids)}

print(f"No. of users: {len(user_map)}", f"No. of books: {len(book_map)}", sep="\n")
end = time.time()
duration = end - start
print(f"Time taken: {duration}")


In [5]:
# Getting arrays for users, items, and scores
from IPython.display import clear_output

# Number of 200_000-row chunks to read (7 chunks = 1_400_000 interactions).
N_CHUNKS = 7

user_list = []
book_list = []
score_list = []

start = time.time()
for chunk_no, chunk in enumerate(pd.read_csv("../raw_data/goodreads_interactions.csv", chunksize=200_000)):
    chunk = downcast(chunk)
    scores = get_score(chunk)

    # Map to 0-based index
    u_idx = chunk["user_id"]    # Trying without mapping .map(user_map)
    b_idx = chunk["book_id"]    # .map(book_map)

    # drop NaN
    # With the .map() calls disabled the ids are plain int32 columns, so this
    # mask keeps every row; it only matters again if the mapping is re-enabled
    # and some ids are missing from the maps.
    mask = u_idx.notna() & b_idx.notna()
    u_idx = u_idx[mask]
    b_idx = b_idx[mask]
    scores = scores[mask]

    user_list.append(u_idx.to_numpy())
    book_list.append(b_idx.to_numpy())
    score_list.append(scores.to_numpy())

    # Report progress after the chunk is actually done; chunk_no is 0-based,
    # so the number of finished chunks is chunk_no + 1.
    clear_output(wait=True)
    print(f"Processed {chunk_no + 1} chunks")

    # Stop once N_CHUNKS chunks (indices 0 .. N_CHUNKS - 1) have been processed.
    if chunk_no + 1 >= N_CHUNKS:
        break
end = time.time()
# Make 1D arrays:
user_idx = np.concatenate(user_list)
book_idx = np.concatenate(book_list)
scores   = np.concatenate(score_list)

duration = end - start
print(f"Time taken: {duration}")

Processed 6 chunks
Time taken: 0.24044299125671387


In [50]:
# Number of distinct book ids among the extracted interactions.
# np.unique is enough here: the per-value counts are not needed, and unlike
# np.unique_counts it is also available on NumPy releases older than 2.0.
no_book_extracted = np.unique(book_idx).size
no_book_extracted

374975

In [8]:
# Create sparse matrix of scores. Rows are users and columns are books
# The raw user_id / book_id values are used directly as row / column positions
# (the 0-based id mapping is disabled in the array-building cell), and no shape
# is passed, so the dimensions are inferred from the ids.
# NOTE(review): scipy sums the values of repeated (row, col) pairs when building
# from this triplet form — confirm each user-book pair occurs only once.
from scipy.sparse import csr_matrix

matrix = csr_matrix(
    (scores, (user_idx, book_idx)),
    dtype="float32"
)

In [None]:
# not all user-book pairs exist in the chunk. Check chunk by loc
# Index is between 1200000 and 1400000
# NOTE: `chunk` here is whatever DataFrame the chunk-reading loop left behind
# (its last processed chunk), so this cell depends on that loop having run.

i_loc = 1400000 - 2008
# Selecting with a list (["user_id"]) yields a one-element array, which is why
# the ids print inside brackets below.
user = chunk.loc[i_loc, ["user_id"]].values
book = chunk.loc[i_loc, ["book_id"]].values
print(f"User no {user} for book no {book}")
display(chunk.loc[i_loc])

print(f"Score for user {user} for book no {book} is:")
# get_score also works on a single row: each column lookup returns a scalar.
display(get_score(chunk.loc[i_loc]))

print("Checking whether matrix score corresponds to this...")
matrix[user, book]


# Test a first model using Implicit library

In [9]:
from implicit.als import AlternatingLeastSquares

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Leave-one-out split: for every user with at least one stored interaction,
# hold out one randomly chosen item for testing and remove it from training.

# Seeded generator so the same items are held out on every run.
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)

user_items_full = matrix.tocsr()
n_users, n_items = user_items_full.shape

# build train as a copy
# The copy shares the exact indptr / indices layout of the full matrix, so a
# position in one matrix's arrays refers to the same (user, item) in the other.
user_items_train = user_items_full.copy().tocsr()

# for each user, hold out 1 interacted item (if they have any)
test_items = {}  # user_id -> set of held-out item_ids

indptr = user_items_full.indptr
for u in range(n_users):
    # Stored entries of row u live at positions [row_start, row_end).
    row_start, row_end = indptr[u], indptr[u + 1]

    if row_start == row_end:
        continue

    # Pick one stored position of this row uniformly at random.
    pos = rng.integers(row_start, row_end)
    test_items[u] = {int(user_items_full.indices[pos])}

    # remove held-out interactions from the train matrix
    # Writing to .data at the known position avoids a sparse __setitem__ lookup
    # per user; the zero is physically dropped by eliminate_zeros() below.
    user_items_train.data[pos] = 0

user_items_train.eliminate_zeros()

In [11]:
# Instantiate model
# ALS with 128 latent factors, regularization 0.1 and 30 iterations.
model = AlternatingLeastSquares(
    factors = 128,
    regularization=0.1,
    iterations=30
)
# Fit on the training matrix only, i.e. the copy with the held-out items removed.
# NOTE(review): the matrix is users x books — confirm this is the orientation
# the installed implicit version expects in fit().
model.fit(user_items_train)

100%|██████████| 30/30 [01:12<00:00,  2.41s/it]


In [None]:
# Debugging error below
# print("mat shape:", matrix.shape[0])                     # should be (num_users, num_items)
# print("user_factors shape:", model.user_factors.shape)  # (num_users, n_factors)
# NOTE: despite the "Shape of" label, only shape[0] (the number of rows of each
# factor matrix) is printed, not the full shape.
print(f"Shape of model.user_factors : {model.user_factors.shape[0]}")
print(f"Shape of model.item_factors: {model.item_factors.shape[0]}")

In [32]:
# Get some recommendations
user = 500
# recommend() is asked for N=10 items; [0] takes the first element of its
# result (the item ids) and [:3] keeps only the first three of them.
# NOTE(review): the user's row comes from the full `matrix`, not from
# user_items_train — confirm that is intended for this spot check.
top_3_recs = model.recommend(userid=user, user_items=matrix[user], N=10)[0][:3]
top_3_recs

array([1494, 1210, 1211], dtype=int32)

In [None]:
# Match using book titles dataset
book_titles = pd.read_csv("../raw_data/book_titles.csv")

In [55]:
# display(book_titles[book_titles["book_id"].isin(top_3_recs)])
book_titles["book_id"] = book_titles["book_id"].astype("int32")

# Compute the coverage figures from the data instead of hard-coding them, so
# the message stays correct when a different number of chunks is processed.
extracted_book_ids = np.unique(book_idx)
n_books_extracted = extracted_book_ids.size
# Count distinct matching ids so repeated titles rows cannot inflate the ratio.
n_books_found = book_titles.loc[
    book_titles["book_id"].isin(extracted_book_ids), "book_id"
].nunique()
print(f"Out of {n_books_extracted} books extracted from interactions, only \n{n_books_found} could be found in book titles df. {n_books_found/n_books_extracted * 100}%")

Out of 374975 books extracted from interactions, only 
121782 could be found in book titles df. 32.47736515767718%


In [None]:
# Check user and the books they've rated
interactions = pd.read_csv("../raw_data/goodreads_interactions.csv",
                           nrows=100000)

In [None]:
interactions[(interactions["user_id"] == user) &
             (interactions["is_read"] == 1)]

## Creating test set to test recommendations using chatGPT

In [None]:

# NOTE(review): this cell repeats the leave-one-out split from the "Test a first
# model" section, but starts from `user_items`, a name that is not defined
# anywhere in the visible notebook — confirm where it should come from.
# NOTE(review): running this after the model has been fitted draws a new random
# hold-out and rebinds user_items_train / test_items, so the held-out items
# would no longer be the ones excluded from the data the model was trained on.
user_items_full = user_items.tocsr()
n_users, n_items = user_items_full.shape

# build train as a copy
user_items_train = user_items_full.copy().tocsr()

# for each user, hold out 1 interacted item (if they have any)
test_items = {}  # user_id -> set of held-out item_ids

for u in range(n_users):
    # Column indices stored for row u sit at positions [start, end) of .indices.
    start, end = user_items_full.indptr[u], user_items_full.indptr[u+1]
    items = user_items_full.indices[start:end]

    if len(items) == 0:
        continue

    # No seed is set, so a different item is held out on every run.
    held_out = np.random.choice(items, size=1, replace=False)
    test_items[u] = set(held_out)

    # remove held-out interactions from the train matrix
    for i in held_out:
        user_items_train[u, i] = 0

# Drop the entries that were just set to 0 from the sparse structure.
user_items_train.eliminate_zeros()


In [None]:
# Recall@K over the held-out items: the share of held-out (user, item) pairs
# that show up in that user's top-K recommendations.
K = 10
hits = 0
total = 0

for u, positives in test_items.items():
    if positives:
        # Recommendations are generated from the TRAIN row of this user.
        rec_ids, _ = model.recommend(u, user_items_train[u], N=K)
        rec_set = set(rec_ids)

        # Held-out items that made it into the top-K.
        matched = positives & rec_set
        hits += len(matched)
        total += len(positives)

# Guard against an empty test set.
if total > 0:
    recall_at_k = hits / total
else:
    recall_at_k = 0.0
print("Recall@{} = {:.4f}".format(K, recall_at_k))


# Books dataset exploration
All columns are loaded with the `object` dtype, so the numeric ones need to be converted. <br>
We may keep only the books whose language code is `eng` or `''` (blank codes appear to be English too). <br>
We may want to create individual columns for popular shelves (such as a 'to-read' count). <br>

In [None]:
# Load only the first 50_000 records of the gzip-compressed, line-delimited
# JSON books file (lines=True is what allows nrows to be used).
books_50k = pd.read_json("../raw_data/goodreads_books.json.gz",
                         lines=True,
                         compression="gzip",
                         nrows=50_000)

In [None]:
books_50k.columns

In [None]:
pd.reset_option('display.max_columns')
pd.reset_option("display.max_seq_items")
pd.reset_option("display.max_colwidth")

# Columns retained for the exploration below.
# country_code, similar_books and isbn are kept as well because later cells of
# this notebook read them from books_50k; dropping them here makes those cells
# fail with a KeyError on a top-to-bottom run.
cols_to_keep = [
    "text_reviews_count",
    "language_code",
    "popular_shelves",
    "average_rating",
    "description",
    "authors",
    "num_pages",
    "publication_year",
    "ratings_count",
    "book_id",
    "work_id",
    "country_code",
    "similar_books",
    "isbn"
]
# .copy() makes the subset an independent frame, so the column assignments in
# later cells do not act on a view of the loaded data.
books_50k = books_50k[cols_to_keep].copy()
books_50k.sample(2)

In [None]:
# Genres
# NOTE(review): `genres` is not created anywhere in the visible notebook —
# confirm which cell or file it is loaded from.
print("Genres Dataset Columns")
print(genres.columns, "\n")

print("Size of dataset", genres.shape[0])
print("Dataset dtypes", "\n", genres.dtypes)

# Null values
# Map empty dicts to null
# Values that have no length at all (for example the pd.NA written by a previous
# run of this cell) are mapped to null too, so re-running the cell no longer
# fails with a TypeError from len().
genres["genres"] = genres["genres"].map(
    lambda s: s if hasattr(s, "__len__") and len(s) > 0 else pd.NA
)
print("\n", "Null values in genres")
print(genres.isnull().sum())
genres.sample(3)


In [None]:
# Get top 3 genres for every book
def get_top_genres(genre_dict, n=3):
    """Return the names of the ``n`` highest-count genres of one book.

    Parameters
    ----------
    genre_dict : dict
        Mapping of genre name -> count. Anything that is not a dict (for
        example a missing value) is treated as "no genres".
    n : int, default 3
        Maximum number of genre names to return.

    Returns
    -------
    list
        Up to ``n`` genre names ordered from highest to lowest count; genres
        with equal counts keep their order in ``genre_dict``. Empty when there
        are no genres.
    """
    if not isinstance(genre_dict, dict):
        return []
    # sorted() is stable, so ties keep the dict's own ordering.
    return sorted(genre_dict, key=genre_dict.get, reverse=True)[:n]
get_top_genres({'fantasy': 10, 'fiction': 12})

In [None]:
print("Books Dataset")
print(books_50k.dtypes)


In [None]:
# change dtypes
# Columns that hold numbers and should end up numeric.
num_cols = ['text_reviews_count',
            'average_rating',
            'num_pages',
            'ratings_count']

# num_pages has several blank values.
# Replace every empty or whitespace-only string in the numeric columns with
# pd.NA so that blanks are counted as missing values.
books_50k.loc[:, num_cols] = books_50k[num_cols].replace(r"^\s*$", pd.NA, regex=True)

# Missing-value count per column after the replacement.
books_50k.isna().sum()

In [None]:
# Convert dtypes
# Assign with books_50k[col] = ... rather than books_50k.loc[:, col] = ...:
# in pandas 2.x, setting a whole column through .loc writes the new values into
# the existing column and keeps its object dtype, so the conversion would have
# no visible effect on dtypes. Plain column assignment replaces the column and
# takes the numeric dtype of the converted values.
for col in num_cols:
    # errors='coerce' turns anything unparseable into NaN instead of raising.
    books_50k[col] = pd.to_numeric(books_50k[col], errors='coerce')

books_50k.dtypes

In [None]:
books_50k[num_cols].describe()

In [None]:
# books_50k[num_cols].plot(subplots=True, kind='hist', figsize=(8, 10))
# One box plot per numeric column to spot outliers.
books_50k[num_cols].plot(kind='box', subplots=True, figsize=(12, 6))
# Removing max from text_reviews_count and keeping num_pages below 2000
# NOTE(review): 38878 is presumably the maximum text_reviews_count seen in the
# describe() output — TODO confirm; being hard-coded, it will not follow the data.
# NOTE(review): rows where either column is missing should fail these
# comparisons and be dropped as well — confirm that is intended.
books_50k = books_50k[
    (books_50k['text_reviews_count'] < 38878.000000) &
    (books_50k['num_pages'] < 2000)
]

In [None]:
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(10, 6))
ax1.hist(books_50k['text_reviews_count'], bins=50, log=True)
ax2.hist(books_50k['average_rating'])
ax3.hist(books_50k['num_pages'], bins=50)
ax4.hist(books_50k['ratings_count'], bins=20, log=True)

In [None]:
def extract_authors(li):
    """Return the ``author_id`` of every author entry in ``li``, in order.

    ``li`` is an iterable of dicts that each contain an ``'author_id'`` key;
    an empty iterable gives an empty list.
    """
    return [author_dict['author_id'] for author_dict in li]

# Store, for every book, the list of its author ids.
# NOTE(review): the column name "extractd_authors" looks like a typo for
# "extracted_authors"; left unchanged in case other code reads it by this name.
books_50k["extractd_authors"] = books_50k["authors"].apply(
    extract_authors
)

In [None]:
def extract_popular_shelves(li, popularity_threshold=500):
    """Keep only the shelves whose count exceeds ``popularity_threshold``.

    Parameters
    ----------
    li : iterable of dict
        Shelf entries, each with a ``"name"`` and a ``"count"`` key. The count
        may be a string; it is converted with ``int()`` for the comparison only.
    popularity_threshold : int, default 500
        A shelf is kept when its count is strictly greater than this value.

    Returns
    -------
    list of dict
        One ``{name: count}`` dict per kept shelf, in input order, with the
        count left exactly as it was stored in the input.
    """
    return [
        {d["name"]: d["count"]}
        for d in li
        if int(d["count"]) > popularity_threshold
    ]

popular_shelves = books_50k["popular_shelves"].apply(extract_popular_shelves)

In [None]:
from collections import Counter

# Tally, across all books, how often each shelf name appears among the popular
# shelves. Every shelf is a single-entry {name: count} dict, so its first key
# is the shelf name.
all_shelves = Counter(
    next(iter(shelf))
    for book_shelves in popular_shelves
    for shelf in book_shelves
)

all_shelves

In [None]:
# Examining categorical columns
cat_cols = ['country_code', 'language_code']
pd.DataFrame(books_50k[cat_cols].value_counts())


In [None]:
# check language column
# Show count for all languages
# Temporarily lift the row limit so the full value_counts table is rendered,
# then restore the default afterwards.
pd.set_option('display.max_rows', None)
display(books_50k['language_code'].value_counts().sort_values())
pd.reset_option('display.max_rows')

# keep only eng lang books
# The empty string is included on purpose: blank language codes are treated as English.
eng_lang = ['eng', 'en-US', 'en-GB', 'en-CA', '']
# NOTE: this rebinds books_50k to the filtered frame, so the non-English rows
# are no longer available to later cells.
books_50k = books_50k[books_50k['language_code'].isin(eng_lang)]


In [None]:
# Check similar books column
# NOTE: the results of the next two bare expressions are not rendered, because
# only the last expression of a cell is displayed automatically.
books_50k.similar_books.apply(lambda x: len(x)).sort_values()

# check num pages column
books_50k.num_pages.value_counts().sort_index()
# Quantiles of num_pages, up to the maximum (quantile 1).
display(books_50k['num_pages'].quantile([0.25, 0.5, 0.75, 0.9, 0.99, 1]))

# remove the massive book as outlier
# NOTE(review): nothing is removed here — the line below only displays
# books_50k. TODO: add the intended filter or drop this comment.
books_50k

In [None]:

# describe
books_50k[num_cols].describe()


In [None]:
# check max values for num cols
books_50k[books_50k['num_pages'] == 945077.000000]

In [None]:
# fig, ax = plt.subplots(2, 2, figsize=(10,6))
# ax[0, 0].hist(books_50k['num_pages'], bins=20)
# ax[0, 0].set_xlim(0, 2000)
num_cols
books_50k.num_pages.plot(kind='hist', xlim=(0, 2000))

In [None]:
books_50k['num_pages'].quantile([0.5, 0.9, 0.99, 0.999])

In [None]:
books_50k.info()

In [None]:
books_50k.country_code.unique()

In [None]:
# for dummy model, keep only some columns
# No num pages as it has many missing.
small_cols = ['isbn',
              'text_reviews_count',
              'average_rating',
              'publication_year',
              'ratings_count']

# Select and clean in one expression so books_50k_small is a fresh frame:
# dropna(inplace=True) on a selection of books_50k operates on a derived object
# and can trigger SettingWithCopyWarning. how='all' drops only the rows in
# which every selected column is missing.
books_50k_small = books_50k[small_cols].dropna(how='all')

In [None]:
books_50k_small