In [3]:
import json
import gzip
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# load the works data
with gzip.open('../raw_data/goodreads_book_works.json.gz', 'rt', encoding='utf-8') as f:
    works_data = [json.loads(line) for line in f]

works_df = pd.DataFrame(works_data)

# exploration
print(f"Shape: {works_df.shape}")
print(f"\nColumns: {works_df.columns.tolist()}")
print(f"\nData types:\n{works_df.dtypes}")
print(f"\nFirst few rows:")
works_df.head()

Shape: (1521962, 16)

Columns: ['books_count', 'reviews_count', 'original_publication_month', 'default_description_language_code', 'text_reviews_count', 'best_book_id', 'original_publication_year', 'original_title', 'rating_dist', 'default_chaptering_book_id', 'original_publication_day', 'original_language_id', 'ratings_count', 'media_type', 'ratings_sum', 'work_id']

Data types:
books_count                          object
reviews_count                        object
original_publication_month           object
default_description_language_code    object
text_reviews_count                   object
best_book_id                         object
original_publication_year            object
original_title                       object
rating_dist                          object
default_chaptering_book_id           object
original_publication_day             object
original_language_id                 object
ratings_count                        object
media_type                           object
r

Unnamed: 0,books_count,reviews_count,original_publication_month,default_description_language_code,text_reviews_count,best_book_id,original_publication_year,original_title,rating_dist,default_chaptering_book_id,original_publication_day,original_language_id,ratings_count,media_type,ratings_sum,work_id
0,1,6,8.0,,1,5333265,1984,W. C. Fields: A Life on Film,5:1|4:1|3:1|2:0|1:0|total:3,,,,3,book,12,5400751
1,22,10162,,,741,25717,2001,Good Harbor,5:517|4:1787|3:2763|2:966|1:196|total:6229,,,,6229,book,20150,1323437
2,2,268,,,7,7327624,1987,,5:49|4:58|3:26|2:5|1:3|total:141,,,,141,book,568,8948723
3,38,89252,7.0,,3504,6066819,2009,Best Friends Forever,5:9152|4:16855|3:19507|2:6210|1:1549|total:53273,,14.0,,53273,book,185670,6243154
4,2,49,,,5,287140,1990,Runic Astrology: Starcraft and Timekeeping in ...,5:6|4:1|3:3|2:3|1:2|total:15,,,,15,book,51,278577


In [5]:
# load interactions
df_large = pd.read_csv('../raw_data/goodreads_interactions.csv', nrows=500000)
interactions_df = df_large.sample(n=50000, random_state=42)

# load genres
with gzip.open('../raw_data/goodreads_book_genres_initial.json.gz', 'rt', encoding='utf-8') as f:
    genres_data = [json.loads(line) for line in f]
genres_df = pd.DataFrame(genres_data)

# get top 3 genres
def get_top_n_genres(genres_dict, n=3):
    sorted_genres = sorted(genres_dict.items(), key=lambda x: x[1], reverse=True)
    return [genre for genre, count in sorted_genres[:n]]

genres_df['top_3_genres'] = genres_df['genres'].apply(lambda x: get_top_n_genres(x, n=3))

# convert types and merge
interactions_df['book_id'] = interactions_df['book_id'].astype(str)
genres_df['book_id'] = genres_df['book_id'].astype(str)

merged_df = interactions_df.merge(
    genres_df[['book_id', 'top_3_genres']],
    on='book_id',
    how='left'
)

print(f"merged_df shape: {merged_df.shape}")
print(f"Books with genres: {merged_df['top_3_genres'].notna().sum()}")

merged_df shape: (50000, 6)
Books with genres: 21732


In [6]:
# check overlap
merged_book_ids = set(merged_df['book_id'].unique())
works_book_ids = set(works_df['best_book_id'].unique())

overlap = merged_book_ids & works_book_ids

print(f"Books in merged_df: {len(merged_book_ids):,}")
print(f"Books in works dataset: {len(works_book_ids):,}")
print(f"Matching books: {len(overlap):,}")
print(f"Coverage: {len(overlap)/len(merged_book_ids)*100:.2f}%")

Books in merged_df: 33,275
Books in works dataset: 1,521,962
Matching books: 10,764
Coverage: 32.35%


In [7]:
# make sure book_id types match
works_df['best_book_id'] = works_df['best_book_id'].astype(str)

# get book sets
books_with_genres = set(merged_df[merged_df['top_3_genres'].notna()]['book_id'].unique())
books_with_titles = set(works_df['best_book_id'].unique())

# calculate overlaps
books_with_both = books_with_genres & books_with_titles
books_with_either = books_with_genres | books_with_titles
books_only_genres = books_with_genres - books_with_titles
books_only_titles = books_with_titles - books_with_genres

print(f"Total unique books in interactions: {merged_df['book_id'].nunique():,}")
print(f"\nBooks with genres: {len(books_with_genres):,}")
print(f"Books with titles: {len(books_with_titles & set(merged_df['book_id'].unique())):,}")
print(f"Books with BOTH genres AND titles: {len(books_with_both):,}")
print(f"Books with ONLY genres (no title): {len(books_only_genres):,}")
print(f"Books with ONLY titles (no genre): {len(books_only_titles & set(merged_df['book_id'].unique())):,}")

# coverage percentages
total_books = merged_df['book_id'].nunique()
print(f"Genre coverage: {len(books_with_genres)/total_books*100:.2f}%")
print(f"Title coverage: {len(books_with_titles & set(merged_df['book_id'].unique()))/total_books*100:.2f}%")
print(f"Both genre + title: {len(books_with_both)/total_books*100:.2f}%")

Total unique books in interactions: 33,275

Books with genres: 13,701
Books with titles: 10,764
Books with BOTH genres AND titles: 10,394
Books with ONLY genres (no title): 3,307
Books with ONLY titles (no genre): 370
Genre coverage: 41.18%
Title coverage: 32.35%
Both genre + title: 31.24%


Summary 

- Books with BOTH genres AND titles: 10,394 

Next Steps: 
- Add original_title into merged_df 

Follow up: 
- Might incorporate other useful information if needed later 

In [8]:
# select only book_id and title from works_df
works_simple = works_df[['best_book_id', 'original_title']].copy()

# rename to match merged_df
works_simple.rename(columns={'best_book_id': 'book_id'}, inplace=True)

# ensure book_id types match
works_simple['book_id'] = works_simple['book_id'].astype(str)
merged_df['book_id'] = merged_df['book_id'].astype(str)

print("== BEFORE MERGE ==")
print(f"merged_df shape: {merged_df.shape}")
print(f"works_simple shape: {works_simple.shape}")

# merge
merged_title = merged_df.merge(
    works_simple,
    on='book_id',
    how='left'
)

print("\n== MERGE RESULTS ==")
print(f"After merge shape: {merged_title.shape}")
print(f"Columns: {merged_title.columns.tolist()}")
print(f"\nBooks with titles: {merged_title['original_title'].notna().sum()} ({merged_title['original_title'].notna().sum()/len(merged_title)*100:.2f}%)")

# preview
print("\n== SAMPLE WITH TITLES ==")
print(merged_title[merged_title['original_title'].notna()][
    ['user_id', 'book_id', 'original_title', 'rating', 'top_3_genres']
].head(10))

== BEFORE MERGE ==
merged_df shape: (50000, 6)
works_simple shape: (1521962, 2)

== MERGE RESULTS ==
After merge shape: (50000, 7)
Columns: ['user_id', 'book_id', 'is_read', 'rating', 'is_reviewed', 'top_3_genres', 'original_title']

Books with titles: 16696 (33.39%)

== SAMPLE WITH TITLES ==
    user_id book_id                                     original_title  \
0       244   60924  Seed to Harvest (Wild Seed / Mind of My Mind /...   
1       437   98786      Dwarf Hamsters (Complete Pet Owner's Manuals)   
3       309   42643                            The Crime of Punishment   
6       474     334                                   "A" Is for Zebra   
7       913   39850                                    Power to Change   
8         5    1954                A Tale of Two Cities (Cliffs Notes)   
9       797    7511                           The Beach House Cookbook   
10      982  176535       Intellivore (Star Trek: The Next Generation)   
14      240   46268                     

In [9]:
# filter for read books with ratings
read_with_ratings = merged_title[(merged_title['is_read'] == 1) & (merged_title['rating'] > 0)]

print("== READ BOOKS WITH RATINGS ==")
print(f"Total read books with ratings: {len(read_with_ratings)}")

# title coverage for read + rated books
with_titles = read_with_ratings[read_with_ratings['original_title'].notna()]
without_titles = read_with_ratings[read_with_ratings['original_title'].isna()]

print(f"\n== TITLE COVERAGE ==")
print(f"Read+Rated books WITH titles: {len(with_titles)} ({len(with_titles)/len(read_with_ratings)*100:.2f}%)")
print(f"Read+Rated books WITHOUT titles: {len(without_titles)} ({len(without_titles)/len(read_with_ratings)*100:.2f}%)")

# genre coverage for read + rated books
with_genres = read_with_ratings[read_with_ratings['top_3_genres'].notna()]
without_genres = read_with_ratings[read_with_ratings['top_3_genres'].isna()]

print(f"\n== GENRE COVERAGE ==")
print(f"Read+Rated books WITH genres: {len(with_genres)} ({len(with_genres)/len(read_with_ratings)*100:.2f}%)")
print(f"Read+Rated books WITHOUT genres: {len(without_genres)} ({len(without_genres)/len(read_with_ratings)*100:.2f}%)")

# both
with_both = read_with_ratings[
    (read_with_ratings['top_3_genres'].notna()) &
    (read_with_ratings['original_title'].notna())
]

print(f"\n== BOTH COVERAGE ==")
print(f"Read+Rated books WITH BOTH titles AND genres: {len(with_both)} ({len(with_both)/len(read_with_ratings)*100:.2f}%)")

# unique books breakdown
print(f"\n== UNIQUE BOOKS (READ + RATED) ==")
print(f"Total unique books: {read_with_ratings['book_id'].nunique()}")
print(f"Unique books with titles: {with_titles['book_id'].nunique()} ({with_titles['book_id'].nunique()/read_with_ratings['book_id'].nunique()*100:.2f}%)")
print(f"Unique books with genres: {with_genres['book_id'].nunique()} ({with_genres['book_id'].nunique()/read_with_ratings['book_id'].nunique()*100:.2f}%)")
print(f"Unique books with BOTH: {with_both['book_id'].nunique()} ({with_both['book_id'].nunique()/read_with_ratings['book_id'].nunique()*100:.2f}%)")

== READ BOOKS WITH RATINGS ==
Total read books with ratings: 23688

== TITLE COVERAGE ==
Read+Rated books WITH titles: 7922 (33.44%)
Read+Rated books WITHOUT titles: 15766 (66.56%)

== GENRE COVERAGE ==
Read+Rated books WITH genres: 10348 (43.68%)
Read+Rated books WITHOUT genres: 13340 (56.32%)

== BOTH COVERAGE ==
Read+Rated books WITH BOTH titles AND genres: 7661 (32.34%)

== UNIQUE BOOKS (READ + RATED) ==
Total unique books: 16417
Unique books with titles: 5370 (32.71%)
Unique books with genres: 6856 (41.76%)
Unique books with BOTH: 5182 (31.56%)


In [12]:
# content-based pool (with genres)
content_based_pool = merged_title[
    (merged_title['is_read'] == 1) &
    (merged_title['rating'] > 0) &
    (merged_title['top_3_genres'].notna())
]

# collaborative-only pool (NO genres)
collaborative_only_pool = merged_title[
    (merged_title['is_read'] == 1) &
    (merged_title['rating'] > 0) &
    (merged_title['top_3_genres'].isna())
]

print(f"Content-based pool: {len(content_based_pool)} interactions, {content_based_pool['book_id'].nunique()} unique books")
print(f"Collaborative-only pool: {len(collaborative_only_pool)} interactions, {collaborative_only_pool['book_id'].nunique()} unique books")

# available titles to display in each pool
print(f"\nOf content-based pool, {content_based_pool['original_title'].notna().sum()} have titles for display")
print(f"Of collaborative pool, {collaborative_only_pool['original_title'].notna().sum()} have titles for display")

Content-based pool: 10348 interactions, 6856 unique books
Collaborative-only pool: 13340 interactions, 9561 unique books

Of content-based pool, 7661 have titles for display
Of collaborative pool, 261 have titles for display


Start with Content-Based Filtering 

In [14]:
# filter to content-based pool (read + rated + has genres)
content_based_pool = merged_title[
    (merged_title['is_read'] == 1) &
    (merged_title['rating'] > 0) &
    (merged_title['top_3_genres'].notna())
].copy()

print(f"Content-based pool: {len(content_based_pool)} interactions")
print(f"Unique users: {content_based_pool['user_id'].nunique()}")
print(f"Unique books: {content_based_pool['book_id'].nunique()}")
print(f"Books with titles: {content_based_pool['original_title'].notna().sum()}")

# preview
print("\nSample data:")
print(content_based_pool[['book_id', 'original_title', 'rating', 'top_3_genres']].head())

Content-based pool: 10348 interactions
Unique users: 981
Unique books: 6856
Books with titles: 7661

Sample data:
   book_id                                     original_title  rating  \
0    60924  Seed to Harvest (Wild Seed / Mind of My Mind /...       5   
3    42643                            The Crime of Punishment       3   
10  176535       Intellivore (Star Trek: The Next Generation)       3   
13    1387                                                NaN       5   
14   46268                                                          5   

                                         top_3_genres  
0   [fiction, fantasy, paranormal, history, histor...  
3   [history, historical fiction, biography, myste...  
10                     [fiction, fantasy, paranormal]  
13  [fiction, poetry, history, historical fiction,...  
14                                           [poetry]  


In [15]:
# genre distribution
from collections import Counter

all_genres_in_pool = Counter()
for genres_list in content_based_pool['top_3_genres']:
    if isinstance(genres_list, list):
        for genre in genres_list:
            all_genres_in_pool[genre] += 1

print(f"\nTotal unique genres: {len(all_genres_in_pool)}")
print("\nGenre distribution:")
for genre, count in all_genres_in_pool.most_common(10):
    print(f"  {genre}: {count}")


Total unique genres: 10

Genre distribution:
  fiction: 6892
  history, historical fiction, biography: 4831
  non-fiction: 4237
  fantasy, paranormal: 2259
  mystery, thriller, crime: 1724
  romance: 1379
  children: 1114
  young-adult: 1093
  poetry: 859
  comics, graphic: 684


In [16]:
# book-genre matrix for similarity

# get unique books for the matrix
unique_books = content_based_pool.groupby('book_id').agg({
    'top_3_genres': 'first',  # take first occurrence of genres
    'original_title': 'first',  # take first occurrence of title
    'rating': 'mean'  # average rating for this book
}).reset_index()

print(f"Unique books in content-based pool: {len(unique_books)}")
print("\nSample:")
print(unique_books.head())

# create binary genre features (one-hot encoding style)
# for each book, mark which genres it has
all_genres = list(all_genres_in_pool.keys())
print(f"\nAll genres: {all_genres}")

# create genre matrix

genre_matrix = np.zeros((len(unique_books), len(all_genres)))

for idx, row in unique_books.iterrows():
    genres_list = row['top_3_genres']
    if isinstance(genres_list, list):
        for genre in genres_list:
            if genre in all_genres:
                genre_idx = all_genres.index(genre)
                genre_matrix[idx, genre_idx] = 1

print(f"\nGenre matrix shape: {genre_matrix.shape}")
print(f"(rows = books, columns = genres)")

# preview - show how many genres each book has
books_per_genre_count = genre_matrix.sum(axis=1)
print(f"\nAverage genres per book: {books_per_genre_count.mean():.2f}")

Unique books in content-based pool: 6856

Sample:
  book_id                                       top_3_genres  \
0     100                                      [non-fiction]   
1    1000  [non-fiction, history, historical fiction, bio...   
2   10002                                          [fiction]   
3   10003                              [fantasy, paranormal]   
4  100284                                      [non-fiction]   

                                      original_title    rating  
0                                                     4.000000  
1  Millionaire Women Next Door: The Many Journeys...  3.395349  
2                            Three Plays by Kobo Abe  4.500000  
3                                You Da  [Tomodachi]  5.000000  
4  How We Choose to Be Happy: The 9 Choices of Ex...  4.000000  

All genres: ['fiction', 'fantasy, paranormal', 'history, historical fiction, biography', 'mystery, thriller, crime', 'non-fiction', 'poetry', 'comics, graphic', 'children', '

In [17]:
# this creates a 6856 x 6856 matrix
# showing how similar each book is to every other book
book_similarity = cosine_similarity(genre_matrix)

print(f"Similarity matrix shape: {book_similarity.shape}")

# example: How similar are the first 5 books to each other?
print("\nSimilarity between first 5 books:")
print(book_similarity[:5, :5])
#print("\n1.0 = identical genres, 0.0 = no genres in common")

Similarity matrix shape: (6856, 6856)

Similarity between first 5 books:
[[1.         0.70710678 0.         0.         1.        ]
 [0.70710678 1.         0.         0.         0.70710678]
 [0.         0.         1.         0.         0.        ]
 [0.         0.         0.         1.         0.        ]
 [1.         0.70710678 0.         0.         1.        ]]


In [None]:
# recommend 1 book in the same genre with highest rating

def get_best_similar_book(book_id):
    # Find the index of this book in unique_books
    try:
        book_idx = unique_books[unique_books['book_id'] == book_id].index[0]
    except:
        return f"Book {book_id} not found in content-based pool"

    # get similarity scores for this book with all other books
    similarity_scores = book_similarity[book_idx]

    # get all similar books (similarity > 0, excluding itself)
    similar_mask = (similarity_scores > 0) & (similarity_scores < 1.0)  # exclude itself (1.0)

    if not similar_mask.any():
        return "No similar books found"

    # get similar books with their similarity scores
    similar_books = unique_books[similar_mask].copy()
    similar_books['similarity_score'] = similarity_scores[similar_mask]

    # sort by similarity first, rating second
    similar_books = similar_books.sort_values(['similarity_score','rating', ], ascending=[False, False])

    # Return the top 1
    top_book = similar_books.iloc[0]

    # handle missing title - fallback to book_id
    title = top_book['original_title'] if pd.notna(top_book['original_title']) else f"Book ID: {top_book['book_id']}"

    return {
        'book_id': top_book['book_id'],
        'title': title,
        'genres': top_book['top_3_genres'],
        'rating': top_book['rating'],
        'similarity': top_book['similarity_score']
    }

# testing
test_book_id = '42643'  # The Crime of Punishment

print(f"Input book: {test_book_id}")
input_book = unique_books[unique_books['book_id'] == test_book_id].iloc[0]
print(f"Title: {input_book['original_title']}")
print(f"Genres: {input_book['top_3_genres']}")
print(f"Rating: {input_book['rating']}")

print("\n== RECOMMENDATION ==")
recommendation = get_best_similar_book(test_book_id)
print(f"Book ID: {recommendation['book_id']}")
print(f"Title: {recommendation['title']}")
print(f"Genres: {recommendation['genres']}")
print(f"Rating: {recommendation['rating']}")
print(f"Similarity: {recommendation['similarity']:.2f}")

Input book: 42643
Title: The Crime of Punishment
Genres: ['history, historical fiction, biography', 'mystery, thriller, crime', 'non-fiction']
Rating: 3.75

== RECOMMENDATION ==
Book ID: 100827
Title: Options, Futures and Other Derivatives
Genres: ['non-fiction', 'mystery, thriller, crime']
Rating: 5.0
Similarity: 0.82


In [25]:
# Test with multiple books
test_books = ['60924', '176535', '10002', '1000']

for test_id in test_books:
    try:
        print("="*60)
        print(f"\nInput book: {test_id}")
        input_book = unique_books[unique_books['book_id'] == test_id].iloc[0]
        print(f"Title: {input_book['original_title']}")
        print(f"Genres: {input_book['top_3_genres']}")
        print(f"Rating: {input_book['rating']}")

        print("\n→ RECOMMENDATION:")
        rec = get_best_similar_book(test_id)
        print(f"  {rec['title']}")
        print(f"  Genres: {rec['genres']}")
        print(f"  Rating: {rec['rating']} | Similarity: {rec['similarity']:.2f}")
    except:
        print(f"Couldn't find recommendation for {test_id}")


Input book: 60924
Title: Seed to Harvest (Wild Seed / Mind of My Mind / Clay's Ark / Patternmaster)
Genres: ['fiction', 'fantasy, paranormal', 'history, historical fiction, biography']
Rating: 5.0

→ RECOMMENDATION:
  The Elric Saga Part 3 (The Elric Saga, part 3)
  Genres: ['fantasy, paranormal', 'fiction']
  Rating: 5.0 | Similarity: 0.82

Input book: 176535
Title: Intellivore (Star Trek: The Next Generation)
Genres: ['fiction', 'fantasy, paranormal']
Rating: 3.0

→ RECOMMENDATION:
  The Elric Saga Part 3 (The Elric Saga, part 3)
  Genres: ['fantasy, paranormal', 'fiction']
  Rating: 5.0 | Similarity: 1.00

Input book: 10002
Title: Three Plays by Kobo Abe
Genres: ['fiction']
Rating: 4.5

→ RECOMMENDATION:
  Wednesday's Child
  Genres: ['mystery, thriller, crime', 'fiction']
  Rating: 5.0 | Similarity: 0.71

Input book: 1000
Title: Millionaire Women Next Door: The Many Journeys of Successful American Businesswomen
Genres: ['non-fiction', 'history, historical fiction, biography']
Rati