In [2]:
import pandas as pd
import json
import gzip


In [3]:
# load interactions
df_large = pd.read_csv('../raw_data/goodreads_interactions.csv', nrows=500000)
interactions_df = df_large.sample(n=50000, random_state=42)

# load genres
with gzip.open('../raw_data/goodreads_book_genres_initial.json.gz', 'rt', encoding='utf-8') as f:
    genres_data = [json.loads(line) for line in f]
genres_df = pd.DataFrame(genres_data)

# get top 3 genres
def get_top_n_genres(genres_dict, n=3):
    sorted_genres = sorted(genres_dict.items(), key=lambda x: x[1], reverse=True)
    return [genre for genre, count in sorted_genres[:n]]

genres_df['top_3_genres'] = genres_df['genres'].apply(lambda x: get_top_n_genres(x, n=3))

In [4]:
# check types before merging
print("BEFORE CONVERSION")
print(f"Interactions book_id type: {interactions_df['book_id'].dtype}")
print(f"Genres book_id type: {genres_df['book_id'].dtype}")
print(f"\nSample interactions book_id: {interactions_df['book_id'].head()}")
print(f"Sample genres book_id: {genres_df['book_id'].head()}")

# both convert to string to match
interactions_df['book_id'] = interactions_df['book_id'].astype(str)
genres_df['book_id'] = genres_df['book_id'].astype(str)

print("\nAFTER CONVERSION")
print(f"Interactions book_id type: {interactions_df['book_id'].dtype}")
print(f"Genres book_id type: {genres_df['book_id'].dtype}")

# merge
merged_df = interactions_df.merge(
    genres_df[['book_id', 'top_3_genres']],
    on='book_id',
    how='left'
)

print(f"\nMERGE RESULTS")
print(f"Original interactions: {len(interactions_df)}")
print(f"After merge: {len(merged_df)}")
print(f"Rows with genres: {merged_df['top_3_genres'].notna().sum()}")
print(f"\nFirst few rows:")
print(merged_df.head())

BEFORE CONVERSION
Interactions book_id type: int64
Genres book_id type: object

Sample interactions book_id: 104241    60924
199676    98786
140199    75314
132814    42643
408697     1634
Name: book_id, dtype: int64
Sample genres book_id: 0    5333265
1    1333909
2    7327624
3    6066819
4     287140
Name: book_id, dtype: object

AFTER CONVERSION
Interactions book_id type: object
Genres book_id type: object

MERGE RESULTS
Original interactions: 50000
After merge: 50000
Rows with genres: 21732

First few rows:
   user_id book_id  is_read  rating  is_reviewed  \
0      244   60924        1       5            1   
1      437   98786        0       0            0   
2      319   75314        0       0            0   
3      309   42643        1       3            1   
4      840    1634        0       0            0   

                                        top_3_genres  
0  [fiction, fantasy, paranormal, history, histor...  
1                                      [non-fiction]  
2  [

In [5]:
# number of unique books in interactions have genres?
unique_books_with_genres = merged_df[merged_df['top_3_genres'].notna()]['book_id'].nunique()
total_unique_books = interactions_df['book_id'].nunique()

print(f"UNIQUE BOOK COVERAGE")
print(f"Total unique books in interactions: {total_unique_books}")
print(f"Unique books with genres: {unique_books_with_genres}")
print(f"Coverage: {unique_books_with_genres/total_unique_books*100:.2f}%")

# Check if the missing genres are from the full genres dataset or not
interactions_book_ids = set(interactions_df['book_id'].unique())
genres_book_ids = set(genres_df['book_id'].unique())

missing_in_genres = interactions_book_ids - genres_book_ids
print(f"\nMISSING BOOKS")
print(f"Books in interactions but NOT in genres dataset: {len(missing_in_genres)}")
print(f"Sample missing book_ids: {list(missing_in_genres)[:10]}")

# check books that didn't match
no_genres_df = merged_df[merged_df['top_3_genres'].isna()]
print(f"\nBOOKS WITHOUT GENRES ")
print(f"Total interactions without genres: {len(no_genres_df)}")
print(f"Unique books without genres: {no_genres_df['book_id'].nunique()}")
print(f"Sample book_ids: {no_genres_df['book_id'].unique()[:10]}")

UNIQUE BOOK COVERAGE
Total unique books in interactions: 33275
Unique books with genres: 13701
Coverage: 41.18%

MISSING BOOKS
Books in interactions but NOT in genres dataset: 19574
Sample missing book_ids: ['175505', '121839', '171922', '171902', '62348', '57930', '143553', '17414', '182045', '23328']

BOOKS WITHOUT GENRES 
Total interactions without genres: 28268
Unique books without genres: 19574
Sample book_ids: ['1634' '56315' '70345' '108843' '21551' '12972' '1010' '7390' '8267'
 '2593']


In [9]:
# check if missing books actually exist in full genres dataset
import random
sample_missing = ['39430', '726', '85930', '21178', '75624']

print("Checking if missing books exist in genres dataset:")
for book_id in sample_missing:
    exists = book_id in genres_df['book_id'].values
    print(f"Book {book_id}: {'Found' if exists else 'NOT FOUND'}")

# also check: do these books have ratings/interactions?
print("\nInfo about missing books")
missing_books_df = interactions_df[interactions_df['book_id'].isin(sample_missing)]
print(missing_books_df[['user_id', 'book_id', 'is_read', 'rating', 'is_reviewed']])


Checking if missing books exist in genres dataset:
Book 39430: NOT FOUND
Book 726: NOT FOUND
Book 85930: NOT FOUND
Book 21178: NOT FOUND
Book 75624: NOT FOUND

Info about missing books
        user_id book_id  is_read  rating  is_reviewed
29343        65   21178        0       0            0
166867      374   85930        0       0            0
464686      962   75624        0       0            0
166551      372   39430        1       0            0
222           0     726        1       3            1


In [7]:
# check genre converage for read books with ratings

# filter for read books with ratings
read_with_ratings = merged_df[(merged_df['is_read'] == 1) & (merged_df['rating'] > 0)]

print(f"READ BOOKS WITH RATINGS")
print(f"Total read books with ratings: {len(read_with_ratings)}")

# check how many have genres
with_genres = read_with_ratings[read_with_ratings['top_3_genres'].notna()]
without_genres = read_with_ratings[read_with_ratings['top_3_genres'].isna()]

print(f"\nWith genre info: {len(with_genres)} ({len(with_genres)/len(read_with_ratings)*100:.2f}%)")
print(f"Without genre info: {len(without_genres)} ({len(without_genres)/len(read_with_ratings)*100:.2f}%)")

# check unique books
print(f"\nUNIQUE BOOKS")
print(f"Total unique books (read + rated): {read_with_ratings['book_id'].nunique()}")
print(f"Unique books with genres: {with_genres['book_id'].nunique()}")
print(f"Unique books without genres: {without_genres['book_id'].nunique()}")
print(f"Coverage: {with_genres['book_id'].nunique()/read_with_ratings['book_id'].nunique()*100:.2f}%")

READ BOOKS WITH RATINGS
Total read books with ratings: 23688

With genre info: 10348 (43.68%)
Without genre info: 13340 (56.32%)

UNIQUE BOOKS
Total unique books (read + rated): 16417
Unique books with genres: 6856
Unique books without genres: 9561
Coverage: 41.76%


In [None]:
# segment recommendation strategy
content_based_pool = merged_df[
    (merged_df['is_read'] == 1) &
    (merged_df['rating'] > 0) &
    (merged_df['top_3_genres'].notna())
]

collaborative_only_pool = merged_df[
    (merged_df['is_read'] == 1) &
    (merged_df['rating'] > 0) &
    (merged_df['top_3_genres'].isna())
]

print(f"Content-based pool: {len(content_based_pool)} interactions, {content_based_pool['book_id'].nunique()} unique books")
print(f"Collaborative-only pool: {len(collaborative_only_pool)} interactions, {collaborative_only_pool['book_id'].nunique()} unique books")

Content-based pool: 10348 interactions, 6856 unique books
Collaborative-only pool: 13340 interactions, 9561 unique books


In [8]:
from collections import Counter

# use only read + rated books with genres
user_genre_data = merged_df[
    (merged_df['is_read'] == 1) &
    (merged_df['rating'] > 0) &
    (merged_df['top_3_genres'].notna())
].copy()

print(f"analysing {len(user_genre_data)} interactions with genre info\n")

# Count all genres from top_3_genres lists
all_user_genres = Counter()

for genres_list in user_genre_data['top_3_genres']:
    if isinstance(genres_list, list):
        for genre in genres_list:
            all_user_genres[genre] += 1

print(" TOP 10 GENRES BY INTERACTION COUNT ")
for i, (genre, count) in enumerate(all_user_genres.most_common(10), 1):
    pct = count / len(user_genre_data) * 100
    print(f"{i}. {genre}: {count:,} interactions ({pct:.1f}%)")

analysing 10348 interactions with genre info

 TOP 10 GENRES BY INTERACTION COUNT 
1. fiction: 6,892 interactions (66.6%)
2. history, historical fiction, biography: 4,831 interactions (46.7%)
3. non-fiction: 4,237 interactions (40.9%)
4. fantasy, paranormal: 2,259 interactions (21.8%)
5. mystery, thriller, crime: 1,724 interactions (16.7%)
6. romance: 1,379 interactions (13.3%)
7. children: 1,114 interactions (10.8%)
8. young-adult: 1,093 interactions (10.6%)
9. poetry: 859 interactions (8.3%)
10. comics, graphic: 684 interactions (6.6%)
