In [1]:
import json
import gzip
import pandas as pd

In [2]:
# load from raw_data
with gzip.open('../raw_data/goodreads_book_genres_initial.json.gz', 'rt', encoding='utf-8') as f:
    genres_data = [json.loads(line) for line in f]

genres_df = pd.DataFrame(genres_data)

In [None]:
# basic info
print("Dataset shape:", genres_df.shape)
print("\nColumn names:")
print(genres_df.columns.tolist())
print("\nData types:")
print(genres_df.dtypes)
print("\nFirst few rows:")
print(genres_df.head())

Dataset shape: (2360655, 2)

Column names:
['book_id', 'genres']

Data types:
book_id    object
genres     object
dtype: object

First few rows:
   book_id                                             genres
0  5333265      {'history, historical fiction, biography': 1}
1  1333909  {'fiction': 219, 'history, historical fiction,...
2  7327624  {'fantasy, paranormal': 31, 'fiction': 8, 'mys...
3  6066819  {'fiction': 555, 'romance': 23, 'mystery, thri...
4   287140                                 {'non-fiction': 3}


In [None]:
# check data types
print("Data types:")
print(genres_df.dtypes)
print()

# check actual type of book_id values
print("book_id actual type:")
print(f"Type: {type(genres_df['book_id'].iloc[0])}")
print(f"Sample values: {genres_df['book_id'].head()}")
print()

# check actual type of genres values
print("genres actual type:")
print(f"Type: {type(genres_df['genres'].iloc[0])}")
print(f"Sample value: {genres_df['genres'].iloc[0]}")

Data types:
book_id               object
genres                object
top_3_genres          object
significant_genres    object
filtered_genres       object
dtype: object

book_id actual type:
Type: <class 'str'>
Sample values: 0    5333265
1    1333909
2    7327624
3    6066819
4     287140
Name: book_id, dtype: object

genres actual type:
Type: <class 'dict'>
Sample value: {'history, historical fiction, biography': 1}


In [11]:
# missing values
print(genres_df.isnull().sum())
print(f"\nTotal rows with any missing values: {genres_df.isnull().any(axis=1).sum()}")

# check for empty genres dictionaries
empty_genres = genres_df['genres'].apply(lambda x: len(x) == 0 if isinstance(x, dict) else True)
print(f"Books with empty genres dictionary: {empty_genres.sum()}")

# 3. Duplicate book_ids
print(f"Total books: {len(genres_df):,}")
print(f"Unique book_ids: {genres_df['book_id'].nunique():,}")
print(f"Duplicate book_ids: {genres_df['book_id'].duplicated().sum():,}")

# Show duplicate examples if any
if genres_df['book_id'].duplicated().sum() > 0:
    print("\nExample duplicates:")
    duplicate_ids = genres_df[genres_df['book_id'].duplicated(keep=False)]['book_id'].unique()[:5]
    for book_id in duplicate_ids:
        print(f"\nBook ID {book_id}:")
        print(genres_df[genres_df['book_id'] == book_id])

book_id               0
genres                0
top_3_genres          0
significant_genres    0
filtered_genres       0
dtype: int64

Total rows with any missing values: 0
Books with empty genres dictionary: 409513
Total books: 2,360,655
Unique book_ids: 2,360,655
Duplicate book_ids: 0


In [None]:

# dive into the empty genres issues
print(f"Books with empty genres: {409513:,}")
print(f"Percentage: {409513/2360655*100:.2f}%")

# check examples
empty_books = genres_df[genres_df['genres'].apply(lambda x: len(x) == 0)]
print("\nExample books with no genres:")
print(empty_books.head(5))

Books with empty genres: 409,513
Percentage: 17.35%

Example books with no genres:
     book_id genres top_3_genres significant_genres filtered_genres
13  28575155     {}           []                 []              {}
20  30227122     {}           []                 []              {}
26    287142     {}           []                 []              {}
27  16037548     {}           []                 []              {}
36  24994796     {}           []                 []              {}


In [15]:
# load the same sample
df_large = pd.read_csv('../raw_data/goodreads_interactions.csv', nrows=500000)
interactions_df = df_large.sample(n=50000, random_state=42)  # Same random_state = same sample

print(f"Loaded interactions: {interactions_df.shape}")
print(f"Unique books in interactions: {interactions_df['book_id'].nunique()}")

Loaded interactions: (50000, 5)
Unique books in interactions: 33275


In [16]:
# check the overlapping between empty genres and 50k samples from interactions

# books with empty genres
empty_book_ids = set(empty_books['book_id'])

# books in the interactions sample
interactions_book_ids = set(interactions_df['book_id'].unique())

# check overlap
overlap = empty_book_ids & interactions_book_ids
print(f"\nEmpty-genre books in the interactions: {len(overlap)}")
print(f"Total books in interactions: {len(interactions_book_ids)}")
print(f"Percentage: {len(overlap)/len(interactions_book_ids)*100:.2f}%")

# also check: how many interaction books HAVE genres?
books_with_genres = interactions_book_ids - empty_book_ids
print(f"\nInteraction books with genres: {len(books_with_genres)}")
print(f"Coverage: {len(books_with_genres)/len(interactions_book_ids)*100:.2f}%")


Empty-genre books in the interactions: 0
Total books in interactions: 33275
Percentage: 0.00%

Interaction books with genres: 33275
Coverage: 100.00%


In [4]:
# understand more about the 'genres' column

# look at one book in detail
example_book = genres_df.iloc[3]  # Book 6066819
print(f"Book ID: {example_book['book_id']}")
print(f"\nGenres with user counts:")
for genre, count in example_book['genres'].items():
    print(f"  {genre}: {count} users")

# Total users who tagged this book
total_tags = sum(example_book['genres'].values())
print(f"\nTotal genre tags: {total_tags}")

Book ID: 6066819

Genres with user counts:
  fiction: 555 users
  romance: 23 users
  mystery, thriller, crime: 10 users

Total genre tags: 588


In [None]:
# decide which options to use:

# Compare approaches

# 1: keep top 3 genres per book
def get_top_n_genres(genres_dict, n=3):
    sorted_genres = sorted(genres_dict.items(), key=lambda x: x[1], reverse=True)
    return [genre for genre, count in sorted_genres[:n]]

genres_df['top_3_genres'] = genres_df['genres'].apply(lambda x: get_top_n_genres(x, n=3))

# 2: Only keep genre tags > = 10
def get_significant_genres(genres_dict, min_tags=10):
    return [genre for genre, count in genres_dict.items() if count >= min_tags]

genres_df['significant_genres'] = genres_df['genres'].apply(lambda x: get_significant_genres(x, min_tags=10))

# 3: Keep all the genres but use the counts as weights for the recommeder - this is more complex which it's not required for mvp

print("Book 6066819 examples:")
print(f"Top 1 only: {get_top_n_genres(genres_df.iloc[3]['genres'], n=1)}")
print(f"Top 3: {get_top_n_genres(genres_df.iloc[3]['genres'], n=3)}")
print(f"Threshold >= 10: {get_significant_genres(genres_df.iloc[3]['genres'], min_tags=10)}")


Book 6066819 examples:
Top 1 only: ['fiction']
Top 3: ['fiction', 'romance', 'mystery, thriller, crime']
Threshold >= 10: ['fiction', 'romance', 'mystery, thriller, crime']


Use genree tags > = 10 but check the following first 


In [7]:
# Check how much data lost with threshold
threshold = 10
genres_df['filtered_genres'] = genres_df['genres'].apply(
    lambda x: {g: c for g, c in x.items() if c >= threshold}
)

# Compare
print("Original average genres per book:", genres_df['genres'].apply(len).mean())
print("Filtered average genres per book:", genres_df['filtered_genres'].apply(len).mean())
print("Books with no genres after filtering:", (genres_df['filtered_genres'].apply(len) == 0).sum())

Original average genres per book: 2.136077486968659
Filtered average genres per book: 1.021288583041571
Books with no genres after filtering: 1245513


In [None]:
# probably makes more sense to go with option 1

# function to get top N genres
def get_top_n_genres(genres_dict, n=3):
    """Get top N genres by user tag count"""
    sorted_genres = sorted(genres_dict.items(), key=lambda x: x[1], reverse=True)
    return [genre for genre, count in sorted_genres[:n]]

# apply to dataset
genres_df['top_3_genres'] = genres_df['genres'].apply(lambda x: get_top_n_genres(x, n=3))

# verify
print("Examples:")
for i in range(5):
    book = genres_df.iloc[i]
    print(f"\nBook {book['book_id']}:")
    print(f"  Original: {book['genres']}")
    print(f"  Top 3: {book['top_3_genres']}")

# statistics
print(f"\nAverage genres per book: {genres_df['top_3_genres'].apply(len).mean():.2f}")
print(f"Books with < 3 genres: {(genres_df['top_3_genres'].apply(len) < 3).sum()}")

Examples:

Book 5333265:
  Original: {'history, historical fiction, biography': 1}
  Top 3: ['history, historical fiction, biography']

Book 1333909:
  Original: {'fiction': 219, 'history, historical fiction, biography': 5}
  Top 3: ['fiction', 'history, historical fiction, biography']

Book 7327624:
  Original: {'fantasy, paranormal': 31, 'fiction': 8, 'mystery, thriller, crime': 1, 'poetry': 1}
  Top 3: ['fantasy, paranormal', 'fiction', 'mystery, thriller, crime']

Book 6066819:
  Original: {'fiction': 555, 'romance': 23, 'mystery, thriller, crime': 10}
  Top 3: ['fiction', 'romance', 'mystery, thriller, crime']

Book 287140:
  Original: {'non-fiction': 3}
  Top 3: ['non-fiction']

Average genres per book: 1.85
Books with < 3 genres: 1438423


Look at top 10 popular genres for all the books in the file: 

- Option #1: Most frequently tagged genres (by total user tags)
- Option #2: Most common genres (by # of books)

In [None]:
#option #1

from collections import Counter

# Collect all genre tags with their counts
all_genre_counts = Counter()

for genres_dict in genres_df['genres']:
    for genre, count in genres_dict.items():
        all_genre_counts[genre] += count

# Top 10 by total user tags
print("Top 10 genres by total user tags:")
for i, (genre, count) in enumerate(all_genre_counts.most_common(10), 1):
    print(f"{i}. {genre}: {count:,} tags")

Top 10 genres by total user tags:
1. fiction: 396,699,795 tags
2. fantasy, paranormal: 261,701,592 tags
3. young-adult: 113,748,515 tags
4. history, historical fiction, biography: 102,232,444 tags
5. mystery, thriller, crime: 94,226,447 tags
6. romance: 83,142,002 tags
7. children: 57,752,225 tags
8. non-fiction: 48,078,235 tags
9. comics, graphic: 19,114,748 tags
10. poetry: 6,897,839 tags


In [None]:
#option #2

# Count how many books have each genre
genre_book_count = Counter()

for genres_dict in genres_df['genres']:
    for genre in genres_dict.keys():
        genre_book_count[genre] += 1

# Top 10 by book count
print("\nTop 10 genres by number of books:")
for i, (genre, count) in enumerate(genre_book_count.most_common(10), 1):
    print(f"{i}. {genre}: {count:,} books")


Top 10 genres by number of books:
1. fiction: 1,244,112 books
2. history, historical fiction, biography: 663,795 books
3. romance: 658,719 books
4. fantasy, paranormal: 538,311 books
5. non-fiction: 533,491 books
6. mystery, thriller, crime: 523,156 books
7. young-adult: 364,114 books
8. children: 256,935 books
9. comics, graphic: 171,279 books
10. poetry: 88,630 books


Data Summary


Data structure: 
- 2 columns book_id (string), genres (dict)
- No duplicates or missing values 
- Unique book_ids: 2,360,655
- Books with empty genres: 409,513 (17% of full dataset), no overlap with the 50k samples  


Key Findings:
- Each book can have multiple genres with user tag counts
- For simplicity, we capture the top 3 genres per book
- Some genres are combined tags (e,g. “mystery, thriller, crime”) 
Top 10 genres by number of books:
1. fiction: 1,244,112 books
2. history, historical fiction, biography: 663,795 books
3. romance: 658,719 books
4. fantasy, paranormal: 538,311 books
5. non-fiction: 533,491 books
6. mystery, thriller, crime: 523,156 books
7. young-adult: 364,114 books
8. children: 256,935 books
9. comics, graphic: 171,279 books
10. poetry: 88,630 books


Next steps: 
- Merge genres with the 50k interaction sample
- Analyse user preferences by genre
- Build user-genre profiles for content-based recommendations 


