# Books Recommender for Children using GoodReads data
August 9th, 2025

This notebook loads the raw GoodReads children's-book data, aggregates the most-voted reviews for each book, and produces a clean, analysis-ready dataset for further modeling or recommendation tasks.

---

## Inputs

- **Raw Data:** Multiple JSON Lines files in `data/all/`; each line of a file is an individual JSON record.
- **DataFrames:**
  - `books_df`: Book metadata (ISBN, book_id, title, average_rating, description, etc.)
  - `reviews_df`: User reviews (user_id, book_id, review_id, rating, review_text, n_votes)

---

## Processing Steps

1. **Data Loading:** All JSON files are read into pandas DataFrames.
2. **Book Data Preparation:** Selected columns from the books DataFrame are kept, and `publication_year` is converted to an integer, with missing years encoded as `-1`.
3. **Review Aggregation:**
   - Count total reviews per book.
   - Filter reviews to only those with `n_votes > 0`.
   - For each book, select the top 20 reviews by number of votes.
   - Aggregate these reviews: concatenate the review texts (separated by ` || `) and sum their votes.
   - Merge review aggregates with review counts.
4. **Final Dataset Creation:** Merge the processed reviews with selected book metadata.
5. **Exploratory Analysis:** 
   - Check for missing values.
   - Display summary statistics for numeric columns.
   - Visualize distributions of key numeric features.
6. **Output:** Save the final merged DataFrame as a CSV file in `data/clean/books_with_reviews.csv`.

---

## Outputs

- **books_with_reviews.csv:** A cleaned, aggregated dataset containing book metadata, review counts, concatenated top reviews, and total votes for those reviews.

---

This notebook prepares the data for further modeling or recommendation tasks.

In [None]:
import pandas as pd
import numpy as np
import glob
import os

In [None]:
# Show full cell contents when displaying DataFrames instead of truncating long strings
pd.options.display.max_colwidth = None

In [None]:


# Raw data folder, resolved relative to the parent of the current working directory
json_dir = os.path.join(os.getcwd(), '..', 'data/all/')

# Collect every *.json file found in that folder
json_files = glob.glob(os.path.join(json_dir, '*.json'))
print(f"Found {len(json_files)} JSON files in {json_dir}")
print("Files:", json_files)

# Load each JSON-lines file into a DataFrame, keyed by the file's base name without extension
dataframes = {
    os.path.splitext(os.path.basename(path))[0]: pd.read_json(path, lines=True)
    for path in json_files
}

# Report the shape of every DataFrame that was loaded
for key in dataframes:
    print(f"{key}: {dataframes[key].shape}")

In [None]:
# Inspect the full dictionary of loaded DataFrames (keyed by source file name without extension)
dataframes

In [None]:
# List the columns available in the 'goodreads_books_children' DataFrame
dataframes['goodreads_books_children'].columns

In [None]:
# Keep only the book metadata columns used in the rest of the notebook.
# .copy() makes books_df an independent DataFrame rather than a slice of the loaded
# frame, so later column assignments on books_df do not raise SettingWithCopyWarning
# or risk writing into a temporary object.
book_columns = [
    'isbn', 'book_id', 'title', 'average_rating', 'similar_books', 'description',
    'format', 'link', 'authors', 'publisher', 'num_pages', 'publication_year',
]
books_df = dataframes['goodreads_books_children'][book_columns].copy()
books_df

In [None]:
# Keep only the review columns used in the rest of the notebook
review_columns = ['user_id', 'book_id', 'review_id', 'rating', 'review_text', 'n_votes']
reviews_df = dataframes['goodreads_reviews_children'][review_columns]
reviews_df

In [None]:
# Convert publication_year to integers, encoding missing or unparseable years as -1.
# pd.to_numeric(errors='coerce') turns empty strings (and any other non-numeric text)
# into NaN instead of raising, and assign() returns a new DataFrame so the column is
# never written into a slice of another frame.
books_df = books_df.assign(
    publication_year=pd.to_numeric(books_df['publication_year'], errors='coerce')
    .fillna(-1)
    .astype(int)
)

In [None]:
# Books published after 2010 with an average rating above 4.5; show the 10 newest
published_after_2010 = books_df['publication_year'] > 2010
rated_above_4_5 = books_df['average_rating'] > 4.5
books_df[published_after_2010 & rated_above_4_5].sort_values(by='publication_year', ascending=False).head(10)

In [None]:
# Books with an average rating above 4.9 published after 2010; show the 10 newest
rated_above_4_9 = books_df['average_rating'] > 4.9
newer_than_2010 = books_df['publication_year'] > 2010
books_df[rated_above_4_9 & newer_than_2010].sort_values(by='publication_year', ascending=False).head(10)

In [None]:
# All books whose title contains 'The Enchanted Wood' (case-insensitive), newest first;
# rows with a missing title are treated as non-matching
title_match = books_df['title'].str.contains('The Enchanted Wood', case=False, na=False)
books_df[title_match].sort_values(by='publication_year', ascending=False)

In [None]:
# Raw row values of the book(s) whose ISBN equals '0689304099'
isbn_match = books_df['isbn'] == '0689304099'
books_df.loc[isbn_match].values

In [None]:
# Raw row values of every review for book 2426418.
# Other cells in this notebook compare book_id with an integer, and pd.read_json
# infers numeric dtypes by default, so comparing against the string '2426418' directly
# can silently match nothing. Casting to str first works whether book_id was loaded
# as integers or as strings.
book_id_match = reviews_df['book_id'].astype(str) == '2426418'
reviews_df[book_id_match].values

In [None]:
# Number of reviews each book received
review_counts = (
    reviews_df
    .groupby('book_id')
    .size()
    .reset_index(name='num_reviews')
)

# Attach book titles to the counts (books without a match are dropped by the inner merge)
popular_books = review_counts.merge(books_df[['book_id', 'title']], on='book_id')

# Rank books from most to least reviewed
popular_books_sorted = popular_books.sort_values(by='num_reviews', ascending=False)

# Histogram of review counts, restricted to books with more than 10 reviews
import matplotlib.pyplot as plt
reviews_per_book = review_counts['num_reviews']
plt.figure(figsize=(8, 5))
plt.hist(reviews_per_book[reviews_per_book > 10], bins=30)
plt.title('Distribution of Reviews per Book')
plt.xlabel('Number of Reviews per Book')
plt.ylabel('Number of Books')
plt.show()

# The 10 most reviewed books
popular_books_sorted.head(10)

In [None]:
# Inspect the per-book review counts table
review_counts

In [None]:
# Raw row values of the book with book_id 5
is_book_5 = books_df['book_id'] == 5
books_df.loc[is_book_5].values

In [None]:
# Rating, text and vote count of every review for book_id 5, most-voted first
book_5_rows = reviews_df['book_id'] == 5
reviews_df.loc[book_5_rows, ['rating', 'review_text', 'n_votes']].sort_values(by='n_votes', ascending=False)

In [None]:
# Step 1: total number of reviews per book, counted before any filtering
review_counts = reviews_df.groupby('book_id').size().reset_index(name='n_reviews')

# Step 2: keep only reviews that received at least one vote
filtered_reviews = reviews_df[reviews_df['n_votes'] > 0]

# Step 3: for each book, keep the 20 reviews with the most votes
# (rows are sorted by votes within each book, then the first 20 per group are taken)
top_reviews = (
    filtered_reviews
    .sort_values(['book_id', 'n_votes'], ascending=[True, False])
    .groupby('book_id')
    .head(20)
)

# Step 4: one row per book with the joined review texts and the total votes of those
# top reviews. Missing review texts are dropped and the rest are cast to str, because
# str.join raises TypeError on NaN/None or any other non-string value.
agg_reviews = (
    top_reviews
    .groupby('book_id')
    .agg(
        review_texts=('review_text', lambda texts: ' || '.join(texts.dropna().astype(str))),
        sum_n_votes=('n_votes', 'sum'),
    )
    .reset_index()
)

# Step 5: left-merge onto the counts so books whose reviews received no votes are kept;
# for those books review_texts and sum_n_votes are NaN
final_reviews_df = pd.merge(review_counts, agg_reviews, on='book_id', how='left')

# Show the result
final_reviews_df.head()

In [None]:
# Book metadata columns carried into the final dataset
metadata_columns = ['book_id', 'title', 'average_rating', 'description', 'publication_year']
books_selected = books_df[metadata_columns]

# Left join on book_id: every selected book is kept, with review aggregates where available
books_with_reviews = books_selected.merge(final_reviews_df, how='left', on='book_id')

# Preview the books with the highest review counts
books_with_reviews.sort_values(by='n_reviews', ascending=False).head()

In [None]:
# Last rows when books are ordered by review count, highest first
ranked_by_reviews = books_with_reviews.sort_values(by='n_reviews', ascending=False)
ranked_by_reviews.tail()

In [None]:
# Count missing values in every column
missing_summary = books_with_reviews.isnull().sum()
print("Missing values per column:\n", missing_summary)

# Summary statistics as produced by DataFrame.describe()
numeric_summary = books_with_reviews.describe()
print("\nSummary statistics for numeric columns:\n", numeric_summary)

# One histogram per numeric feature; columns absent from the DataFrame are skipped
import matplotlib.pyplot as plt

numeric_cols = ['average_rating', 'publication_year', 'n_reviews', 'sum_n_votes']
for col in numeric_cols:
    if col not in books_with_reviews.columns:
        continue
    plt.figure(figsize=(7, 4))
    books_with_reviews[col].hist(bins=30)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.show()

In [None]:
# Save the final DataFrame as a CSV in data/clean (resolved relative to the parent of
# the current working directory)
output_path = os.path.join(os.getcwd(), '..', 'data', 'clean', 'books_with_reviews.csv')
# Create the target folder first: to_csv does not create missing directories and
# would otherwise fail on a fresh checkout
os.makedirs(os.path.dirname(output_path), exist_ok=True)
books_with_reviews.to_csv(output_path, index=False)
print(f"Saved to {output_path}")

In [None]:
# List the columns of the final merged DataFrame
books_with_reviews.columns

In [None]:
# 95th percentile of the character length of the concatenated review text per book;
# books without review text count as length 0
review_text_lengths = books_with_reviews['review_texts'].fillna('').astype(str).map(len)
np.percentile(review_text_lengths, 95)