# Book Recommendation Dataset - Data Exploration

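This notebook explores the book recommendation dataset to understand its structure, its missing values, and the distributions of its key fields (text lengths, genres, publication years, authors, description vocabulary, and ratings).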

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# fall back to the legacy 'seaborn' name so the cell also runs on older installs.
_preferred_style = 'seaborn-v0_8'
plt.style.use(_preferred_style if _preferred_style in plt.style.available else 'seaborn')
sns.set_palette("husl")

print("Libraries imported successfully!")

## 1. Data Loading and Basic Information

In [None]:
# Load datasets: prefer the processed train/val/test splits, fall back to the
# raw books CSV, and finally to an empty frame so that later cells can guard
# on `df.empty`.
try:
    train_df = pd.read_csv('../data/processed/train.csv')
    val_df = pd.read_csv('../data/processed/val.csv')
    test_df = pd.read_csv('../data/processed/test.csv')
except FileNotFoundError:
    print("Processed data not found. Please run data preprocessing first.")
    # Try loading raw data instead
    try:
        df = pd.read_csv('../data/raw/goodreads/books.csv')
    except FileNotFoundError:
        print("No data found. Please collect data first.")
        df = pd.DataFrame()  # Empty dataframe for demonstration
    else:
        print(f"Loaded raw data with {len(df)} books")
else:
    # All three splits were read successfully: report their sizes.
    n_train, n_val, n_test = len(train_df), len(val_df), len(test_df)
    print(f"Training set: {n_train} books")
    print(f"Validation set: {n_val} books")
    print(f"Test set: {n_test} books")
    print(f"Total: {n_train + n_val + n_test} books")

    # Combine for exploration
    df = pd.concat([train_df, val_df, test_df], ignore_index=True)
    print(f"\nCombined dataset shape: {df.shape}")

In [None]:
# Basic dataset information
if not df.empty:
    print("Dataset Info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    df.info()
    print("\nColumn names:")
    print(df.columns.tolist())
    print("\nFirst few rows:")
    display(df.head())

## 2. Missing Data Analysis

In [None]:
if not df.empty:
    # Missing data analysis: per-column null count and its share of all rows.
    missing_data = df.isnull().sum()
    missing_percentage = (missing_data / len(df)) * 100

    missing_df = pd.DataFrame({
        'Missing Count': missing_data,
        'Percentage': missing_percentage
    }).sort_values('Percentage', ascending=False)

    # Only columns that actually have gaps are listed and plotted.
    missing_cols = missing_df[missing_df['Missing Count'] > 0]

    print("Missing Data Summary:")
    if missing_cols.empty:
        print("No missing data found!")
    else:
        display(missing_cols)

        # Visualize missing data. The figure is created only on this branch so
        # that a dataset without gaps does not render a blank figure.
        plt.figure(figsize=(12, 6))
        plt.bar(missing_cols.index, missing_cols['Percentage'])
        plt.title('Missing Data Percentage by Column')
        plt.xlabel('Columns')
        plt.ylabel('Missing Percentage (%)')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

## 3. Text Length Analysis

In [None]:
if not df.empty:
    # Analyze text length for key fields
    text_columns = ['title', 'author', 'description']

    # Character counts per field (nulls dropped), computed once and reused for
    # both the printed statistics and the histograms. Columns that are absent
    # from df are skipped.
    text_lengths = {
        col: df[col].dropna().astype(str).str.len()
        for col in text_columns
        if col in df.columns
    }

    for col, lengths in text_lengths.items():
        print(f"\n{col.upper()} Length Statistics:")
        print(f"Mean: {lengths.mean():.1f}")
        print(f"Median: {lengths.median():.1f}")
        print(f"Min: {lengths.min()}")
        print(f"Max: {lengths.max()}")
        print(f"95th percentile: {lengths.quantile(0.95):.1f}")

    # Visualize text lengths (skipped entirely when none of the columns exist)
    if text_lengths:
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle('Text Length Distributions', fontsize=16)
        panels = axes.flatten()  # row-major: (0,0), (0,1), (1,0), (1,1)

        for ax, (col, lengths) in zip(panels, text_lengths.items()):
            # Histogram
            ax.hist(lengths, bins=50, alpha=0.7, edgecolor='black')
            ax.set_title(f'{col.capitalize()} Length Distribution')
            ax.set_xlabel('Character Count')
            ax.set_ylabel('Frequency')

            # Mark mean and median with labelled vertical lines
            ax.axvline(lengths.mean(), color='red', linestyle='--', label=f'Mean: {lengths.mean():.0f}')
            ax.axvline(lengths.median(), color='green', linestyle='--', label=f'Median: {lengths.median():.0f}')
            ax.legend()

        # Remove every panel that received no data, not just the last one, so a
        # missing column does not leave a blank subplot behind.
        for unused_ax in panels[len(text_lengths):]:
            unused_ax.remove()

        plt.tight_layout()
        plt.show()

## 4. Genre Analysis

In [None]:
if not df.empty and 'genre' in df.columns:
    # Analyze genres
    genres_series = df['genre'].dropna()

    # Split each comma-separated genre string into normalized (stripped,
    # lower-cased) tags. Empty fragments, e.g. from "fiction,,fantasy" or a
    # trailing comma, are dropped so they are not counted as a genre named "".
    genre_lists = [
        [g.strip().lower() for g in str(genre_str).split(',') if g.strip()]
        for genre_str in genres_series
    ]
    # Keep only books that ended up with at least one tag.
    genre_lists = [tags for tags in genre_lists if tags]

    if not genre_lists:
        # Without this guard zip(*top_genres) and max(genres_per_book) below
        # would raise on empty input.
        print("Genre column contains no usable values.")
    else:
        genre_counts = Counter(tag for tags in genre_lists for tag in tags)

        print(f"Total unique genres: {len(genre_counts)}")
        print(f"\nTop 20 most common genres:")

        top_genres = genre_counts.most_common(20)
        for genre, count in top_genres:
            # Percentage is relative to all rows of df, including books
            # that have no genre value.
            print(f"{genre:20s}: {count:4d} ({count/len(df)*100:.1f}%)")

        # Visualize top genres
        plt.figure(figsize=(12, 8))
        top_labels, top_counts = zip(*top_genres)
        plt.barh(range(len(top_labels)), top_counts)
        plt.yticks(range(len(top_labels)), top_labels)
        plt.xlabel('Number of Books')
        plt.title('Top 20 Most Common Genres')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.show()

        # Genre distribution per book
        genres_per_book = [len(tags) for tags in genre_lists]

        plt.figure(figsize=(10, 6))
        # One unit-wide bin per integer count: [1, 2), [2, 3), ...
        plt.hist(genres_per_book, bins=range(1, max(genres_per_book)+2), alpha=0.7, edgecolor='black')
        plt.xlabel('Number of Genres per Book')
        plt.ylabel('Number of Books')
        plt.title('Distribution of Number of Genres per Book')
        plt.xticks(range(1, max(genres_per_book)+1))
        plt.show()

        print(f"\nAverage genres per book: {np.mean(genres_per_book):.2f}")
        print(f"Median genres per book: {np.median(genres_per_book):.1f}")

## 5. Publication Year Analysis

In [None]:
if not df.empty and 'publication_year' in df.columns:
    # Clean and analyze publication years: non-numeric entries are coerced to
    # NaN and dropped.
    pub_years = pd.to_numeric(df['publication_year'], errors='coerce').dropna()

    if pub_years.empty:
        print("No numeric publication years found.")
    else:
        print(f"Publication Year Statistics:")
        print(f"Earliest: {pub_years.min():.0f}")
        print(f"Latest: {pub_years.max():.0f}")
        print(f"Mean: {pub_years.mean():.1f}")
        print(f"Median: {pub_years.median():.0f}")

        # Filter reasonable years (remove outliers)
        reasonable_years = pub_years[(pub_years >= 1800) & (pub_years <= 2025)]

        if reasonable_years.empty:
            # With nothing to plot, the box plot below would be handed an empty
            # data list, so the figure is skipped altogether.
            print("No publication years between 1800 and 2025; skipping plots.")
        else:
            plt.figure(figsize=(15, 10))

            # Histogram of publication years
            plt.subplot(2, 2, 1)
            plt.hist(reasonable_years, bins=50, alpha=0.7, edgecolor='black')
            plt.xlabel('Publication Year')
            plt.ylabel('Number of Books')
            plt.title('Distribution of Publication Years')

            # Books by decade (e.g. 1987 -> 1980)
            plt.subplot(2, 2, 2)
            decades = (reasonable_years // 10) * 10
            decade_counts = decades.value_counts().sort_index()
            plt.bar(decade_counts.index, decade_counts.values, width=8, alpha=0.7, edgecolor='black')
            plt.xlabel('Decade')
            plt.ylabel('Number of Books')
            plt.title('Books Published by Decade')
            plt.xticks(rotation=45)

            # Recent years (2000+); the panel stays blank when there are none
            plt.subplot(2, 2, 3)
            recent_years = reasonable_years[reasonable_years >= 2000]
            if len(recent_years) > 0:
                year_counts = recent_years.value_counts().sort_index()
                plt.plot(year_counts.index, year_counts.values, marker='o', linewidth=2)
                plt.xlabel('Year')
                plt.ylabel('Number of Books')
                plt.title('Books Published 2000-Present')
                plt.xticks(rotation=45)

            # Box plot by decade
            plt.subplot(2, 2, 4)
            decade_labels = sorted(decades.unique())
            decade_data = [reasonable_years[decades == decade] for decade in decade_labels]
            plt.boxplot(decade_data)
            # Boxes are drawn at positions 1..N by default; labelling them via
            # xticks avoids boxplot's `labels=` keyword, which newer Matplotlib
            # releases deprecate in favour of `tick_labels=`.
            plt.xticks(range(1, len(decade_labels) + 1),
                       [f"{int(d)}s" for d in decade_labels],
                       rotation=45)
            plt.xlabel('Decade')
            plt.ylabel('Publication Year')
            plt.title('Publication Year Distribution by Decade')

            plt.tight_layout()
            plt.show()

## 6. Author Analysis

In [None]:
if not df.empty and 'author' in df.columns:
    # Work only with rows that actually name an author.
    authors = df['author'].dropna()
    # Books per author, most prolific first.
    author_counts = authors.value_counts()

    print(f"Total unique authors: {authors.nunique()}")
    print(f"Total books with author info: {len(authors)}")

    print(f"\nTop 15 most prolific authors:")
    for author, count in author_counts.head(15).items():
        print(f"{author:30s}: {count:3d} books")

    # Three side-by-side panels: histogram, top-20 bar chart, single/multi pie.
    fig, (ax_hist, ax_top, ax_pie) = plt.subplots(1, 3, figsize=(15, 5))

    # Panel 1: distribution of books per author (log-scaled y axis).
    ax_hist.hist(author_counts.values, bins=50, alpha=0.7, edgecolor='black')
    ax_hist.set_xlabel('Number of Books per Author')
    ax_hist.set_ylabel('Number of Authors')
    ax_hist.set_title('Distribution of Books per Author')
    ax_hist.set_yscale('log')

    # Panel 2: the 20 most prolific authors, names truncated to 20 characters.
    top_authors = author_counts.head(20)
    bar_positions = range(len(top_authors))
    short_names = []
    for name in top_authors.index:
        short_names.append(name[:20] + '...' if len(name) > 20 else name)
    ax_top.barh(bar_positions, top_authors.values)
    ax_top.set_yticks(bar_positions)
    ax_top.set_yticklabels(short_names)
    ax_top.set_xlabel('Number of Books')
    ax_top.set_title('Top 20 Most Prolific Authors')
    ax_top.invert_yaxis()

    # Panel 3: single book authors vs multi-book authors.
    single_book = (author_counts == 1).sum()
    multi_book = (author_counts > 1).sum()
    pie_labels = [f'Single Book\n({single_book})', f'Multiple Books\n({multi_book})']
    ax_pie.pie([single_book, multi_book], labels=pie_labels, autopct='%1.1f%%')
    ax_pie.set_title('Authors by Number of Books')

    plt.tight_layout()
    plt.show()

    n_authors = len(author_counts)
    print(f"\nAuthors with only one book: {single_book} ({single_book/n_authors*100:.1f}%)")
    print(f"Authors with multiple books: {multi_book} ({multi_book/n_authors*100:.1f}%)")

## 7. Word Cloud and Text Analysis

In [None]:
# Try to create word clouds (optional, requires wordcloud package)
try:
    from wordcloud import WordCloud

    if not df.empty and 'description' in df.columns:
        # Combine all descriptions
        all_descriptions = ' '.join(df['description'].dropna().astype(str))

        # Skip blank input: there is nothing to draw, and WordCloud.generate is
        # expected to reject text without words (TODO confirm against the
        # installed wordcloud version).
        if all_descriptions.strip():
            # Create word cloud
            wordcloud = WordCloud(width=800, height=400,
                                 background_color='white',
                                 max_words=100,
                                 colormap='viridis').generate(all_descriptions)

            plt.figure(figsize=(12, 6))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title('Word Cloud of Book Descriptions', fontsize=16)
            plt.show()
        else:
            print("No description text available. Skipping word cloud generation.")

except ImportError:
    print("WordCloud package not available. Skipping word cloud generation.")
    print("Install with: pip install wordcloud")

# Basic text analysis without wordcloud
if not df.empty and 'description' in df.columns:
    # Most common words in descriptions (case-insensitive)
    all_text = ' '.join(df['description'].dropna().astype(str).str.lower())

    # Simple word extraction (remove common stop words)
    stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
                  'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does',
                  'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that',
                  'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'}

    words = re.findall(r'\b[a-z]{3,}\b', all_text)  # Words with 3+ letters
    filtered_words = [word for word in words if word not in stop_words]

    word_counts = Counter(filtered_words)

    if not word_counts:
        # zip(*top_words) below cannot unpack an empty list, so stop here when
        # no word survived the filtering.
        print("\nNo words found in descriptions after filtering.")
    else:
        print(f"\nTop 20 most common words in descriptions:")
        for word, count in word_counts.most_common(20):
            print(f"{word:15s}: {count:5d}")

        # Visualize top words
        top_words = word_counts.most_common(15)
        top_labels, top_counts = zip(*top_words)

        plt.figure(figsize=(12, 6))
        plt.barh(range(len(top_labels)), top_counts)
        plt.yticks(range(len(top_labels)), top_labels)
        plt.xlabel('Frequency')
        plt.title('Top 15 Most Common Words in Book Descriptions')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.show()

## 8. Rating Analysis (if available)

In [None]:
if not df.empty and 'rating' in df.columns:
    # Coerce once and keep the full-index version: it stays aligned with the
    # other columns of df and is reused for the year/rating comparison below.
    rating_values = pd.to_numeric(df['rating'], errors='coerce')
    ratings = rating_values.dropna()

    if len(ratings) > 0:
        print(f"Rating Statistics:")
        print(f"Mean: {ratings.mean():.2f}")
        print(f"Median: {ratings.median():.2f}")
        print(f"Min: {ratings.min():.2f}")
        print(f"Max: {ratings.max():.2f}")
        print(f"Standard deviation: {ratings.std():.2f}")

        plt.figure(figsize=(15, 5))

        plt.subplot(1, 3, 1)
        plt.hist(ratings, bins=50, alpha=0.7, edgecolor='black')
        plt.xlabel('Rating')
        plt.ylabel('Number of Books')
        plt.title('Distribution of Book Ratings')
        plt.axvline(ratings.mean(), color='red', linestyle='--', label=f'Mean: {ratings.mean():.2f}')
        plt.legend()

        plt.subplot(1, 3, 2)
        plt.boxplot(ratings)
        plt.ylabel('Rating')
        plt.title('Box Plot of Ratings')

        plt.subplot(1, 3, 3)
        # Rating categories. include_lowest=True makes the first bin [0, 2] so
        # that ratings of exactly 0 are counted instead of silently dropped.
        rating_categories = pd.cut(ratings, bins=[0, 2, 3, 4, 5],
                                   labels=['Poor', 'Fair', 'Good', 'Excellent'],
                                   include_lowest=True)
        category_counts = rating_categories.value_counts()
        # Drop empty categories: they would only add overlapping 0.0% labels.
        category_counts = category_counts[category_counts > 0]
        if not category_counts.empty:
            plt.pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%')
        plt.title('Rating Categories')

        plt.tight_layout()
        plt.show()

        # Correlation with publication year
        if 'publication_year' in df.columns:
            pub_years = pd.to_numeric(df['publication_year'], errors='coerce')

            # Build the comparison frame from the coerced numeric values, not
            # the raw df columns, so string-typed columns still plot and fit.
            year_rating = pd.DataFrame({
                'publication_year': pub_years,
                'rating': rating_values,
            }).dropna()
            valid_data = year_rating[year_rating['publication_year'].between(1900, 2025)]

            if len(valid_data) > 0:
                plt.figure(figsize=(12, 6))
                plt.scatter(valid_data['publication_year'], valid_data['rating'], alpha=0.5)
                plt.xlabel('Publication Year')
                plt.ylabel('Rating')
                plt.title('Rating vs Publication Year')

                # Add trend line. A straight-line fit needs at least two
                # distinct x values; otherwise it is skipped.
                if valid_data['publication_year'].nunique() > 1:
                    z = np.polyfit(valid_data['publication_year'], valid_data['rating'], 1)
                    p = np.poly1d(z)
                    # The fit is linear, so its two end points define the line.
                    x_line = np.array([valid_data['publication_year'].min(),
                                       valid_data['publication_year'].max()])
                    plt.plot(x_line, p(x_line), "r--", alpha=0.8)

                correlation = valid_data['publication_year'].corr(valid_data['rating'])
                plt.text(0.05, 0.95, f'Correlation: {correlation:.3f}', transform=plt.gca().transAxes)

                plt.show()
    else:
        # The column exists but nothing in it could be parsed as a number.
        print("Rating column contains no numeric values.")
else:
    print("No rating information available in the dataset.")

## 9. Summary and Insights

In [None]:
if not df.empty:
    print("=" * 60)
    print("DATASET SUMMARY AND INSIGHTS")
    print("=" * 60)

    print(f"📚 Total Books: {len(df):,}")

    # Flags set below; they gate the closing statements so the summary does
    # not claim coverage for fields the dataset lacks.
    has_year_data = False
    has_text_data = False

    if 'author' in df.columns:
        unique_authors = df['author'].nunique()
        print(f"✍️  Unique Authors: {unique_authors:,}")
        # Average over books that actually name an author; rows without one
        # would inflate the ratio. Skipped when there are no authors at all
        # to avoid dividing by zero.
        books_with_author = df['author'].notna().sum()
        if unique_authors > 0:
            print(f"📖 Average Books per Author: {books_with_author/unique_authors:.1f}")

    if 'publication_year' in df.columns:
        years = pd.to_numeric(df['publication_year'], errors='coerce').dropna()
        if len(years) > 0:
            has_year_data = True
            print(f"📅 Publication Years: {years.min():.0f} - {years.max():.0f}")
            recent_books = (years >= 2000).sum()
            print(f"🆕 Books from 2000+: {recent_books:,} ({recent_books/len(years)*100:.1f}%)")

    if 'genre' in df.columns:
        genres_with_data = df['genre'].notna().sum()
        print(f"🏷️  Books with Genre Info: {genres_with_data:,} ({genres_with_data/len(df)*100:.1f}%)")

    if 'description' in df.columns:
        desc_with_data = df['description'].notna().sum()
        print(f"📝 Books with Descriptions: {desc_with_data:,} ({desc_with_data/len(df)*100:.1f}%)")

        if desc_with_data > 0:
            has_text_data = True
            avg_desc_length = df['description'].dropna().astype(str).str.len().mean()
            print(f"📏 Average Description Length: {avg_desc_length:.0f} characters")

    if 'rating' in df.columns:
        ratings = pd.to_numeric(df['rating'], errors='coerce').dropna()
        if len(ratings) > 0:
            print(f"⭐ Average Rating: {ratings.mean():.2f}/5.0")
            print(f"🎯 Books with Ratings: {len(ratings):,} ({len(ratings)/len(df)*100:.1f}%)")

    print("\n" + "=" * 60)
    print("RECOMMENDATIONS FOR MODEL TRAINING:")
    print("=" * 60)

    if 'description' in df.columns:
        no_desc = df['description'].isna().sum()
        if no_desc > 0:
            print(f"⚠️  {no_desc:,} books lack descriptions - consider data augmentation")

    if 'genre' in df.columns:
        no_genre = df['genre'].isna().sum()
        if no_genre > 0:
            print(f"⚠️  {no_genre:,} books lack genre information - impacts tag prediction")

    # NOTE(review): this first line is a fixed message, not the result of a check.
    print("✅ Dataset appears suitable for semantic model training")
    # The next two are printed only when the corresponding data exists.
    if has_year_data:
        print("✅ Good coverage of books across different time periods")
    if has_text_data:
        print("✅ Sufficient text data for embedding generation")

else:
    print("⚠️  No data available for analysis. Please run data collection first.")

## Next Steps

Based on this exploration:

1. **Data Quality**: Address missing descriptions and genre information
2. **Text Preprocessing**: Clean and standardize text fields
3. **Feature Engineering**: Create combined text features for training
4. **Tag Vocabulary**: Build comprehensive tag vocabulary from genres
5. **Model Training**: Use insights to configure model hyperparameters

The dataset shows good potential for training a semantic book recommendation model!