# 1. Data Collection

In [15]:
import requests
import pandas as pd
from datetime import datetime

def fetch_books(subject='fiction', max_results=40, start_index=0):
    url = f"https://www.googleapis.com/books/v1/volumes"
    params = {
        'q': f"subject:{subject}",
        'maxResults': max_results,
        'startIndex': start_index,
    }

    response = requests.get(url, params=params)
    response.raise_for_status()
    data = response.json()
    return data.get('items', [])

def parse_book(item):
    volume = item.get('volumeInfo', {})
    
    return {
        'book_id': item.get('id'),
        'title': volume.get('title'),
        'authors': ', '.join(volume.get('authors', [])),
        'publication_date': volume.get('publishedDate'),
        'page_count': volume.get('pageCount'),
        'genre_name': ', '.join(volume.get('categories', [])) if volume.get('categories') else 'Unknown',
        'genre_id': None,  # will be filled later
        'categories': ', '.join(volume.get('categories', [])) if volume.get('categories') else '',
        'average_rating': volume.get('averageRating'),
        'ratings_count': volume.get('ratingsCount'),
        'description': volume.get('description'),
        'language': volume.get('language'),
        'publisher': volume.get('publisher'),
        'isbn_13': None,
        'isbn_10': None,
        'thumbnail': volume.get('imageLinks', {}).get('thumbnail'),
        'info_link': volume.get('infoLink'),
        'collected_date': datetime.now().isoformat()
    }

def extract_isbns(industry_identifiers):
    isbn_10, isbn_13 = None, None
    for iden in industry_identifiers:
        if iden['type'] == 'ISBN_10':
            isbn_10 = iden['identifier']
        elif iden['type'] == 'ISBN_13':
            isbn_13 = iden['identifier']
    return isbn_10, isbn_13

def enrich_books_with_isbn(book_list, items):
    for book, item in zip(book_list, items):
        industry_ids = item.get('volumeInfo', {}).get('industryIdentifiers', [])
        isbn_10, isbn_13 = extract_isbns(industry_ids)
        book['isbn_10'] = isbn_10
        book['isbn_13'] = isbn_13
    return book_list

# 🧪 Example: Fetch 80 books from 'science' and 'fiction'
subjects = ['science', 'fiction','romance','mystery']
all_books = []

for subject in subjects:
    for start in range(0, 80, 40):  # Google Books API allows max 40 results per call
        items = fetch_books(subject, max_results=40, start_index=start)
        parsed_books = [parse_book(item) for item in items]
        parsed_books = enrich_books_with_isbn(parsed_books, items)
        all_books.extend(parsed_books)

# 📁 Convert to DataFrame and save
df_books = pd.DataFrame(all_books)
df_books.to_csv('../data/raw/books_raw_data.csv', index=False)
print(f"✅ Saved {len(df_books)} books to '../data/raw/books_raw_data.csv'")


✅ Saved 320 books to '../data/raw/books_raw_data.csv'


In [16]:

import pandas as pd
import numpy as np
import os
from datetime import datetime

# Create data directories if they don't exist

print("📚 Starting book data collection for ME204 Final Project")
print("=" * 60)

def load_existing_data():
    """
    Load existing book data from CSV file
    
    Returns:
        DataFrame with book data or None if file doesn't exist
    """
    csv_path = '../data/raw/books_raw_data.csv'
    
    if os.path.exists(csv_path):
        print(f"📁 Found existing data file: {csv_path}")
        try:
            df = pd.read_csv(csv_path)
            print(f"✅ Successfully loaded {len(df)} books from existing file")
            return df
        except Exception as e:
            print(f"❌ Error loading existing file: {e}")
            return None
    else:
        print(f"📁 No existing data file found at: {csv_path}")
        return None

def analyze_existing_data(df):
    """
    Analyze the structure and content of existing data
    
    Args:
        df: DataFrame with book data
    """
    print(f"\n📊 Data Analysis:")
    print(f"   Total books: {len(df)}")
    print(f"   Columns: {list(df.columns)}")
    
    # Check for genre distribution
    if 'genre_name' in df.columns:
        print(f"\n📚 Books by Genre:")
        genre_counts = df['genre_name'].value_counts()
        for genre, count in genre_counts.items():
            print(f"   {genre}: {count} books")
    
    # Check data quality
    print(f"\n🔍 Data Quality Check:")
    for col in df.columns:
        non_null_count = df[col].notna().sum()
        percentage = (non_null_count / len(df)) * 100
        print(f"   {col}: {non_null_count}/{len(df)} ({percentage:.1f}%) non-null")
    
    # Show sample of data
    print(f"\n📖 Sample Books:")
    sample_cols = ['title', 'authors', 'genre_name', 'average_rating', 'ratings_count']
    available_cols = [col for col in sample_cols if col in df.columns]
    
    if available_cols:
        sample = df[available_cols].head(10)
        for idx, row in sample.iterrows():
            title = row.get('title', 'Unknown')[:50]
            authors = row.get('authors', 'Unknown')[:30]
            genre = row.get('genre_name', 'Unknown')
            rating = row.get('average_rating', 'N/A')
            count = row.get('ratings_count', 'N/A')
            print(f"   {title} by {authors} ({genre}) - {rating}/5 ({count} ratings)")

def select_top_books_by_genre(df, books_per_genre=50):
    """
    Select top books from each genre based on ratings_count
    
    Args:
        df: DataFrame with all book data
        books_per_genre: Number of books to select per genre
    
    Returns:
        DataFrame with selected top books
    """
    if 'genre_name' not in df.columns:
        print("❌ No genre_name column found. Cannot filter by genre.")
        return df
    
    if 'ratings_count' not in df.columns:
        print("❌ No ratings_count column found. Using original order.")
        return df
    
    print(f"\n🏆 Selecting top {books_per_genre} books per genre...")
    
    # Convert ratings_count to numeric, handling any non-numeric values
    df['ratings_count'] = pd.to_numeric(df['ratings_count'], errors='coerce').fillna(0)
    
    selected_books = []
    
    for genre in df['genre_name'].unique():
        genre_books = df[df['genre_name'] == genre].copy()
        
        # Sort by ratings_count (descending) to get most popular books
        genre_books = genre_books.sort_values('ratings_count', ascending=False)
        
        # Take top N books for this genre
        top_books = genre_books.head(books_per_genre)
        selected_books.append(top_books)
        
        print(f"   {genre}: Selected {len(top_books)} books (max ratings: {top_books['ratings_count'].max():.0f})")
    
    result_df = pd.concat(selected_books, ignore_index=True)
    print(f"\n✅ Total selected books: {len(result_df)}")
    
    return result_df

def create_genre_mapping(df):
    """
    Create a clean genre mapping table
    
    Args:
        df: DataFrame with book data
    
    Returns:
        DataFrame with genre mapping
    """
    if 'genre_name' not in df.columns:
        return pd.DataFrame()
    
    # Create genre mapping
    genres = df['genre_name'].unique()
    genre_mapping = pd.DataFrame({
        'genre_id': range(1, len(genres) + 1),
        'genre_name': genres,
        'description': [f'Books in the {genre} category' for genre in genres]
    })
    
    return genre_mapping

def clean_and_validate_data(df):
    """
    Clean and validate the book data
    
    Args:
        df: Raw DataFrame
    
    Returns:
        Cleaned DataFrame
    """
    print(f"\n🧹 Cleaning data...")
    initial_count = len(df)
    
    # Remove rows with missing essential data
    essential_cols = ['book_id', 'title']
    available_essential = [col for col in essential_cols if col in df.columns]
    
    if available_essential:
        df = df.dropna(subset=available_essential)
        print(f"   Removed {initial_count - len(df)} books with missing essential data")
    
    # Clean text fields
    text_cols = ['title', 'authors', 'description', 'publisher']
    for col in text_cols:
        if col in df.columns:
            df[col] = df[col].astype(str).replace('nan', '').replace('None', '')
    
    # Ensure numeric fields are properly typed
    numeric_cols = ['average_rating', 'ratings_count', 'page_count', 'genre_id']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # Add collection timestamp if not present
    if 'collected_date' not in df.columns:
        df['collected_date'] = datetime.now().isoformat()
    
    print(f"   Final dataset: {len(df)} books")
    return df

def save_processed_data(df, genre_mapping=None):
    """
    Save the processed data to files
    
    Args:
        df: Processed book DataFrame
        genre_mapping: Genre mapping DataFrame
    """
    print(f"\n💾 Saving processed data...")
    
    # Save main book data
    df.to_csv('../data/raw/books_processed.csv', index=False)
    print(f"   ✅ Saved books data: ../data/raw/books_processed.csv")
    
    # Save genre mapping if available
    if genre_mapping is not None and len(genre_mapping) > 0:
        genre_mapping.to_csv('../data/raw/genres.csv', index=False)
        print(f"   ✅ Saved genre mapping: ../data/raw/genres.csv")
    
    # Create metadata
    metadata = {
        'processing_date': datetime.now().isoformat(),
        'total_books': len(df),
        'genres': df['genre_name'].unique().tolist() if 'genre_name' in df.columns else [],
        'books_by_genre': df['genre_name'].value_counts().to_dict() if 'genre_name' in df.columns else {},
        'columns': list(df.columns)
    }
    
    import json
    with open('../data/raw/processing_metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)
    print(f"   ✅ Saved metadata: ../data/raw/processing_metadata.json")

# Main execution
if __name__ == "__main__":
    # Try to load existing data
    df = load_existing_data()
    
    if df is not None:
        # Analyze existing data
        analyze_existing_data(df)
        
        # Clean and validate
        df_clean = clean_and_validate_data(df)
        
        # Select top books by genre (25 per genre = 100 total)
        df_selected = select_top_books_by_genre(df_clean, books_per_genre=25)
        
        # Create genre mapping
        genre_mapping = create_genre_mapping(df_selected)
        
        # Save processed data
        save_processed_data(df_selected, genre_mapping)
        
        # Final summary
        print(f"\n📈 Final Summary:")
        print(f"   Total books selected: {len(df_selected)}")
        if 'genre_name' in df_selected.columns:
            print(f"   Books by genre:")
            for genre, count in df_selected['genre_name'].value_counts().items():
                print(f"     {genre}: {count}")
        
        if 'ratings_count' in df_selected.columns:
            print(f"   Rating statistics:")
            print(f"     Average ratings count: {df_selected['ratings_count'].mean():.0f}")
            print(f"     Median ratings count: {df_selected['ratings_count'].median():.0f}")
            print(f"     Max ratings count: {df_selected['ratings_count'].max():.0f}")
        
        print(f"\n🏆 Top 5 Most Popular Books:")
        if 'ratings_count' in df_selected.columns:
            top_5 = df_selected.nlargest(5, 'ratings_count')
            for idx, row in top_5.iterrows():
                title = row.get('title', 'Unknown')[:40]
                authors = row.get('authors', 'Unknown')[:25]
                genre = row.get('genre_name', 'Unknown')
                ratings = row.get('ratings_count', 0)
                avg_rating = row.get('average_rating', 'N/A')
                print(f"   {title} by {authors} ({genre}) - {ratings:.0f} ratings, {avg_rating}/5")
        
    else:
        print("❌ No existing data found. Please ensure you have book data in '../data/raw/books_raw_data.csv'")
        print("📝 The CSV should have these columns:")
        expected_columns = [
            'book_id', 'title', 'authors', 'publication_date', 'page_count', 
            'genre_name', 'genre_id', 'categories', 'average_rating', 'ratings_count',
            'description', 'language', 'publisher', 'isbn_13', 'isbn_10', 
            'thumbnail', 'info_link', 'collected_date'
        ]
        for col in expected_columns:
            print(f"   - {col}")

print("\n" + "="*60)
print("📝 Next Steps:")
print("1. Review the processed data in ../data/raw/books_processed.csv")
print("2. Check ../data/raw/genres.csv for genre mapping")
print("3. Move to NB02-data-processing.ipynb to create your database")
print("="*60)

📚 Starting book data collection for ME204 Final Project
📁 Found existing data file: ../data/raw/books_raw_data.csv
✅ Successfully loaded 320 books from existing file

📊 Data Analysis:
   Total books: 320
   Columns: ['book_id', 'title', 'authors', 'publication_date', 'page_count', 'genre_name', 'genre_id', 'categories', 'average_rating', 'ratings_count', 'description', 'language', 'publisher', 'isbn_13', 'isbn_10', 'thumbnail', 'info_link', 'collected_date']

📚 Books by Genre:
   Fiction: 164 books
   Science: 31 books
   Unknown: 28 books
   Nature: 11 books
   Detective and mystery stories: 7 books
   Biography & Autobiography: 7 books
   Technology & Engineering: 6 books
   History: 5 books
   Medical: 4 books
   Juvenile Fiction: 4 books
   Children's stories: 3 books
   Books and reading: 3 books
   England: 3 books
   Computers: 2 books
   Social Science: 2 books
   Health & Fitness: 2 books
   Philosophy: 2 books
   Drama: 2 books
   Romance: 2 books
   Brunetti, Guido (Fictitio

TypeError: 'float' object is not subscriptable