# Notebook 1: Data Collection

## Purpose
This notebook handles the collection of raw movie data from multiple sources including:
- **TMDB API**: Movie metadata (budget, cast, crew, genres, runtime, release dates)
- **Box Office Mojo**: Box office revenue data (opening weekend, total domestic, worldwide)
- **OMDb API**: Supplemental metadata and IMDb ratings
- **YouTube Data API**: Trailer view counts and engagement metrics

## Objectives
1. Set up API connections and test endpoints
2. Write data collection functions with error handling and rate limiting
3. Collect data for 3,000+ movies from 2010-2024
4. Merge data sources on IMDb ID
5. Save raw datasets to CSV files in `data/raw/` directory
6. Perform initial data inspection

## Outputs
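- `data/raw/movies_tmdb_raw.csv`
- `data/raw/revenue_boxofficemojo_raw.csv`
- `data/raw/trailers_youtube_raw.csv`
- `data/raw/bom_checkpoint.csv` (resumable checkpoint written during Box Office Mojo scraping)
- `data/raw/movies_merged.csv` (TMDB + Box Office Mojo merged dataset)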

## Notes
- This notebook may take several hours to run due to API rate limits
- Once data is collected, subsequent runs should load from saved CSV files
- API keys should be stored in a `.env` file (not committed to git)

---
## Setup and Imports

In [None]:
# Cell 2
# Import libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import os
from dotenv import load_dotenv
from datetime import datetime
import json

# Load environment variables (API keys) from a local .env file, if present
load_dotenv()

# API Keys - read from the environment so they never appear in the notebook
TMDB_API_KEY = os.getenv('TMDB_API_KEY')
OMDB_API_KEY = os.getenv('OMDB_API_KEY')
YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY')

# Verify API keys are loaded (prints only a check mark, never the key itself)
print("API Keys loaded:")
print(f"  TMDB: {'✓' if TMDB_API_KEY else '✗'}")
print(f"  OMDb: {'✓' if OMDB_API_KEY else '✗'}")
print(f"  YouTube: {'✓' if YOUTUBE_API_KEY else '✗'}")

# Test TMDB API connection by fetching a single known movie (ID 550)
print("\nTesting TMDB API connection...")
test_url = "https://api.themoviedb.org/3/movie/550"
try:
    # The key goes in `params` so it is URL-encoded, and the timeout stops a
    # stalled connection from hanging the notebook indefinitely.
    response = requests.get(test_url, params={'api_key': TMDB_API_KEY}, timeout=10)
    if response.status_code == 200:
        print("✓ TMDB API connection successful!")
        print(f"  Test movie: {response.json()['title']}")
    else:
        print(f"✗ TMDB API error: {response.status_code}")
except Exception as e:
    # Top-level smoke test: report any failure instead of aborting the cell
    print(f"✗ Connection error: {e}")

---
## Data Collection Functions

In [None]:
# Cell 4
# TMDB API base URL; the request helpers in this cell append endpoint paths
# such as "/discover/movie" and "/movie/{id}" to it.
TMDB_BASE_URL = "https://api.themoviedb.org/3"

# Rate limiter class to handle TMDB's 40 requests per 10 seconds limit
class RateLimiter:
    """
    Sliding-window rate limiter: allows at most `max_calls` calls within any
    `time_period` seconds, sleeping when the window is full.

    Attributes:
        max_calls: Maximum number of calls allowed per window
        time_period: Window length in seconds
        calls: Timestamps (time.time()) of the calls still inside the window,
            oldest first
    """

    def __init__(self, max_calls=40, time_period=10):
        self.max_calls = max_calls
        self.time_period = time_period
        self.calls = []

    def wait_if_needed(self):
        """
        Block until another call is allowed, then record the call.

        Only timestamps that have actually left the window are discarded.
        Clearing the whole history after a sleep would forget recent calls and
        let the next burst exceed the limit.
        """
        while True:
            now = time.time()
            # Remove calls older than time_period
            self.calls = [call_time for call_time in self.calls if now - call_time < self.time_period]

            if len(self.calls) < self.max_calls:
                break

            # Sleep until the oldest recorded call leaves the window (+0.1s margin)
            sleep_time = self.time_period - (now - self.calls[0]) + 0.1
            print(f"  Rate limit reached, waiting {sleep_time:.1f} seconds...")
            time.sleep(sleep_time)

        self.calls.append(time.time())

# Shared limiter instance: the TMDB request helpers in this cell call
# rate_limiter.wait_if_needed() before every request.
rate_limiter = RateLimiter(max_calls=35, time_period=10)  # Using 35 to be safe

def get_popular_movies_by_year(year, pages=5):
    """
    Get popular movies for a specific year using TMDB discover endpoint.

    Pages are requested in order of descending popularity. Fetching stops
    early once the last page reported by the API (`total_pages`) has been
    read, so no requests are spent on pages that do not exist. IDs are
    de-duplicated because popularity-sorted results can shift between page
    requests and repeat a movie on two pages.

    Args:
        year: Release year (e.g., 2020)
        pages: Maximum number of pages to fetch (20 movies per page)

    Returns:
        List of unique movie IDs, in the order they were first seen.
        Pages that fail (non-200 status or request error) are skipped.
    """
    movie_ids = []
    seen_ids = set()
    url = f"{TMDB_BASE_URL}/discover/movie"

    for page in range(1, pages + 1):
        rate_limiter.wait_if_needed()

        params = {
            'api_key': TMDB_API_KEY,
            'language': 'en-US',
            'sort_by': 'popularity.desc',
            'primary_release_year': year,
            'page': page,
            'vote_count.gte': 50  # Minimum votes to ensure it's not obscure
        }

        try:
            response = requests.get(url, params=params, timeout=10)
            if response.status_code != 200:
                print(f"  Error fetching page {page} for year {year}: {response.status_code}")
                continue
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # Network failure or undecodable body: log, pause briefly, move on
            print(f"  Exception for year {year}, page {page}: {e}")
            time.sleep(2)
            continue

        for movie in data.get('results', []):
            movie_id = movie.get('id')
            if movie_id is not None and movie_id not in seen_ids:
                seen_ids.add(movie_id)
                movie_ids.append(movie_id)

        # Stop once the last available page has been consumed
        total_pages = data.get('total_pages')
        if isinstance(total_pages, int) and page >= total_pages:
            break

    return movie_ids

def get_movie_details(movie_id):
    """
    Fetch the full TMDB record for one movie, including its credits,
    release dates and videos (requested via `append_to_response`).

    Args:
        movie_id: TMDB movie ID

    Returns:
        Parsed JSON dictionary on HTTP 200; None on any other status code
        or if the request raises (the problem is printed in both cases).
    """
    rate_limiter.wait_if_needed()

    endpoint = f"{TMDB_BASE_URL}/movie/{movie_id}"
    query = {
        'api_key': TMDB_API_KEY,
        'append_to_response': 'credits,release_dates,videos'
    }

    try:
        reply = requests.get(endpoint, params=query, timeout=10)
        if reply.status_code != 200:
            print(f"  Error fetching movie {movie_id}: {reply.status_code}")
            return None
        return reply.json()
    except Exception as exc:
        print(f"  Exception for movie {movie_id}: {exc}")
        return None

def _pipe_join(values):
    """Join values with '|' for flat CSV storage; return None when there are none."""
    parts = [str(value) for value in values]
    return '|'.join(parts) if parts else None


def extract_movie_data(movie_details):
    """
    Flatten a TMDB movie-details response into one record of scalar fields.

    The `credits`, `release_dates` and `videos` sections are optional; when a
    section (or a key inside it) is missing, the derived fields are None
    instead of raising.

    Args:
        movie_details: Raw JSON response from TMDB (dict), or None

    Returns:
        Dictionary with extracted fields, or None if movie_details is empty.
        Multi-valued fields (genres, cast, production companies, production
        countries) are '|'-separated strings, or None when there are no values.
    """
    if not movie_details:
        return None

    # US release: first theatrical entry (type 2 or 3) of the first US block
    us_release_date = None
    us_certification = None
    release_groups = (movie_details.get('release_dates') or {}).get('results') or []
    for country_release in release_groups:
        if country_release.get('iso_3166_1') != 'US':
            continue
        for release in country_release.get('release_dates') or []:
            if release.get('type') in (2, 3):  # Theatrical release
                us_release_date = release.get('release_date')
                us_certification = release.get('certification')
                break
        break

    credit_data = movie_details.get('credits') or {}

    # Cast: the first 5 listed actors. Entries lacking an id or a name are
    # dropped so the cast_ids and cast_names columns stay aligned.
    cast = [
        actor for actor in (credit_data.get('cast') or [])[:5]
        if actor.get('id') is not None and actor.get('name')
    ]

    # Director: first crew member whose job is exactly 'Director'
    director = next(
        (member for member in credit_data.get('crew') or []
         if member.get('job') == 'Director'),
        None,
    )

    # Trailer: key of the first video that is a Trailer hosted on YouTube
    videos = (movie_details.get('videos') or {}).get('results') or []
    trailer_key = next(
        (video.get('key') for video in videos
         if video.get('type') == 'Trailer' and video.get('site') == 'YouTube'),
        None,
    )

    genres = [
        genre['name'] for genre in movie_details.get('genres') or []
        if genre.get('name')
    ]
    production_companies = [
        company['name'] for company in movie_details.get('production_companies') or []
        if company.get('name')
    ]
    production_countries = [
        country['iso_3166_1'] for country in movie_details.get('production_countries') or []
        if country.get('iso_3166_1')
    ]

    return {
        'tmdb_id': movie_details.get('id'),
        'imdb_id': movie_details.get('imdb_id'),
        'title': movie_details.get('title'),
        'original_title': movie_details.get('original_title'),
        'release_date': movie_details.get('release_date'),
        'us_release_date': us_release_date,
        'us_certification': us_certification,
        'budget': movie_details.get('budget'),
        'revenue': movie_details.get('revenue'),  # Note: TMDB revenue often incomplete
        'runtime': movie_details.get('runtime'),
        'genres': _pipe_join(genres),
        'primary_genre': genres[0] if genres else None,
        'num_genres': len(genres),
        'popularity': movie_details.get('popularity'),
        'vote_average': movie_details.get('vote_average'),
        'vote_count': movie_details.get('vote_count'),
        'director_id': director.get('id') if director else None,
        'director_name': director.get('name') if director else None,
        # None (not '') when there is no cast, so notna() reflects real coverage
        'cast_ids': _pipe_join(actor['id'] for actor in cast),
        'cast_names': _pipe_join(actor['name'] for actor in cast),
        'production_companies': _pipe_join(production_companies),
        'num_production_companies': len(production_companies),
        'original_language': movie_details.get('original_language'),
        'production_countries': _pipe_join(production_countries),
        'youtube_trailer_key': trailer_key,
        'tagline': movie_details.get('tagline'),
        'overview': movie_details.get('overview')
    }

def collect_movies_for_year_range(start_year, end_year, pages_per_year=5):
    """
    Gather TMDB records for every year from start_year through end_year.

    For each year the popular-movie IDs are discovered first, then each ID is
    fetched and flattened. Movies whose details or extraction come back empty
    are skipped. Progress is printed along the way.

    Args:
        start_year: Starting year (inclusive)
        end_year: Ending year (inclusive)
        pages_per_year: Number of pages to fetch per year

    Returns:
        DataFrame with one row per successfully collected movie
    """
    records = []
    running_total = 0

    for year in range(start_year, end_year + 1):
        print(f"\n=== Collecting movies for {year} ===")

        # Discover which movies to fetch for this year
        ids_for_year = get_popular_movies_by_year(year, pages=pages_per_year)
        id_count = len(ids_for_year)
        print(f"  Found {id_count} movie IDs for {year}")

        # Fetch and flatten each movie, skipping any that fail
        rows_before = len(records)
        for position, tmdb_id in enumerate(ids_for_year, start=1):
            if position % 20 == 0:
                print(f"  Progress: {position}/{id_count} movies processed for {year}")

            details = get_movie_details(tmdb_id)
            if not details:
                continue
            row = extract_movie_data(details)
            if not row:
                continue
            records.append(row)

        collected_this_year = len(records) - rows_before
        print(f"  Collected {collected_this_year} movies for {year}")
        running_total += collected_this_year
        print(f"  Total movies collected so far: {running_total}")

    return pd.DataFrame(records)

# Confirmation printed once every definition in this cell has executed
print("Data collection functions loaded successfully!")

---
## Collect Data

In [None]:
# Cell 6
# Run the TMDB collection for 2010-2024 and report how long it took.
# Expect a long run: every request goes through the shared rate limiter.

# Collection parameters
START_YEAR, END_YEAR = 2010, 2024
PAGES_PER_YEAR = 17  # 17 pages x 20 movies = ~340 movies per year x 15 years = ~5,100 movies

movies_per_year = PAGES_PER_YEAR * 20
year_count = END_YEAR - START_YEAR + 1
banner = '=' * 60

print(f"Starting data collection for {START_YEAR}-{END_YEAR}")
print(f"Fetching {PAGES_PER_YEAR} pages per year (~{movies_per_year} movies/year)")
print(f"Estimated total movies: ~{year_count * movies_per_year}")
print("This will take approximately 2-3 hours due to API rate limiting.\n")

# Time the whole collection
start_time = time.time()
df_tmdb = collect_movies_for_year_range(START_YEAR, END_YEAR, pages_per_year=PAGES_PER_YEAR)
end_time = time.time()
elapsed_minutes = (end_time - start_time) / 60

print(f"\n{banner}")
print("Data collection complete!")
print(f"Total movies collected: {len(df_tmdb)}")
print(f"Time elapsed: {elapsed_minutes:.1f} minutes")
print(banner)

---
## Save Raw Data

In [None]:
# Cell 8
# Write the raw TMDB table to disk, creating the target folder on first use
output_file = 'data/raw/movies_tmdb_raw.csv'
os.makedirs(os.path.dirname(output_file), exist_ok=True)
df_tmdb.to_csv(output_file, index=False)

# Summarise what was written
size_kb = os.path.getsize(output_file) / 1024
row_count, column_count = df_tmdb.shape
print(f"Data saved to {output_file}")
print(f"File size: {size_kb:.1f} KB")
print(f"Total rows: {row_count}")
print(f"Total columns: {column_count}")

---
## Initial Data Inspection

In [None]:
# Cell 10
# Basic data inspection: shape, schema, missing values, summary statistics
# and coverage of the key fields needed downstream.
print("="*60)
print("DATASET OVERVIEW")
print("="*60)

print(f"\nShape: {df_tmdb.shape}")
print(f"  Rows (movies): {df_tmdb.shape[0]}")
print(f"  Columns (features): {df_tmdb.shape[1]}")

print("\n" + "="*60)
print("COLUMN NAMES")
print("="*60)
print(df_tmdb.columns.tolist())

print("\n" + "="*60)
print("DATA TYPES")
print("="*60)
print(df_tmdb.dtypes)

print("\n" + "="*60)
print("MISSING VALUES")
print("="*60)
missing = df_tmdb.isnull().sum()
missing_pct = (missing / len(df_tmdb) * 100).round(1)
missing_df = pd.DataFrame({
    'Missing': missing,
    'Percentage': missing_pct
})
print(missing_df[missing_df['Missing'] > 0].sort_values('Missing', ascending=False))

print("\n" + "="*60)
print("FIRST 5 ROWS")
print("="*60)
print(df_tmdb.head())

print("\n" + "="*60)
print("BASIC STATISTICS (Numeric Columns)")
print("="*60)
print(df_tmdb.describe())

print("\n" + "="*60)
print("KEY METRICS")
print("="*60)
# Each entry is (label, boolean mask of rows that count as "has data").
# Budget/revenue of 0 is reported separately from "present" because the
# non-zero check is what identifies usable values.
key_metrics = [
    ("Movies with budget data", df_tmdb['budget'].notna()),
    ("Movies with non-zero budget", df_tmdb['budget'] > 0),
    ("Movies with revenue data", df_tmdb['revenue'].notna()),
    ("Movies with non-zero revenue", df_tmdb['revenue'] > 0),
    ("Movies with IMDb ID", df_tmdb['imdb_id'].notna()),
    ("Movies with director", df_tmdb['director_name'].notna()),
    # An empty string means "no cast", so it must not count as data
    ("Movies with cast data", df_tmdb['cast_names'].notna() & (df_tmdb['cast_names'] != '')),
    ("Movies with YouTube trailer", df_tmdb['youtube_trailer_key'].notna()),
]
total_rows = len(df_tmdb)
for label, mask in key_metrics:
    count = int(mask.sum())
    share = count / total_rows * 100 if total_rows else 0.0
    print(f"{label}: {count} ({share:.1f}%)")

print("\n" + "="*60)
print("SAMPLE MOVIES")
print("="*60)
# Cap the sample size so small datasets do not raise ValueError
sample_columns = ['title', 'release_date', 'budget', 'revenue', 'primary_genre', 'director_name']
print(df_tmdb[sample_columns].sample(min(10, total_rows)))

In [None]:
# Cell 11
# Preview the first rows of the TMDB table (bare expression as the cell's last line)
df_tmdb.head()

---
## Box Office Mojo Revenue Scraping

### Purpose
Scrape box office revenue data from Box Office Mojo to fill gaps in TMDB data. Currently only 37% of movies have both budget and revenue, limiting our dataset size. Box Office Mojo provides comprehensive revenue data including opening weekend, domestic total, international, and worldwide gross.

### Strategy
- Target all 5,094 movies with IMDb IDs
- Use respectful rate limiting (1.5s delays)
- Implement checkpointing for resumability
- Extract: opening weekend, domestic total, international total, worldwide total, budget

In [None]:
# Cell 13
# Reuse df_tmdb when this session already has it; otherwise reload the saved CSV
if 'df_tmdb' in locals():
    print(f"df_tmdb already in memory with {len(df_tmdb)} movies")
else:
    print("Loading existing TMDB data from CSV...")
    df_tmdb = pd.read_csv('data/raw/movies_tmdb_raw.csv')
    print(f"Loaded {len(df_tmdb)} movies")

In [None]:
# Cell 14
def parse_bom_revenue(soup, imdb_id):
    """
    Extract revenue from Box Office Mojo HTML using span.money tags.

    BOM structure: Revenue values are in <span class="money"> tags.
    Each money span is matched to a field by looking for a label keyword
    ('worldwide', 'domestic', 'international', 'opening', 'budget') in the
    text of its nearest enclosing <div class="a-section">.

    Money spans whose text is not a plain dollar amount (e.g. a placeholder
    or another currency) are skipped rather than failing the whole page.

    Args:
        soup: BeautifulSoup object of page HTML
        imdb_id: IMDb ID for result dictionary

    Returns:
        Dictionary with revenue fields (integer dollars), with None for any
        field that was not found. `scrape_success` is always True here: the
        page was fetched and parsed, even if no values were present.
    """
    result = {
        'imdb_id': imdb_id,
        'domestic_total': None,
        'opening_weekend': None,
        'international_total': None,
        'worldwide_total': None,
        'bom_budget': None,
        'scrape_success': True,
        'error_message': None
    }

    # For each money span, check surrounding context for labels
    for money_span in soup.find_all('span', class_='money'):
        # Get the dollar amount
        amount_text = money_span.get_text(strip=True)
        if not amount_text or amount_text == '–':
            continue

        try:
            amount = int(amount_text.replace('$', '').replace(',', ''))
        except ValueError:
            # Not a plain "$1,234" value; ignore this span and keep going
            continue

        # Get parent div to find associated label
        parent = money_span.find_parent('div', class_='a-section')
        if not parent:
            continue

        # Get the text of the parent div to find label
        parent_text = parent.get_text().lower()

        # Match to appropriate field based on label in parent.
        # The "is None" checks keep only the first occurrence of each field.
        if 'worldwide' in parent_text and result['worldwide_total'] is None:
            result['worldwide_total'] = amount
        elif ('domestic' in parent_text
              and 'international' not in parent_text
              # A "Domestic Opening" label is the opening weekend, not the
              # domestic total; without this guard it would be stored as
              # domestic_total whenever the real total is missing.
              and 'opening' not in parent_text
              and result['domestic_total'] is None):
            result['domestic_total'] = amount
        elif 'international' in parent_text and result['international_total'] is None:
            result['international_total'] = amount
        elif 'opening' in parent_text and result['opening_weekend'] is None:
            result['opening_weekend'] = amount
        elif 'budget' in parent_text and result['bom_budget'] is None:
            result['bom_budget'] = amount

    return result

# Confirmation printed once the parser definition in this cell has executed
print("HTML parsing function loaded!")

In [None]:
# Cell 15
def clean_currency(currency_str):
    """Turn a formatted dollar string such as '$1,234,567' into an integer."""
    digits_only = currency_str.translate(str.maketrans('', '', '$,'))
    return int(digits_only)

def error_result(imdb_id, error_type):
    """Build the record for a failed scrape: all revenue fields None, success False."""
    revenue_fields = (
        'domestic_total',
        'opening_weekend',
        'international_total',
        'worldwide_total',
        'bom_budget',
    )
    record = {'imdb_id': imdb_id}
    record.update(dict.fromkeys(revenue_fields))
    record['scrape_success'] = False
    record['error_message'] = error_type
    return record

class BOMRateLimiter:
    """Enforce a minimum gap of `delay` seconds between consecutive calls to wait()."""

    def __init__(self, delay=1.5):
        self.delay = delay
        self.last_call = 0

    def wait(self):
        """Sleep for whatever part of the delay has not yet elapsed, then stamp the call."""
        remaining = self.delay - (time.time() - self.last_call)
        if remaining > 0:
            time.sleep(remaining)
        self.last_call = time.time()

# Confirmation printed once the helper definitions in this cell have executed
print("Helper functions loaded!")

In [None]:
# Cell 16
def scrape_bom_movie(imdb_id, max_retries=3):
    """
    Scrape revenue data for a single movie from Box Office Mojo.

    Handles various error conditions:
    - 404: Movie not found in BOM (no retry)
    - 429: Rate limited (exponential backoff, only while attempts remain)
    - 5xx: Server errors (retry after 5s while attempts remain)
    - Timeout: Network timeout (retry after 2s while attempts remain)
    - Other status codes / exceptions: returned as an error result, no retry

    Args:
        imdb_id: IMDb ID (e.g., 'tt1375666')
        max_retries: Maximum number of request attempts

    Returns:
        Dictionary with revenue data on success, otherwise an error_result
        dictionary whose `error_message` is one of 'not_found',
        'server_error_<code>', 'http_<code>', 'timeout',
        'exception_<text>' or 'max_retries'.
    """
    url = f"https://www.boxofficemojo.com/title/{imdb_id}"
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; MovieDataCollector/1.0)'}

    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        try:
            response = requests.get(url, headers=headers, timeout=10)
            status = response.status_code

            if status == 200:
                # Success - parse the HTML
                soup = BeautifulSoup(response.content, 'html.parser')
                return parse_bom_revenue(soup, imdb_id)

            if status == 404:
                # Movie not in Box Office Mojo database
                return error_result(imdb_id, 'not_found')

            if status == 429:
                # Rate limited. Backing off is only useful if another attempt
                # follows; on the last attempt give up immediately instead of
                # sleeping and then failing anyway.
                if is_last_attempt:
                    break
                wait = 30 * (2 ** attempt)  # 30s, 60s, ...
                print(f"  Rate limited for {imdb_id}, waiting {wait}s...")
                time.sleep(wait)
                continue

            if status >= 500:
                # Server error - retry if attempts remain
                if is_last_attempt:
                    return error_result(imdb_id, f'server_error_{status}')
                print(f"  Server error {status} for {imdb_id}, retrying...")
                time.sleep(5)
                continue

            # Unexpected status code
            return error_result(imdb_id, f'http_{status}')

        except requests.Timeout:
            # Network timeout - retry if attempts remain
            if is_last_attempt:
                return error_result(imdb_id, 'timeout')
            print(f"  Timeout for {imdb_id}, retrying...")
            time.sleep(2)

        except Exception as e:
            # Catch-all so one bad page never aborts a multi-hour batch
            error_msg = str(e)[:50]  # Truncate long error messages
            return error_result(imdb_id, f'exception_{error_msg}')

    # Max retries exhausted
    return error_result(imdb_id, 'max_retries')

# Confirmation printed once the scraper definition in this cell has executed
print("Main scraping function loaded!")

In [None]:
# Cell 17
def scrape_bom_batch(imdb_ids, checkpoint_file='data/raw/bom_checkpoint.csv', save_every=100):
    """
    Scrape multiple movies with automatic checkpointing for resumability.

    If interrupted, the function can resume from the last checkpoint by
    simply running again - it will load completed IMDb IDs and skip them.
    The checkpoint is also written when the loop is interrupted (for example
    by KeyboardInterrupt), so at most the in-flight movie is lost.

    Args:
        imdb_ids: List of IMDb IDs to scrape (duplicates are scraped once)
        checkpoint_file: Path to save progress (CSV format)
        save_every: Save checkpoint every N movies

    Returns:
        DataFrame with all scraped results (checkpointed rows plus new ones).
        When there is nothing to return it is an empty DataFrame that still
        has the result columns.
    """
    result_columns = [
        'imdb_id', 'domestic_total', 'opening_weekend', 'international_total',
        'worldwide_total', 'bom_budget', 'scrape_success', 'error_message',
    ]
    progress_window = 50  # size of the "recent success rate" window

    # Load existing checkpoint if available
    if os.path.exists(checkpoint_file):
        df_checkpoint = pd.read_csv(checkpoint_file)
        completed = set(df_checkpoint['imdb_id'].dropna())
        results = df_checkpoint.to_dict('records')
        print(f"Resuming from checkpoint: {len(completed)} already scraped")
    else:
        completed = set()
        results = []
        print("Starting fresh scrape (no checkpoint found)")

    def _save_checkpoint():
        """Write all results so far; skipped while empty to avoid an unreadable CSV."""
        if not results:
            return
        folder = os.path.dirname(checkpoint_file)
        if folder:
            os.makedirs(folder, exist_ok=True)
        pd.DataFrame(results).to_csv(checkpoint_file, index=False)

    # Filter to unscraped movies; dict.fromkeys drops duplicates, keeps order
    remaining = [imdb_id for imdb_id in dict.fromkeys(imdb_ids) if imdb_id not in completed]
    print(f"Scraping {len(remaining)} movies...")
    print(f"Estimated time: {len(remaining) * 2 / 3600:.1f} hours\n")

    if remaining:
        limiter = BOMRateLimiter(delay=1.5)

        try:
            for i, imdb_id in enumerate(remaining, 1):
                limiter.wait()  # Respect rate limit before each request

                results.append(scrape_bom_movie(imdb_id))

                # Progress report every 50 movies, over the last 50 results only
                if i % progress_window == 0:
                    recent = results[-progress_window:]
                    success = sum(1 for r in recent if r['scrape_success'])
                    print(f"  Progress: {i}/{len(remaining)} | Recent success rate: {success}/{progress_window} ({success/progress_window*100:.1f}%)")

                # Save checkpoint every N movies
                if save_every > 0 and i % save_every == 0:
                    _save_checkpoint()
                    print(f"  Checkpoint saved ({len(results)} total movies)")
        finally:
            # Final save; also runs on interruption so progress is not lost
            _save_checkpoint()

    df = pd.DataFrame(results) if results else pd.DataFrame(columns=result_columns)

    # Calculate final statistics
    total_success = df['scrape_success'].sum()
    success_pct = total_success / len(df) * 100 if len(df) else 0.0
    print(f"\n{'='*60}")
    print("Scraping complete!")
    print(f"  Total movies: {len(df)}")
    print(f"  Successful: {total_success} ({success_pct:.1f}%)")
    print(f"  Failed: {len(df) - total_success}")
    print(f"{'='*60}")

    return df

# Confirmation printed once the batch function in this cell has been defined
print("Batch processing function loaded!")

---
## Test Scraping on Sample Movies

Before running the full 3-4 hour scraping job, test on a few movies to verify the scraping logic works correctly.

In [None]:
# Cell 19 - TEST SCRAPING ON 5 MOVIES
# Smoke-test the scraper on the IMDb IDs found in the first 5 TMDB rows
print("Testing Box Office Mojo scraper on sample movies...\n")

test_ids = df_tmdb['imdb_id'].head(5).dropna().tolist()

# (label shown in the output, key in the scrape result)
revenue_lines = (
    ('Worldwide', 'worldwide_total'),
    ('Domestic', 'domestic_total'),
    ('Opening', 'opening_weekend'),
)

for test_id in test_ids:
    result = scrape_bom_movie(test_id)

    # Look up the title so the output is readable
    title = df_tmdb.loc[df_tmdb['imdb_id'] == test_id, 'title'].values[0]

    print(f"{test_id} ({title}):")
    print(f"  Success: {result['scrape_success']}")
    for label, field in revenue_lines:
        amount = result[field]
        if amount:
            print(f"  {label}: ${amount:,}")
        else:
            print(f"  {label}: None")
    if result['error_message']:
        print(f"  Error: {result['error_message']}")
    print()

    time.sleep(1.5)  # Rate limit during test

print("Test complete! If results look good, proceed to full scraping.")

---
## Run Full Box Office Mojo Scraping

**Note:** This will take approximately 3-4 hours. The scraper uses checkpointing, so it can be safely interrupted and resumed. Consider running overnight or during a long break.

In [41]:
# Cell 21
# Scrape Box Office Mojo for every movie that has an IMDb ID
imdb_ids = df_tmdb['imdb_id'].dropna().tolist()
id_total = len(imdb_ids)
timestamp_format = '%Y-%m-%d %H:%M:%S'

print("Starting Box Office Mojo scraping...")
print(f"Total movies to scrape: {id_total}")
print(f"Estimated time: {id_total * 2 / 3600:.1f} hours")
print(f"Started at: {datetime.now().strftime(timestamp_format)}\n")

# Batch scraper resumes from the checkpoint file if one exists
scrape_start = time.time()
df_bom = scrape_bom_batch(
    imdb_ids,
    checkpoint_file='data/raw/bom_checkpoint.csv',
    save_every=100
)
scrape_end = time.time()
elapsed_seconds = scrape_end - scrape_start

print(f"\nFinished at: {datetime.now().strftime(timestamp_format)}")
print(f"Total time: {elapsed_seconds / 60:.1f} minutes ({elapsed_seconds / 3600:.2f} hours)")

Starting Box Office Mojo scraping...
Total movies to scrape: 5094
Estimated time: 2.8 hours
Started at: 2026-01-21 14:03:55

Resuming from checkpoint: 3400 already scraped
Scraping 1694 movies...
Estimated time: 0.9 hours



KeyboardInterrupt: 

---
## Save Raw Box Office Mojo Data

In [None]:
# Cell 23
# Persist the scraped Box Office Mojo table as a raw CSV
output_file = 'data/raw/revenue_boxofficemojo_raw.csv'
df_bom.to_csv(output_file, index=False)

# Summarise what was written
size_kb = os.path.getsize(output_file) / 1024
row_count, column_count = df_bom.shape
print(f"Box Office Mojo data saved to: {output_file}")
print(f"File size: {size_kb:.1f} KB")
print(f"Total rows: {row_count}")
print(f"Total columns: {column_count}")

# Quick look at the data
print("\nFirst 5 rows:")
print(df_bom.head())

---
## Merge TMDB and Box Office Mojo Data

Combine the two data sources, preferring BOM revenue (more complete) over TMDB revenue.

In [None]:
# Cell 25
# Merge TMDB and BOM data on IMDb ID
bom_columns = ['imdb_id', 'domestic_total', 'opening_weekend', 'worldwide_total',
               'international_total', 'bom_budget', 'scrape_success']

# Keep exactly one BOM row per IMDb ID. If an ID was scraped more than once,
# duplicate keys would multiply the matching TMDB rows in the left merge.
# Rows without an ID are dropped so they cannot match TMDB rows that also
# lack one.
df_bom_unique = (
    df_bom[bom_columns]
    .dropna(subset=['imdb_id'])
    .drop_duplicates(subset='imdb_id', keep='last')
)

df_merged = df_tmdb.merge(
    df_bom_unique,
    on='imdb_id',
    how='left',
    validate='many_to_one'  # fail loudly if BOM keys are ever non-unique
)

# Create final revenue column - prefer BOM worldwide, fallback to TMDB revenue
df_merged['revenue_final'] = df_merged['worldwide_total'].fillna(df_merged['revenue'])

# Create final budget column - prefer BOM budget, fallback to TMDB budget
df_merged['budget_final'] = df_merged['bom_budget'].fillna(df_merged['budget'])

# Track revenue source for transparency (a TMDB value of 0 counts as missing)
has_tmdb_revenue = df_merged['revenue'] > 0
has_bom_revenue = df_merged['worldwide_total'].notna()
df_merged['revenue_source'] = 'none'
df_merged.loc[has_tmdb_revenue, 'revenue_source'] = 'tmdb'
df_merged.loc[has_bom_revenue, 'revenue_source'] = 'bom'
df_merged.loc[has_tmdb_revenue & has_bom_revenue, 'revenue_source'] = 'both'

# Track budget source for transparency (a TMDB value of 0 counts as missing)
has_tmdb_budget = df_merged['budget'] > 0
has_bom_budget = df_merged['bom_budget'].notna()
df_merged['budget_source'] = 'none'
df_merged.loc[has_tmdb_budget, 'budget_source'] = 'tmdb'
df_merged.loc[has_bom_budget, 'budget_source'] = 'bom'
df_merged.loc[has_tmdb_budget & has_bom_budget, 'budget_source'] = 'both'

print("="*60)
print("MERGE COMPLETE")
print("="*60)
print(f"\nMerged dataset shape: {df_merged.shape}")
print(f"  Rows: {len(df_merged)}")
print(f"  Columns: {len(df_merged.columns)}")

print("\nRevenue source breakdown:")
print(df_merged['revenue_source'].value_counts())

print("\nBudget source breakdown:")
print(df_merged['budget_source'].value_counts())

print("\nNew columns added:")
print("  - domestic_total (from BOM)")
print("  - opening_weekend (from BOM)")
print("  - worldwide_total (from BOM)")
print("  - international_total (from BOM)")
print("  - bom_budget (from BOM)")
print("  - revenue_final (combined best source)")
print("  - budget_final (combined best source)")
print("  - revenue_source (tracking field)")
print("  - budget_source (tracking field)")

---
## Quality Analysis

Analyze scraping results, gap filling, and dataset completeness.

In [None]:
# Cell 27
# Quality report for the Box Office Mojo scrape and the merged dataset.
# Reads df_bom, df_tmdb and df_merged; it only prints and does not modify them.
print("="*60)
print("BOX OFFICE MOJO SCRAPING RESULTS")
print("="*60)

# Scraping success rate (sum of the boolean scrape_success flags)
total_scraped = len(df_bom)
successful = df_bom['scrape_success'].sum()
print(f"\nScraping Success Rate:")
print(f"  Total attempted: {total_scraped}")
print(f"  Successful: {successful} ({successful/total_scraped*100:.1f}%)")
print(f"  Failed: {total_scraped - successful}")

# Most common errors (top 5 error_message values among failed rows)
if (total_scraped - successful) > 0:
    print("\nMost common errors:")
    error_counts = df_bom[~df_bom['scrape_success']]['error_message'].value_counts().head(5)
    for error, count in error_counts.items():
        print(f"  {error}: {count}")

# Revenue coverage comparison
print(f"\n{'='*60}")
print("REVENUE COVERAGE")
print("="*60)
print(f"\nBefore BOM scraping:")
print(f"  TMDB revenue > 0: {(df_tmdb['revenue'] > 0).sum()} ({(df_tmdb['revenue'] > 0).sum()/len(df_tmdb)*100:.1f}%)")

print(f"\nAfter BOM scraping:")
print(f"  BOM revenue available: {df_merged['worldwide_total'].notna().sum()} ({df_merged['worldwide_total'].notna().sum()/len(df_merged)*100:.1f}%)")
print(f"  Final revenue > 0: {(df_merged['revenue_final'] > 0).sum()} ({(df_merged['revenue_final'] > 0).sum()/len(df_merged)*100:.1f}%)")

# Gap filling analysis for revenue: a "gap" is a TMDB revenue of 0 or NaN,
# and it counts as filled when BOM has a worldwide total for that row
tmdb_missing = (df_merged['revenue'] == 0) | (df_merged['revenue'].isna())
bom_filled = df_merged['worldwide_total'].notna()
gaps_filled = (tmdb_missing & bom_filled).sum()

print(f"\nRevenue Gap Filling:")
print(f"  TMDB revenue gaps: {tmdb_missing.sum()}")
print(f"  Gaps filled by BOM: {gaps_filled}")
print(f"  Gap fill rate: {gaps_filled/tmdb_missing.sum()*100:.1f}%")

# Budget coverage comparison
print(f"\n{'='*60}")
print("BUDGET COVERAGE")
print("="*60)
print(f"\nBefore BOM scraping:")
print(f"  TMDB budget > 0: {(df_tmdb['budget'] > 0).sum()} ({(df_tmdb['budget'] > 0).sum()/len(df_tmdb)*100:.1f}%)")

print(f"\nAfter BOM scraping:")
print(f"  BOM budget available: {df_merged['bom_budget'].notna().sum()} ({df_merged['bom_budget'].notna().sum()/len(df_merged)*100:.1f}%)")
print(f"  Final budget > 0: {(df_merged['budget_final'] > 0).sum()} ({(df_merged['budget_final'] > 0).sum()/len(df_merged)*100:.1f}%)")

# Gap filling analysis for budget (same definition of "gap" as for revenue)
budget_tmdb_missing = (df_merged['budget'] == 0) | (df_merged['budget'].isna())
budget_bom_filled = df_merged['bom_budget'].notna()
budget_gaps_filled = (budget_tmdb_missing & budget_bom_filled).sum()

print(f"\nBudget Gap Filling:")
print(f"  TMDB budget gaps: {budget_tmdb_missing.sum()}")
print(f"  Gaps filled by BOM: {budget_gaps_filled}")
print(f"  Gap fill rate: {budget_gaps_filled/budget_tmdb_missing.sum()*100:.1f}%")

# Revenue comparison for movies with both sources
print(f"\n{'='*60}")
print("REVENUE COMPARISON (Movies with Both Sources)")
print("="*60)

both = (df_merged['revenue'] > 0) & (df_merged['worldwide_total'].notna())
if both.sum() > 0:
    # Work on a copy so the helper columns are not added to df_merged
    df_compare = df_merged[both].copy()
    df_compare['diff'] = abs(df_compare['revenue'] - df_compare['worldwide_total'])
    # Percentage is relative to the TMDB figure
    df_compare['diff_pct'] = df_compare['diff'] / df_compare['revenue'] * 100

    print(f"\nCount: {len(df_compare)}")
    print(f"Mean absolute difference: ${df_compare['diff'].mean():,.0f}")
    print(f"Median absolute difference: ${df_compare['diff'].median():,.0f}")
    print(f"Mean % difference: {df_compare['diff_pct'].mean():.1f}%")
    print(f"Median % difference: {df_compare['diff_pct'].median():.1f}%")
    print(f"\nMovies with >20% difference: {(df_compare['diff_pct'] > 20).sum()} ({(df_compare['diff_pct'] > 20).sum()/len(df_compare)*100:.1f}%)")
    
    # Show a few examples of large discrepancies (the 3 largest by diff_pct)
    if (df_compare['diff_pct'] > 20).sum() > 0:
        print("\nExample large discrepancies:")
        large_diff = df_compare.nlargest(3, 'diff_pct')[['title', 'revenue', 'worldwide_total', 'diff_pct']]
        for idx, row in large_diff.iterrows():
            print(f"  {row['title']}: TMDB=${row['revenue']:,} vs BOM=${row['worldwide_total']:,} ({row['diff_pct']:.1f}% diff)")

# Dataset readiness check: a movie is "complete" when both budget and revenue are > 0
print(f"\n{'='*60}")
print("DATASET READINESS")
print("="*60)

# Before BOM
complete_before = (df_tmdb['budget'] > 0) & (df_tmdb['revenue'] > 0)
print(f"\nBefore BOM scraping:")
print(f"  Movies with budget > 0: {(df_tmdb['budget'] > 0).sum()}")
print(f"  Movies with revenue > 0: {(df_tmdb['revenue'] > 0).sum()}")
print(f"  Movies with BOTH budget & revenue: {complete_before.sum()} ({complete_before.sum()/len(df_tmdb)*100:.1f}%)")

# After BOM
complete_after = (df_merged['budget_final'] > 0) & (df_merged['revenue_final'] > 0)
print(f"\nAfter BOM scraping:")
print(f"  Movies with budget_final > 0: {(df_merged['budget_final'] > 0).sum()}")
print(f"  Movies with revenue_final > 0: {(df_merged['revenue_final'] > 0).sum()}")
print(f"  Movies with BOTH budget & revenue: {complete_after.sum()} ({complete_after.sum()/len(df_merged)*100:.1f}%)")

# NOTE(review): the rate below divides by the "before" count; if no movie was
# complete before BOM the result is not a meaningful percentage - confirm
# whether that case needs handling.
improvement = complete_after.sum() - complete_before.sum()
print(f"\nImprovement:")
print(f"  Additional complete movies: +{improvement}")
print(f"  Improvement rate: +{improvement/complete_before.sum()*100:.1f}%")

# Target assessment: TARGET is the lower bound; the upper bound (3000) is
# written directly into the first comparison
TARGET = 2500
print(f"\nTarget Assessment (2,500-3,000 movies needed):")
if complete_after.sum() >= 3000:
    print(f"  ✅ EXCEEDS upper target! ({complete_after.sum():,} complete movies)")
elif complete_after.sum() >= TARGET:
    print(f"  ✅ MEETS lower target! ({complete_after.sum():,} complete movies)")
else:
    shortfall = TARGET - complete_after.sum()
    print(f"  ⚠️  Short by {shortfall:,} movies (have {complete_after.sum():,}, need {TARGET:,})")
    print(f"  Completion: {complete_after.sum()/TARGET*100:.1f}%")

---
## Save Merged Dataset

Save the combined TMDB + Box Office Mojo dataset for use in subsequent notebooks.

In [None]:
# Cell 29
# Persist the merged TMDB + Box Office Mojo table for the next notebook
merged_output = 'data/raw/movies_merged.csv'
df_merged.to_csv(merged_output, index=False)

merged_columns = list(df_merged.columns)
size_kb = os.path.getsize(merged_output) / 1024

print(f"Merged dataset saved to: {merged_output}")
print(f"File size: {size_kb:.1f} KB")
print(f"Total rows: {len(df_merged)}")
print(f"Total columns: {len(merged_columns)}")

# Numbered list of every column in the saved file
print("\nColumn list:")
for position, column_name in enumerate(merged_columns, start=1):
    print(f"  {position}. {column_name}")

print("\n✅ Data collection complete!")
print("Next step: Proceed to 02_data_cleaning_eda.ipynb")