In [None]:
# Save merged dataset
merged_output = 'data/raw/movies_merged.csv'
df_merged.to_csv(merged_output, index=False)

print(f"Merged dataset saved to: {merged_output}")
print(f"File size: {os.path.getsize(merged_output) / 1024:.1f} KB")
print(f"Total rows: {len(df_merged)}")
print(f"Total columns: {len(df_merged.columns)}")

print("\nColumn list:")
for i, col in enumerate(df_merged.columns, 1):
    print(f"  {i}. {col}")

print("\n✅ Data collection complete!")
print("Next step: Proceed to 02_data_cleaning_eda.ipynb")

---
## Save Merged Dataset

Save the combined TMDB + Box Office Mojo dataset for use in subsequent notebooks.

In [None]:
print("="*60)
print("BOX OFFICE MOJO SCRAPING RESULTS")
print("="*60)

# Scraping success rate
total_scraped = len(df_bom)
successful = df_bom['scrape_success'].sum()
print(f"\nScraping Success Rate:")
print(f"  Total attempted: {total_scraped}")
print(f"  Successful: {successful} ({successful/total_scraped*100:.1f}%)")
print(f"  Failed: {total_scraped - successful}")

# Most common errors
if (total_scraped - successful) > 0:
    print("\nMost common errors:")
    error_counts = df_bom[~df_bom['scrape_success']]['error_message'].value_counts().head(5)
    for error, count in error_counts.items():
        print(f"  {error}: {count}")

# Revenue coverage comparison
print(f"\n{'='*60}")
print("REVENUE COVERAGE")
print("="*60)
print(f"\nBefore BOM scraping:")
print(f"  TMDB revenue > 0: {(df_tmdb['revenue'] > 0).sum()} ({(df_tmdb['revenue'] > 0).sum()/len(df_tmdb)*100:.1f}%)")

print(f"\nAfter BOM scraping:")
print(f"  BOM revenue available: {df_merged['worldwide_total'].notna().sum()} ({df_merged['worldwide_total'].notna().sum()/len(df_merged)*100:.1f}%)")
print(f"  Final revenue > 0: {(df_merged['revenue_final'] > 0).sum()} ({(df_merged['revenue_final'] > 0).sum()/len(df_merged)*100:.1f}%)")

# Gap filling analysis
tmdb_missing = (df_merged['revenue'] == 0) | (df_merged['revenue'].isna())
bom_filled = df_merged['worldwide_total'].notna()
gaps_filled = (tmdb_missing & bom_filled).sum()

print(f"\nGap Filling:")
print(f"  TMDB revenue gaps: {tmdb_missing.sum()}")
print(f"  Gaps filled by BOM: {gaps_filled}")
print(f"  Gap fill rate: {gaps_filled/tmdb_missing.sum()*100:.1f}%")

# Revenue comparison for movies with both sources
print(f"\n{'='*60}")
print("REVENUE COMPARISON (Movies with Both Sources)")
print("="*60)

both = (df_merged['revenue'] > 0) & (df_merged['worldwide_total'].notna())
if both.sum() > 0:
    df_compare = df_merged[both].copy()
    df_compare['diff'] = abs(df_compare['revenue'] - df_compare['worldwide_total'])
    df_compare['diff_pct'] = df_compare['diff'] / df_compare['revenue'] * 100

    print(f"\nCount: {len(df_compare)}")
    print(f"Mean absolute difference: ${df_compare['diff'].mean():,.0f}")
    print(f"Median absolute difference: ${df_compare['diff'].median():,.0f}")
    print(f"Mean % difference: {df_compare['diff_pct'].mean():.1f}%")
    print(f"Median % difference: {df_compare['diff_pct'].median():.1f}%")
    print(f"\nMovies with >20% difference: {(df_compare['diff_pct'] > 20).sum()} ({(df_compare['diff_pct'] > 20).sum()/len(df_compare)*100:.1f}%)")
    
    # Show a few examples of large discrepancies
    if (df_compare['diff_pct'] > 20).sum() > 0:
        print("\nExample large discrepancies:")
        large_diff = df_compare.nlargest(3, 'diff_pct')[['title', 'revenue', 'worldwide_total', 'diff_pct']]
        for idx, row in large_diff.iterrows():
            print(f"  {row['title']}: TMDB=${row['revenue']:,} vs BOM=${row['worldwide_total']:,} ({row['diff_pct']:.1f}% diff)")

# Dataset readiness check
print(f"\n{'='*60}")
print("DATASET READINESS")
print("="*60)

# Before BOM
complete_before = (df_tmdb['budget'] > 0) & (df_tmdb['revenue'] > 0)
print(f"\nBefore BOM scraping:")
print(f"  Movies with budget > 0: {(df_tmdb['budget'] > 0).sum()}")
print(f"  Movies with revenue > 0: {(df_tmdb['revenue'] > 0).sum()}")
print(f"  Movies with BOTH budget & revenue: {complete_before.sum()} ({complete_before.sum()/len(df_tmdb)*100:.1f}%)")

# After BOM
complete_after = (df_merged['budget'] > 0) & (df_merged['revenue_final'] > 0)
print(f"\nAfter BOM scraping:")
print(f"  Movies with budget > 0: {(df_merged['budget'] > 0).sum()}")
print(f"  Movies with revenue_final > 0: {(df_merged['revenue_final'] > 0).sum()}")
print(f"  Movies with BOTH budget & revenue: {complete_after.sum()} ({complete_after.sum()/len(df_merged)*100:.1f}%)")

improvement = complete_after.sum() - complete_before.sum()
print(f"\nImprovement:")
print(f"  Additional complete movies: +{improvement}")
print(f"  Improvement rate: +{improvement/complete_before.sum()*100:.1f}%")

# Target assessment
TARGET = 5000
print(f"\nTarget Assessment:")
if complete_after.sum() >= TARGET:
    print(f"  ✅ EXCEEDS {TARGET:,} movie target! ({complete_after.sum():,} complete movies)")
else:
    shortfall = TARGET - complete_after.sum()
    print(f"  ⚠️  Short by {shortfall:,} movies (have {complete_after.sum():,}, need {TARGET:,})")
    print(f"  Completion: {complete_after.sum()/TARGET*100:.1f}%")

---
## Quality Analysis

Analyze scraping results, gap filling, and dataset completeness.

In [None]:
# Merge TMDB and BOM data on IMDb ID
df_merged = df_tmdb.merge(
    df_bom[['imdb_id', 'domestic_total', 'opening_weekend', 'worldwide_total', 'international_total', 'bom_budget', 'scrape_success']],
    on='imdb_id',
    how='left'
)

# Create final revenue column - prefer BOM worldwide, fallback to TMDB revenue
df_merged['revenue_final'] = df_merged['worldwide_total'].fillna(df_merged['revenue'])

# Track revenue source for transparency
df_merged['revenue_source'] = 'none'
df_merged.loc[df_merged['revenue'] > 0, 'revenue_source'] = 'tmdb'
df_merged.loc[df_merged['worldwide_total'].notna(), 'revenue_source'] = 'bom'
df_merged.loc[(df_merged['revenue'] > 0) & (df_merged['worldwide_total'].notna()), 'revenue_source'] = 'both'

print("="*60)
print("MERGE COMPLETE")
print("="*60)
print(f"\nMerged dataset shape: {df_merged.shape}")
print(f"  Rows: {len(df_merged)}")
print(f"  Columns: {len(df_merged.columns)}")

print("\nRevenue source breakdown:")
print(df_merged['revenue_source'].value_counts())

print("\nNew columns added:")
print("  - domestic_total (from BOM)")
print("  - opening_weekend (from BOM)")
print("  - worldwide_total (from BOM)")
print("  - international_total (from BOM)")
print("  - bom_budget (from BOM)")
print("  - revenue_final (combined best source)")
print("  - revenue_source (tracking field)")

---
## Merge TMDB and Box Office Mojo Data

Combine the two data sources, preferring BOM revenue (more complete) over TMDB revenue.

In [None]:
# Save Box Office Mojo data as raw CSV
output_file = 'data/raw/revenue_boxofficemojo_raw.csv'
df_bom.to_csv(output_file, index=False)

print(f"Box Office Mojo data saved to: {output_file}")
print(f"File size: {os.path.getsize(output_file) / 1024:.1f} KB")
print(f"Total rows: {len(df_bom)}")
print(f"Total columns: {len(df_bom.columns)}")

# Display first few rows
print("\nFirst 5 rows:")
print(df_bom.head())

---
## Save Raw Box Office Mojo Data

In [None]:
# Get all IMDb IDs from TMDB data
imdb_ids = df_tmdb['imdb_id'].dropna().tolist()

print(f"Starting Box Office Mojo scraping...")
print(f"Total movies to scrape: {len(imdb_ids)}")
print(f"Estimated time: {len(imdb_ids) * 2 / 3600:.1f} hours")
print(f"Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

# Run batch scraper with checkpointing
scrape_start = time.time()
df_bom = scrape_bom_batch(
    imdb_ids,
    checkpoint_file='data/raw/bom_checkpoint.csv',
    save_every=100
)
scrape_end = time.time()

print(f"\nFinished at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Total time: {(scrape_end - scrape_start) / 60:.1f} minutes ({(scrape_end - scrape_start) / 3600:.2f} hours)")

---
## Run Full Box Office Mojo Scraping

**Note:** This will take approximately 3-4 hours. The scraper uses checkpointing, so it can be safely interrupted and resumed. Consider running overnight or during a long break.

In [None]:
# Test scraper on 5 sample movies
print("Testing Box Office Mojo scraper on sample movies...\n")

test_ids = df_tmdb.head(5)['imdb_id'].dropna().tolist()

for test_id in test_ids:
    result = scrape_bom_movie(test_id)
    
    # Get movie title for context
    title = df_tmdb[df_tmdb['imdb_id'] == test_id]['title'].values[0]
    
    print(f"{test_id} ({title}):")
    print(f"  Success: {result['scrape_success']}")
    print(f"  Worldwide: ${result['worldwide_total']:,}" if result['worldwide_total'] else f"  Worldwide: None")
    print(f"  Domestic: ${result['domestic_total']:,}" if result['domestic_total'] else f"  Domestic: None")
    print(f"  Opening: ${result['opening_weekend']:,}" if result['opening_weekend'] else f"  Opening: None")
    if result['error_message']:
        print(f"  Error: {result['error_message']}")
    print()
    
    time.sleep(1.5)  # Rate limit during test

print("Test complete! If results look good, proceed to full scraping.")

---
## Test Scraping on Sample Movies

Before running the full 3-4 hour scraping job, test on a few movies to verify the scraping logic works correctly.

In [None]:
def scrape_bom_batch(imdb_ids, checkpoint_file='data/raw/bom_checkpoint.csv', save_every=100):
    """
    Scrape multiple movies with automatic checkpointing for resumability.
    
    If interrupted, the function can resume from the last checkpoint by
    simply running again - it will load completed IMDb IDs and skip them.
    
    Args:
        imdb_ids: List of IMDb IDs to scrape
        checkpoint_file: Path to save progress (CSV format)
        save_every: Save checkpoint every N movies
    
    Returns:
        DataFrame with all scraped results
    """
    # Load existing checkpoint if available
    if os.path.exists(checkpoint_file):
        df_checkpoint = pd.read_csv(checkpoint_file)
        completed = set(df_checkpoint['imdb_id'].dropna())
        results = df_checkpoint.to_dict('records')
        print(f"Resuming from checkpoint: {len(completed)} already scraped")
    else:
        completed = set()
        results = []
        print("Starting fresh scrape (no checkpoint found)")

    # Filter to unscraped movies
    remaining = [id for id in imdb_ids if id not in completed]
    print(f"Scraping {len(remaining)} movies...")
    print(f"Estimated time: {len(remaining) * 2 / 3600:.1f} hours\n")

    # Initialize rate limiter
    limiter = BOMRateLimiter(delay=1.5)

    # Scrape each movie
    for i, imdb_id in enumerate(remaining, 1):
        limiter.wait()  # Respect rate limit before each request

        result = scrape_bom_movie(imdb_id)
        results.append(result)

        # Progress report every 50 movies
        if i % 50 == 0:
            success = sum(1 for r in results[-i:] if r['scrape_success'])
            print(f"  Progress: {i}/{len(remaining)} | Recent success rate: {success}/{min(i, 50)} ({success/min(i, 50)*100:.1f}%)")

        # Save checkpoint every N movies
        if i % save_every == 0:
            pd.DataFrame(results).to_csv(checkpoint_file, index=False)
            print(f"  Checkpoint saved ({len(results)} total movies)")

    # Final save
    df = pd.DataFrame(results)
    df.to_csv(checkpoint_file, index=False)
    
    # Calculate final statistics
    total_success = df['scrape_success'].sum()
    print(f"\n{'='*60}")
    print(f"Scraping complete!")
    print(f"  Total movies: {len(df)}")
    print(f"  Successful: {total_success} ({total_success/len(df)*100:.1f}%)")
    print(f"  Failed: {len(df) - total_success}")
    print(f"{'='*60}")

    return df

print("Batch processing function loaded!")

In [None]:
def scrape_bom_movie(imdb_id, max_retries=3):
    """
    Scrape revenue data for a single movie from Box Office Mojo.
    
    Handles various error conditions:
    - 404: Movie not found in BOM
    - 429: Rate limited (exponential backoff)
    - 5xx: Server errors (retry with delays)
    - Timeout: Network timeout (retry once)
    - Other exceptions: Catch and log
    
    Args:
        imdb_id: IMDb ID (e.g., 'tt1375666')
        max_retries: Maximum retry attempts for recoverable errors
    
    Returns:
        Dictionary with revenue data or error information
    """
    url = f"https://www.boxofficemojo.com/title/{imdb_id}"
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; MovieDataCollector/1.0)'}

    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)

            if response.status_code == 404:
                # Movie not in Box Office Mojo database
                return error_result(imdb_id, 'not_found')

            elif response.status_code == 429:
                # Rate limited - wait with exponential backoff
                wait = 30 * (2 ** attempt)  # 30s, 60s, 120s
                print(f"  Rate limited for {imdb_id}, waiting {wait}s...")
                time.sleep(wait)
                continue

            elif response.status_code >= 500:
                # Server error - retry if attempts remain
                if attempt < max_retries - 1:
                    print(f"  Server error {response.status_code} for {imdb_id}, retrying...")
                    time.sleep(5)
                    continue
                return error_result(imdb_id, f'server_error_{response.status_code}')

            elif response.status_code == 200:
                # Success - parse the HTML
                soup = BeautifulSoup(response.content, 'html.parser')
                return parse_bom_revenue(soup, imdb_id)

            else:
                # Unexpected status code
                return error_result(imdb_id, f'http_{response.status_code}')

        except requests.Timeout:
            # Network timeout - retry if attempts remain
            if attempt < max_retries - 1:
                print(f"  Timeout for {imdb_id}, retrying...")
                time.sleep(2)
                continue
            return error_result(imdb_id, 'timeout')

        except Exception as e:
            # Catch-all for unexpected errors
            error_msg = str(e)[:50]  # Truncate long error messages
            return error_result(imdb_id, f'exception_{error_msg}')

    # Max retries exhausted
    return error_result(imdb_id, 'max_retries')

print("Main scraping function loaded!")

In [None]:
def parse_bom_revenue(soup, imdb_id):
    """
    Extract revenue data from Box Office Mojo HTML using regex patterns.
    
    BOM pages have varied layouts, so regex provides flexibility to find
    labeled dollar amounts regardless of exact HTML structure.
    
    Args:
        soup: BeautifulSoup object of page HTML
        imdb_id: IMDb ID for result dictionary
    
    Returns:
        Dictionary with revenue fields or None values if not found
    """
    import re

    result = {
        'imdb_id': imdb_id,
        'domestic_total': None,
        'opening_weekend': None,
        'international_total': None,
        'worldwide_total': None,
        'bom_budget': None,
        'scrape_success': True,
        'error_message': None
    }

    # Get all text content from page
    text = soup.get_text()

    # Regex pattern to match currency amounts
    currency_pattern = r'\$[\d,]+'

    # Search patterns: look for label followed by currency amount
    # Case-insensitive for fields that vary in capitalization
    patterns = {
        'opening_weekend': r'Opening.*?' + currency_pattern,
        'domestic_total': r'Domestic.*?' + currency_pattern,
        'international_total': r'International.*?' + currency_pattern,
        'worldwide_total': r'Worldwide.*?' + currency_pattern,
        'bom_budget': r'Budget.*?' + currency_pattern
    }

    for field, pattern in patterns.items():
        # Use IGNORECASE for opening and budget (capitalization varies)
        flags = re.IGNORECASE if field in ['opening_weekend', 'bom_budget'] else 0
        match = re.search(pattern, text, flags)
        
        if match:
            # Extract just the currency part from the matched text
            currency_match = re.search(currency_pattern, match.group())
            if currency_match:
                result[field] = clean_currency(currency_match.group())

    return result

print("HTML parsing function loaded!")

In [None]:
# Box Office Mojo Helper Functions

class BOMRateLimiter:
    """Rate limiter for Box Office Mojo scraping (1.5s between requests)"""
    def __init__(self, delay=1.5):
        self.delay = delay
        self.last_time = None

    def wait(self):
        """Wait if needed to respect rate limit"""
        if self.last_time:
            elapsed = time.time() - self.last_time
            if elapsed < self.delay:
                time.sleep(self.delay - elapsed)
        self.last_time = time.time()


def clean_currency(text):
    """
    Convert currency string to integer.
    
    Examples:
        '$123,456,789' -> 123456789
        '$1,000' -> 1000
        '–' or None -> None
    """
    if not text or text == '–' or text == '-':
        return None
    # Remove $ and commas, convert to int
    return int(text.replace('$', '').replace(',', ''))


def error_result(imdb_id, msg):
    """
    Create standardized error result dictionary.
    
    Args:
        imdb_id: IMDb ID that failed
        msg: Error message/code
    
    Returns:
        Dictionary with all fields set to None and error message
    """
    return {
        'imdb_id': imdb_id,
        'domestic_total': None,
        'opening_weekend': None,
        'international_total': None,
        'worldwide_total': None,
        'bom_budget': None,
        'scrape_success': False,
        'error_message': msg
    }

print("Box Office Mojo helper functions loaded!")

# Notebook 1: Data Collection

## Purpose
This notebook handles the collection of raw movie data from multiple sources including:
- **TMDB API**: Movie metadata (budget, cast, crew, genres, runtime, release dates)
- **Box Office Mojo**: Box office revenue data (opening weekend, total domestic, worldwide)
- **OMDb API**: Supplemental metadata and IMDb ratings
- **YouTube Data API**: Trailer view counts and engagement metrics

## Objectives
1. Set up API connections and test endpoints
2. Write data collection functions with error handling and rate limiting
3. Collect data for 3,000+ movies from 2010-2024
4. Merge data sources on IMDb ID
5. Save raw datasets to CSV files in `data/raw/` directory
6. Perform initial data inspection

## Outputs
- `data/raw/movies_tmdb_raw.csv`
- `data/raw/revenue_boxofficemojo_raw.csv`
- `data/raw/trailers_youtube_raw.csv`

## Notes
- This notebook may take several hours to run due to API rate limits
- Once data is collected, subsequent runs should load from saved CSV files
- API keys should be stored in a `.env` file (not committed to git)

---
## Setup and Imports

In [7]:
# Import libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import os
from dotenv import load_dotenv
from datetime import datetime
import json

# Load environment variables
load_dotenv()

# API Keys
TMDB_API_KEY = os.getenv('TMDB_API_KEY')
OMDB_API_KEY = os.getenv('OMDB_API_KEY')
YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY')

# Verify API keys are loaded
print("API Keys loaded:")
print(f"  TMDB: {'✓' if TMDB_API_KEY else '✗'}")
print(f"  OMDb: {'✓' if OMDB_API_KEY else '✗'}")
print(f"  YouTube: {'✓' if YOUTUBE_API_KEY else '✗'}")

# Test TMDB API connection
print("\nTesting TMDB API connection...")
test_url = f"https://api.themoviedb.org/3/movie/550?api_key={TMDB_API_KEY}"
try:
    response = requests.get(test_url)
    if response.status_code == 200:
        print("✓ TMDB API connection successful!")
        print(f"  Test movie: {response.json()['title']}")
    else:
        print(f"✗ TMDB API error: {response.status_code}")
except Exception as e:
    print(f"✗ Connection error: {e}")

API Keys loaded:
  TMDB: ✓
  OMDb: ✓
  YouTube: ✓

Testing TMDB API connection...
✓ TMDB API connection successful!
  Test movie: Fight Club


---
## Data Collection Functions

In [8]:
# TMDB API Base URL
TMDB_BASE_URL = "https://api.themoviedb.org/3"

# Rate limiter class to handle TMDB's 40 requests per 10 seconds limit
class RateLimiter:
    def __init__(self, max_calls=40, time_period=10):
        self.max_calls = max_calls
        self.time_period = time_period
        self.calls = []
    
    def wait_if_needed(self):
        now = time.time()
        # Remove calls older than time_period
        self.calls = [call_time for call_time in self.calls if now - call_time < self.time_period]
        
        if len(self.calls) >= self.max_calls:
            sleep_time = self.time_period - (now - self.calls[0]) + 0.1
            print(f"  Rate limit reached, waiting {sleep_time:.1f} seconds...")
            time.sleep(sleep_time)
            self.calls = []
        
        self.calls.append(time.time())

# Initialize rate limiter
rate_limiter = RateLimiter(max_calls=35, time_period=10)  # Using 35 to be safe

def get_popular_movies_by_year(year, pages=5):
    """
    Get popular movies for a specific year using TMDB discover endpoint.
    
    Args:
        year: Release year (e.g., 2020)
        pages: Number of pages to fetch (20 movies per page)
    
    Returns:
        List of movie IDs
    """
    movie_ids = []
    
    for page in range(1, pages + 1):
        rate_limiter.wait_if_needed()
        
        url = f"{TMDB_BASE_URL}/discover/movie"
        params = {
            'api_key': TMDB_API_KEY,
            'language': 'en-US',
            'sort_by': 'popularity.desc',
            'primary_release_year': year,
            'page': page,
            'vote_count.gte': 50  # Minimum votes to ensure it's not obscure
        }
        
        try:
            response = requests.get(url, params=params, timeout=10)
            if response.status_code == 200:
                data = response.json()
                movie_ids.extend([movie['id'] for movie in data['results']])
            else:
                print(f"  Error fetching page {page} for year {year}: {response.status_code}")
        except Exception as e:
            print(f"  Exception for year {year}, page {page}: {e}")
            time.sleep(2)
    
    return movie_ids

def get_movie_details(movie_id):
    """
    Get detailed information for a specific movie.
    
    Args:
        movie_id: TMDB movie ID
    
    Returns:
        Dictionary with movie details or None if error
    """
    rate_limiter.wait_if_needed()
    
    url = f"{TMDB_BASE_URL}/movie/{movie_id}"
    params = {
        'api_key': TMDB_API_KEY,
        'append_to_response': 'credits,release_dates,videos'
    }
    
    try:
        response = requests.get(url, params=params, timeout=10)
        if response.status_code == 200:
            return response.json()
        else:
            print(f"  Error fetching movie {movie_id}: {response.status_code}")
            return None
    except Exception as e:
        print(f"  Exception for movie {movie_id}: {e}")
        return None

def extract_movie_data(movie_details):
    """
    Extract relevant fields from TMDB movie details.
    
    Args:
        movie_details: Raw JSON response from TMDB
    
    Returns:
        Dictionary with extracted fields
    """
    if not movie_details:
        return None
    
    # Extract release dates to find US release
    us_release_date = None
    us_certification = None
    if 'release_dates' in movie_details and 'results' in movie_details['release_dates']:
        for country_release in movie_details['release_dates']['results']:
            if country_release['iso_3166_1'] == 'US':
                for release in country_release['release_dates']:
                    if release.get('type') in [2, 3]:  # Theatrical release
                        us_release_date = release.get('release_date')
                        us_certification = release.get('certification')
                        break
                break
    
    # Extract cast (top 5 actors)
    cast = []
    if 'credits' in movie_details and 'cast' in movie_details['credits']:
        cast = [
            {
                'id': actor['id'],
                'name': actor['name'],
                'order': actor['order']
            }
            for actor in movie_details['credits']['cast'][:5]
        ]
    
    # Extract director and crew
    director = None
    if 'credits' in movie_details and 'crew' in movie_details['credits']:
        for crew_member in movie_details['credits']['crew']:
            if crew_member['job'] == 'Director':
                director = {
                    'id': crew_member['id'],
                    'name': crew_member['name']
                }
                break
    
    # Extract YouTube trailer key
    trailer_key = None
    if 'videos' in movie_details and 'results' in movie_details['videos']:
        for video in movie_details['videos']['results']:
            if video['type'] == 'Trailer' and video['site'] == 'YouTube':
                trailer_key = video['key']
                break
    
    # Extract genres
    genres = [genre['name'] for genre in movie_details.get('genres', [])]
    
    # Extract production companies
    production_companies = [company['name'] for company in movie_details.get('production_companies', [])]
    
    return {
        'tmdb_id': movie_details.get('id'),
        'imdb_id': movie_details.get('imdb_id'),
        'title': movie_details.get('title'),
        'original_title': movie_details.get('original_title'),
        'release_date': movie_details.get('release_date'),
        'us_release_date': us_release_date,
        'us_certification': us_certification,
        'budget': movie_details.get('budget'),
        'revenue': movie_details.get('revenue'),  # Note: TMDB revenue often incomplete
        'runtime': movie_details.get('runtime'),
        'genres': '|'.join(genres) if genres else None,
        'primary_genre': genres[0] if genres else None,
        'num_genres': len(genres),
        'popularity': movie_details.get('popularity'),
        'vote_average': movie_details.get('vote_average'),
        'vote_count': movie_details.get('vote_count'),
        'director_id': director['id'] if director else None,
        'director_name': director['name'] if director else None,
        'cast_ids': '|'.join([str(actor['id']) for actor in cast]),
        'cast_names': '|'.join([actor['name'] for actor in cast]),
        'production_companies': '|'.join(production_companies) if production_companies else None,
        'num_production_companies': len(production_companies),
        'original_language': movie_details.get('original_language'),
        'production_countries': '|'.join([country['iso_3166_1'] for country in movie_details.get('production_countries', [])]),
        'youtube_trailer_key': trailer_key,
        'tagline': movie_details.get('tagline'),
        'overview': movie_details.get('overview')
    }

def collect_movies_for_year_range(start_year, end_year, pages_per_year=5):
    """
    Collect movie data for a range of years.
    
    Args:
        start_year: Starting year (inclusive)
        end_year: Ending year (inclusive)
        pages_per_year: Number of pages to fetch per year
    
    Returns:
        DataFrame with collected movie data
    """
    all_movies = []
    total_movies = 0
    
    for year in range(start_year, end_year + 1):
        print(f"\n=== Collecting movies for {year} ===")
        
        # Get movie IDs for this year
        movie_ids = get_popular_movies_by_year(year, pages=pages_per_year)
        print(f"  Found {len(movie_ids)} movie IDs for {year}")
        
        # Get details for each movie
        year_movies = 0
        for i, movie_id in enumerate(movie_ids, 1):
            if i % 20 == 0:
                print(f"  Progress: {i}/{len(movie_ids)} movies processed for {year}")
            
            movie_details = get_movie_details(movie_id)
            if movie_details:
                extracted_data = extract_movie_data(movie_details)
                if extracted_data:
                    all_movies.append(extracted_data)
                    year_movies += 1
        
        print(f"  Collected {year_movies} movies for {year}")
        total_movies += year_movies
        print(f"  Total movies collected so far: {total_movies}")
    
    df = pd.DataFrame(all_movies)
    return df

print("Data collection functions loaded successfully!")

Data collection functions loaded successfully!


---
## Collect Data

In [9]:
# Collect TMDB data for movies from 2010-2024
# This will take a while due to rate limiting (approximately 2-3 hours)

# Set parameters
START_YEAR = 2010
END_YEAR = 2024
PAGES_PER_YEAR = 17  # 17 pages x 20 movies = ~340 movies per year x 15 years = ~5,100 movies

print(f"Starting data collection for {START_YEAR}-{END_YEAR}")
print(f"Fetching {PAGES_PER_YEAR} pages per year (~{PAGES_PER_YEAR * 20} movies/year)")
print(f"Estimated total movies: ~{(END_YEAR - START_YEAR + 1) * PAGES_PER_YEAR * 20}")
print(f"This will take approximately 2-3 hours due to API rate limiting.\n")

# Collect the data
start_time = time.time()
df_tmdb = collect_movies_for_year_range(START_YEAR, END_YEAR, pages_per_year=PAGES_PER_YEAR)
end_time = time.time()

print(f"\n{'='*60}")
print(f"Data collection complete!")
print(f"Total movies collected: {len(df_tmdb)}")
print(f"Time elapsed: {(end_time - start_time) / 60:.1f} minutes")
print(f"{'='*60}")

Starting data collection for 2010-2024
Fetching 17 pages per year (~340 movies/year)
Estimated total movies: ~5100
This will take approximately 2-3 hours due to API rate limiting.


=== Collecting movies for 2010 ===
  Found 340 movie IDs for 2010
  Rate limit reached, waiting 4.7 seconds...
  Progress: 20/340 movies processed for 2010
  Progress: 40/340 movies processed for 2010
  Rate limit reached, waiting 4.6 seconds...
  Progress: 60/340 movies processed for 2010
  Progress: 80/340 movies processed for 2010
  Rate limit reached, waiting 5.1 seconds...
  Progress: 100/340 movies processed for 2010
  Progress: 120/340 movies processed for 2010
  Rate limit reached, waiting 5.0 seconds...
  Progress: 140/340 movies processed for 2010
  Rate limit reached, waiting 4.0 seconds...
  Progress: 160/340 movies processed for 2010
  Progress: 180/340 movies processed for 2010
  Rate limit reached, waiting 3.2 seconds...
  Progress: 200/340 movies processed for 2010
  Progress: 220/340 movies

---
## Save Raw Data

In [10]:
# Create data/raw directory if it doesn't exist
os.makedirs('data/raw', exist_ok=True)

# Save to CSV
output_file = 'data/raw/movies_tmdb_raw.csv'
df_tmdb.to_csv(output_file, index=False)

print(f"Data saved to {output_file}")
print(f"File size: {os.path.getsize(output_file) / 1024:.1f} KB")
print(f"Total rows: {len(df_tmdb)}")
print(f"Total columns: {len(df_tmdb.columns)}")

Data saved to data/raw/movies_tmdb_raw.csv
File size: 3223.3 KB
Total rows: 5100
Total columns: 27


---
## Initial Data Inspection

In [11]:
# Basic data inspection
print("="*60)
print("DATASET OVERVIEW")
print("="*60)

print(f"\nShape: {df_tmdb.shape}")
print(f"  Rows (movies): {df_tmdb.shape[0]}")
print(f"  Columns (features): {df_tmdb.shape[1]}")

print("\n" + "="*60)
print("COLUMN NAMES")
print("="*60)
print(df_tmdb.columns.tolist())

print("\n" + "="*60)
print("DATA TYPES")
print("="*60)
print(df_tmdb.dtypes)

print("\n" + "="*60)
print("MISSING VALUES")
print("="*60)
missing = df_tmdb.isnull().sum()
missing_pct = (missing / len(df_tmdb) * 100).round(1)
missing_df = pd.DataFrame({
    'Missing': missing,
    'Percentage': missing_pct
})
print(missing_df[missing_df['Missing'] > 0].sort_values('Missing', ascending=False))

print("\n" + "="*60)
print("FIRST 5 ROWS")
print("="*60)
print(df_tmdb.head())

print("\n" + "="*60)
print("BASIC STATISTICS (Numeric Columns)")
print("="*60)
print(df_tmdb.describe())

print("\n" + "="*60)
print("KEY METRICS")
print("="*60)
print(f"Movies with budget data: {df_tmdb['budget'].notna().sum()} ({df_tmdb['budget'].notna().sum() / len(df_tmdb) * 100:.1f}%)")
print(f"Movies with non-zero budget: {(df_tmdb['budget'] > 0).sum()} ({(df_tmdb['budget'] > 0).sum() / len(df_tmdb) * 100:.1f}%)")
print(f"Movies with revenue data: {df_tmdb['revenue'].notna().sum()} ({df_tmdb['revenue'].notna().sum() / len(df_tmdb) * 100:.1f}%)")
print(f"Movies with non-zero revenue: {(df_tmdb['revenue'] > 0).sum()} ({(df_tmdb['revenue'] > 0).sum() / len(df_tmdb) * 100:.1f}%)")
print(f"Movies with IMDb ID: {df_tmdb['imdb_id'].notna().sum()} ({df_tmdb['imdb_id'].notna().sum() / len(df_tmdb) * 100:.1f}%)")
print(f"Movies with director: {df_tmdb['director_name'].notna().sum()} ({df_tmdb['director_name'].notna().sum() / len(df_tmdb) * 100:.1f}%)")
print(f"Movies with cast data: {df_tmdb['cast_names'].notna().sum()} ({df_tmdb['cast_names'].notna().sum() / len(df_tmdb) * 100:.1f}%)")
print(f"Movies with YouTube trailer: {df_tmdb['youtube_trailer_key'].notna().sum()} ({df_tmdb['youtube_trailer_key'].notna().sum() / len(df_tmdb) * 100:.1f}%)")

print("\n" + "="*60)
print("SAMPLE MOVIES")
print("="*60)
print(df_tmdb[['title', 'release_date', 'budget', 'revenue', 'primary_genre', 'director_name']].sample(10))

DATASET OVERVIEW

Shape: (5100, 27)
  Rows (movies): 5100
  Columns (features): 27

COLUMN NAMES
['tmdb_id', 'imdb_id', 'title', 'original_title', 'release_date', 'us_release_date', 'us_certification', 'budget', 'revenue', 'runtime', 'genres', 'primary_genre', 'num_genres', 'popularity', 'vote_average', 'vote_count', 'director_id', 'director_name', 'cast_ids', 'cast_names', 'production_companies', 'num_production_companies', 'original_language', 'production_countries', 'youtube_trailer_key', 'tagline', 'overview']

DATA TYPES
tmdb_id                       int64
imdb_id                      object
title                        object
original_title               object
release_date                 object
us_release_date              object
us_certification             object
budget                        int64
revenue                       int64
runtime                       int64
genres                       object
primary_genre                object
num_genres                    int64


In [12]:
df_tmdb.head()

Unnamed: 0,tmdb_id,imdb_id,title,original_title,release_date,us_release_date,us_certification,budget,revenue,runtime,...,director_name,cast_ids,cast_names,production_companies,num_production_companies,original_language,production_countries,youtube_trailer_key,tagline,overview
0,27205,tt1375666,Inception,Inception,2010-07-15,2010-07-16T00:00:00.000Z,PG-13,160000000,839030630,148,...,Christopher Nolan,6193|24045|3899|2524|27578,Leonardo DiCaprio|Joseph Gordon-Levitt|Ken Wat...,Legendary Pictures|Syncopy|Warner Bros. Pictures,3,en,GB|US,cdx31ak4KbQ,Your mind is the scene of the crime.,"Cobb, a skilled thief who commits corporate es..."
1,10138,tt1228705,Iron Man 2,Iron Man 2,2010-04-28,2010-05-07T00:00:00.000Z,PG-13,200000000,623933331,124,...,Jon Favreau,3223|12052|1896|1245|6807,Robert Downey Jr.|Gwyneth Paltrow|Don Cheadle|...,Marvel Studios|Fairview Entertainment|Marvel E...,3,en,US,BoohRoVA9WQ,"It's not the armor that makes the hero, but th...",With the world now aware of his dual life as t...
2,38757,tt0398286,Tangled,Tangled,2010-11-24,2010-11-24T00:00:00.000Z,PG,260000000,592461732,100,...,Byron Howard,16855|69899|2517|2372|22132,Mandy Moore|Zachary Levi|Donna Murphy|Ron Perl...,Walt Disney Animation Studios|Walt Disney Pict...,2,en,US,gsYKF8ecC8g,They're taking adventure to new lengths.,"Feisty teenager Rapunzel, who has long and mag..."
3,12444,tt0926084,Harry Potter and the Deathly Hallows: Part 1,Harry Potter and the Deathly Hallows: Part 1,2010-11-17,2010-11-19T00:00:00.000Z,PG-13,250000000,954305868,146,...,David Yates,10980|10990|10989|13014|1283,Daniel Radcliffe|Emma Watson|Rupert Grint|Toby...,Warner Bros. Pictures|Heyday Films,2,en,GB|US,Su1LOpjvdZ4,Nowhere is safe.,"Harry, Ron and Hermione walk away from their l..."
4,10191,tt0892769,How to Train Your Dragon,How to Train Your Dragon,2010-03-18,2010-03-26T00:00:00.000Z,PG,165000000,495141736,98,...,Chris Sanders,449|17276|24264|59174|21007,Jay Baruchel|Gerard Butler|Craig Ferguson|Amer...,DreamWorks Animation,1,en,US,KZtbJ_I9IFM,One adventure will change two worlds.,As the son of a Viking leader on the cusp of m...


---
## Box Office Mojo Revenue Scraping

### Purpose
Scrape box office revenue data from Box Office Mojo to fill gaps in TMDB data. Currently only 37% of movies have both budget and revenue, limiting our dataset size. Box Office Mojo provides comprehensive revenue data including opening weekend, domestic total, international, and worldwide gross.

### Strategy
- Target all 5,094 movies with IMDb IDs
- Use respectful rate limiting (1.5s delays)
- Implement checkpointing for resumability
- Extract: opening weekend, domestic total, international total, worldwide total, budget