# **Avatar: Fire and Ash domestic opening week box office prediction**

In [None]:
import time
import warnings
import re
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
warnings.filterwarnings("ignore")

In [None]:
# TMDB API key; make_tmdb_request() sends it as the `api_key` query parameter.
# Blank in this notebook, so it has to be filled in before the scraper is run.
TMDB_API_KEY = ""
# Number of movies requested from scrape_tmdb_movies() in the data-collection cell.
TOTAL_MOVIES = 10000

## TMDB REQUEST


In [None]:
def make_tmdb_request(url, params, timeout=10):
    """Send a GET request to the TMDB API and return the decoded JSON body.

    Args:
        url: Full TMDB endpoint URL.
        params: Query parameters for the request. The mapping is copied, so
            the caller's dict is never modified (it may also be None).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang a long scrape.

    Returns:
        The parsed JSON response.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx status code (via ``raise_for_status``).
    """
    # Work on a copy so the API key does not leak into the caller's dict.
    query = dict(params or {})
    query['api_key'] = TMDB_API_KEY
    response = requests.get(url, params=query, timeout=timeout)
    response.raise_for_status()
    return response.json()

## FETCH MOVIES


In [None]:
def get_popular_movies(total_movies):
    """Collect up to ``total_movies`` entries from TMDB's discover endpoint.

    Pages are requested in descending popularity order, restricted to
    non-adult titles with at least 100 votes. Collection stops when enough
    movies were gathered, the page limit is reached, a request fails, or
    the API returns a page without results.

    Args:
        total_movies: Maximum number of movie entries to return.

    Returns:
        A list of at most ``total_movies`` movie dicts as returned by TMDB.
    """
    print(f"[TMDB] Fetching {total_movies} popular movies...")

    url = "https://api.themoviedb.org/3/discover/movie"
    movies = []
    page = 1
    # Results are consumed 20 per page; never ask for more than 500 pages.
    max_pages = min(500, (total_movies // 20) + 1)

    while page <= max_pages and len(movies) < total_movies:
        params = {
            "sort_by": "popularity.desc",
            "page": page,
            "include_adult": "false",
            "language": "en-US",
            "vote_count.gte": "100"
        }

        try:
            data = make_tmdb_request(url, params)
        except Exception as e:
            # Best effort: keep whatever was collected so far.
            print(f"\n  Error: {e}")
            break

        results = data.get('results') or []
        if not results:
            # An empty page means the listing is exhausted; further requests
            # would only waste API calls.
            break

        movies.extend(results)
        # The last page can overshoot the target, so cap the displayed count.
        print_progress_bar(min(len(movies), total_movies), total_movies, "Collecting movies")
        page += 1
        time.sleep(0.25)

    print(f"\n[TMDB] ✓ Collected {len(movies)} movies\n")
    return movies[:total_movies]

## RETRIEVE MOVIE DETAILS


In [None]:
def get_movie_details(movie_id):
    """Fetch the detail record of one movie, including credits and release dates.

    Args:
        movie_id: TMDB identifier of the movie.

    Returns:
        The decoded JSON dict, or None when the request or decoding fails.
    """
    url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    params = {"append_to_response": "credits,release_dates"}

    try:
        return make_tmdb_request(url, params)
    except Exception:
        # Best effort per movie; unlike a bare `except`, this still lets
        # KeyboardInterrupt/SystemExit stop a long-running scrape.
        return None


## MPAA RATING

This function extracts the US MPAA rating (such as PG or R) from the movie’s release data.


In [None]:
def extract_mpaa_rating(release_dates):
    """Return the first non-empty US certification found in ``release_dates``.

    ``release_dates`` is the dict stored under the movie's ``release_dates``
    key; its ``results`` list holds one entry per country. An empty string
    is returned when no US entry carries a certification.
    """
    # Lazily walk every release of every US entry, in listing order.
    us_certifications = (
        release.get('certification', '')
        for region in release_dates.get('results', [])
        if region['iso_3166_1'] == 'US'
        for release in region['release_dates']
    )
    return next((cert for cert in us_certifications if cert), '')


## FEATURE EXTRACTION


In [None]:
def extract_movie_data(details):
    """Flatten a TMDB movie-details dict into the 17-column record used downstream.

    Multi-valued fields (genres, actors, production companies) are joined
    with ``|``. Missing keys fall back to ``''`` or ``0`` as noted below.
    """
    # Genres as a pipe-separated string
    genre_names = [entry['name'] for entry in details.get('genres', [])]

    # First listed production country, if any
    country_list = details.get('production_countries', [])
    first_country = country_list[0]['iso_3166_1'] if country_list else ''

    # US certification taken from the appended release_dates payload
    rating = extract_mpaa_rating(details.get('release_dates', {}))

    popularity_score = round(details.get('popularity', 0), 2)

    # A movie counts as franchise when it belongs to a TMDB collection
    franchise_flag = 1 if details.get('belongs_to_collection') else 0

    # First crew member credited as Director
    crew_members = details.get('credits', {}).get('crew', [])
    director_names = [member['name'] for member in crew_members if member['job'] == 'Director']
    first_director = director_names[0] if director_names else ''

    # Top five billed cast members
    cast_members = details.get('credits', {}).get('cast', [])
    top_cast = [member['name'] for member in cast_members[:5]]

    # Production companies; the first one doubles as the "distributor" column
    company_names = [company['name'] for company in details.get('production_companies', [])]
    first_company = company_names[0] if company_names else ''

    record = {}
    record['title'] = details.get('title', '')
    record['release_date'] = details.get('release_date', '')
    record['tmdb_id'] = details.get('id', '')
    record['imdb_id'] = details.get('imdb_id', '')
    record['budget'] = details.get('budget', 0)
    record['revenue_tmdb'] = details.get('revenue', 0)
    record['genres'] = '|'.join(genre_names)
    record['runtime'] = details.get('runtime', '')
    record['country'] = first_country
    record['mpaa_rating'] = rating
    record['popularity'] = popularity_score
    record['is_franchise'] = franchise_flag
    record['director'] = first_director
    record['actors'] = '|'.join(top_cast)
    record['vote_average'] = details.get('vote_average', '')
    record['production_companies'] = '|'.join(company_names)
    record['distributor_tmdb'] = first_company
    return record

## TMDB SCRAPING


In [None]:
def scrape_tmdb_movies(total_movies):
    """Collect popular movies from TMDB and return one feature dict per movie.

    First the popular-movie listing is fetched, then the detail record of
    every listed movie. Movies whose details cannot be fetched are left out.
    """
    # Stage 1: listing of popular movies
    listing = get_popular_movies(total_movies)
    listing_size = len(listing)

    # Stage 2: per-movie details
    collected = []
    print("[TMDB] Fetching detailed data for each movie...")

    for position, entry in enumerate(listing, 1):
        fetched = get_movie_details(entry['id'])
        if fetched:
            collected.append(extract_movie_data(fetched))

        print_progress_bar(position, listing_size, "Scraping details")

        # Every 100th movie gets its own line; the bar is redrawn below it
        if not position % 100:
            print(f"\n  ✓ {position} movies completed!")
            print_progress_bar(position, listing_size, "Scraping details")

        # Pause between requests to stay gentle on the API
        time.sleep(0.25)

    print(f"\n\n[TMDB] ✓ Scraped {len(collected)} movies with 17 features\n")
    return collected


## PROGRESS BAR


In [None]:
def print_progress_bar(current, total, prefix="Progress"):
    """Redraw a single-line text progress bar on stdout.

    Args:
        current: Number of items processed so far.
        total: Number of items expected. A value of zero or less is shown
            as 0% instead of raising ZeroDivisionError.
        prefix: Label printed in front of the bar.

    The bar and percentage are clamped to the 0-100% range, so an
    overshooting ``current`` cannot stretch the bar beyond its fixed width.
    """
    bar_length = 40
    if total > 0:
        percentage = min(max((current / total) * 100, 0), 100)
        filled = max(0, min(bar_length, int(bar_length * current / total)))
    else:
        percentage = 0.0
        filled = 0
    bar = '█' * filled + '░' * (bar_length - filled)
    # '\r' returns to the line start so successive calls overwrite each other.
    print(f"\r  {prefix}: [{bar}] {percentage:.1f}% ({current}/{total})", end='', flush=True)

## DATA COLLECTION

This section runs the movie scraper, converts the collected data into a pandas DataFrame, and saves it as a CSV file.

In [None]:
print("TMDB MOVIE SCRAPER")

# Run the scraper and tabulate the per-movie records
movies_data = scrape_tmdb_movies(TOTAL_MOVIES)
df = pd.DataFrame(movies_data)

# Persist the table without the positional index column
output_file = 'tmdb_data_17_features.csv'
df.to_csv(output_file, index=False)


# Summary of what was written
print("✓ SCRAPING COMPLETE!")
print(f"\nOutput file: {output_file}")
print(f"Total movies: {len(df)}")
print(f"Total features: {len(df.columns)}")

# Numbered list of the collected column names
print("\n17 Features collected:")
for position, column_name in enumerate(df.columns, start=1):
    print(f"  {position:2d}. {column_name}")


print(f"\n✓ Data saved to '{output_file}'")

TMDB MOVIE SCRAPER
[TMDB] Fetching 10000 popular movies...
  Collecting movies: [████████████████████████████████████████] 100.0% (10000/10000)
[TMDB] ✓ Collected 10000 movies

[TMDB] Fetching detailed data for each movie...
  Scraping details: [░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 1.0% (100/10000)
  ✓ 100 movies completed!
  Scraping details: [░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 2.0% (200/10000)
  ✓ 200 movies completed!
  Scraping details: [█░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 3.0% (300/10000)
  ✓ 300 movies completed!
  Scraping details: [█░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 4.0% (400/10000)
  ✓ 400 movies completed!
  Scraping details: [██░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 5.0% (500/10000)
  ✓ 500 movies completed!
  Scraping details: [██░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 6.0% (600/10000)
  ✓ 600 movies completed!
  Scraping details: [██░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 7.0% (700/10000)
  ✓ 700 movies completed!
  Scraping details: [███░░░░░░░░░

In [None]:
# CSV written by the TMDB data-collection cell; input of the Box Office Mojo step.
TMDB_CSV_FILE = "tmdb_data_17_features.csv"
# Movies released before this year are dropped before scraping Box Office Mojo.
MIN_RELEASE_YEAR = 2000

## HELPER FUNCTIONS


In [None]:
def parse_money(text):
    """Convert a money string such as '$123,456,789' to the int 123456789.

    Every non-digit character is discarded before conversion.

    Args:
        text: The string to parse.

    Returns:
        The parsed integer, or None when ``text`` is not a string or
        contains no digits.
    """
    try:
        return int(re.sub(r'\D', '', text))
    except (TypeError, ValueError):
        # TypeError: non-string input; ValueError: no digits left to parse.
        return None


def parse_number(text):
    """Convert a formatted count such as '3,456' to the int 3456.

    Every non-digit character is discarded before conversion.

    Args:
        text: The string to parse.

    Returns:
        The parsed integer, or None when ``text`` is not a string or
        contains no digits.
    """
    try:
        return int(re.sub(r'\D', '', text))
    except (TypeError, ValueError):
        # TypeError: non-string input; ValueError: no digits left to parse.
        return None


def print_progress_bar(current, total, success, failed):
    """Redraw a single-line progress bar with success/failure counters.

    NOTE: this definition replaces the earlier
    ``print_progress_bar(current, total, prefix)`` used by the TMDB scraping
    functions; once this cell has run, those functions can no longer call
    the three-argument form.

    Args:
        current: Number of items processed so far.
        total: Number of items expected. A value of zero or less is shown
            as 0% instead of raising ZeroDivisionError.
        success: Count of successfully scraped items to display.
        failed: Count of failed items to display.

    The bar and percentage are clamped to the 0-100% range, so an
    overshooting ``current`` cannot stretch the bar beyond its fixed width.
    """
    bar_length = 40
    if total > 0:
        percentage = min(max((current / total) * 100, 0), 100)
        filled = max(0, min(bar_length, int(bar_length * current / total)))
    else:
        percentage = 0.0
        filled = 0
    bar = '█' * filled + '░' * (bar_length - filled)
    # '\r' returns to the line start so successive calls overwrite each other.
    print(f"\r  [{bar}] {percentage:.1f}% | Success: {success} | Failed: {failed}", end='', flush=True)

## Box Office Mojo URL

This function builds the Box Office Mojo title-page URL for a movie from its IMDb ID.




In [None]:
def get_bom_url(imdb_id):
    """Return the Box Office Mojo title URL for ``imdb_id``.

    None is returned when the ID is missing (NaN/None/empty) or, after
    trimming whitespace, does not begin with ``tt``.
    """
    id_missing = pd.isna(imdb_id) or not imdb_id
    if id_missing:
        return None

    cleaned_id = str(imdb_id).strip()
    if cleaned_id.startswith('tt'):
        return f"https://www.boxofficemojo.com/title/{cleaned_id}/"
    return None

## BOX OFFICE MOJO PAGE SCRAPING



In [None]:
def scrape_bom_page(url, headers):
    """Scrape box-office figures from one Box Office Mojo title page.

    Args:
        url: Address of the title page.
        headers: HTTP headers sent with the request.

    Returns:
        A dict holding whichever of these keys could be extracted:
        domestic_total_gross, international_gross, worldwide_gross,
        distributor_bom, opening_weekend_revenue, opening_theaters_count,
        opening_date_bom, mpaa_rating_bom, runtime_bom, genres_bom,
        widest_release_theaters, avg_per_theater_opening, percent_of_total
        and opening_week_revenue. None is returned when the page does not
        exist, has no performance summary, yields no data, or any request
        or parsing error occurs (best effort per movie).
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)

        # A 404 simply means the title has no page
        if response.status_code == 404:
            return None

        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Pages without the performance summary carry no box office data
        has_data = soup.find('div', class_='a-section a-spacing-none mojo-performance-summary-table')
        if not has_data:
            return None

        movie_data = {}

        # Grosses from the performance summary; the section text tells
        # which gross the money value belongs to
        perf_summary = soup.find('div', class_='mojo-performance-summary-table')
        if perf_summary:
            for section in perf_summary.find_all('div', class_='a-section'):
                text = section.get_text()
                money = section.find('span', class_='money')

                if money:
                    value = parse_money(money.get_text(strip=True))

                    if 'Domestic' in text:
                        movie_data['domestic_total_gross'] = value
                    elif 'International' in text:
                        movie_data['international_gross'] = value
                    elif 'Worldwide' in text:
                        movie_data['worldwide_gross'] = value

        # Details from the summary table: every span is treated as a
        # potential label whose value is the span that follows it
        summary_table = soup.find('div', class_='mojo-summary-values')
        if summary_table:
            all_spans = summary_table.find_all('span')

            for label_span, value_span in zip(all_spans, all_spans[1:]):
                label = label_span.get_text(strip=True)
                value_text = value_span.get_text(strip=True)

                if 'Distributor' in label:
                    movie_data['distributor_bom'] = value_text

                elif 'Opening' in label:
                    money = value_span.find('span', class_='money')
                    if money:
                        movie_data['opening_weekend_revenue'] = parse_money(money.get_text(strip=True))

                    theaters_match = re.search(r'([\d,]+)\s+theaters?', value_text)
                    if theaters_match:
                        movie_data['opening_theaters_count'] = parse_number(theaters_match.group(1))

                elif 'Release Date' in label:
                    movie_data['opening_date_bom'] = value_text

                elif 'MPAA' in label:
                    movie_data['mpaa_rating_bom'] = value_text

                elif 'Running Time' in label:
                    movie_data['runtime_bom'] = value_text

                elif 'Genres' in label:
                    movie_data['genres_bom'] = value_text

                elif 'Widest Release' in label:
                    theaters_match = re.search(r'([\d,]+)\s+theaters?', value_text)
                    if theaters_match:
                        movie_data['widest_release_theaters'] = parse_number(theaters_match.group(1))

        # Derived metrics. The opening gross may be present but None (the
        # money text could not be parsed); skip the metrics in that case
        # instead of raising and discarding everything scraped above.
        opening = movie_data.get('opening_weekend_revenue')
        theaters = movie_data.get('opening_theaters_count')
        domestic = movie_data.get('domestic_total_gross')

        if opening is not None and theaters and theaters > 0:
            movie_data['avg_per_theater_opening'] = opening // theaters

        if opening is not None and domestic and domestic > 0:
            movie_data['percent_of_total'] = round((opening / domestic) * 100, 2)

        # Estimate opening week (opening weekend × 1.4)
        if opening:
            movie_data['opening_week_revenue'] = int(opening * 1.4)

        return movie_data if movie_data else None

    except Exception:
        # Best effort: one broken page must not abort the whole scrape.
        # Unlike a bare `except`, KeyboardInterrupt/SystemExit still propagate.
        return None

## BOX OFFICE MOJO DATA COLLECTION


In [None]:
def scrape_bom_for_all_movies(tmdb_df):
    """Scrape Box Office Mojo for every movie in the TMDB dataframe.

    Args:
        tmdb_df: DataFrame with at least a ``title`` column; ``imdb_id`` is
            used to build the Box Office Mojo URL when present.

    Returns:
        A DataFrame with one row per successfully scraped movie, holding the
        scraped fields plus ``title`` and ``bom_url``.

    Progress is counted by row position rather than by index label, so the
    bar and the 100-movie milestones stay correct when ``tmdb_df`` was
    filtered and its index has gaps.
    """
    print("SCRAPING BOX OFFICE MOJO")

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    print(f"\n[BOM] Scraping movies...")

    bom_data_list = []
    success_count = 0
    fail_count = 0
    total = len(tmdb_df)

    for current, (_, row) in enumerate(tmdb_df.iterrows(), start=1):
        title = row['title']
        imdb_id = row.get('imdb_id', None)

        # Without an IMDb ID there is no page to request (and no need to wait)
        if pd.isna(imdb_id) or not imdb_id:
            fail_count += 1
            print_progress_bar(current, total, success_count, fail_count)
            continue

        movie_url = get_bom_url(imdb_id)
        movie_data = scrape_bom_page(movie_url, headers) if movie_url else None

        if movie_data:
            movie_data['title'] = title
            movie_data['bom_url'] = movie_url
            bom_data_list.append(movie_data)
            success_count += 1
        else:
            fail_count += 1

        # Show progress bar
        print_progress_bar(current, total, success_count, fail_count)

        # Show milestone messages every 100 movies
        if current % 100 == 0:
            print(f"\n  ✓ {current} movies processed! (Success: {success_count}, Failed: {fail_count})")
            print_progress_bar(current, total, success_count, fail_count)

        # Wait between requests
        time.sleep(1)

    print(f"\n\n[BOM] ✓ Successfully scraped {success_count} movies!")
    print(f"[BOM] ✗ Failed to scrape {fail_count} movies\n")

    return pd.DataFrame(bom_data_list)

## DATA MERGING


In [None]:

def merge_datasets(tmdb_df, bom_df):
    """Left-join the Box Office Mojo data onto the TMDB data.

    Rows are matched on the IMDb ID, which is recovered from ``bom_url`` on
    the Box Office Mojo side. Matching on ``title`` alone duplicates rows
    whenever two different movies share a title (remakes, reboots), so it is
    only used as a fallback when the ID columns are unavailable.

    Args:
        tmdb_df: TMDB movies; expected to contain ``imdb_id`` and ``title``.
        bom_df: Scraped Box Office Mojo rows; expected to contain ``bom_url``
            and ``title``. May be empty.

    Returns:
        A DataFrame with all TMDB columns followed by the Box Office Mojo
        columns, with exactly one row per TMDB row when matching on IMDb ID.
    """
    print("MERGING DATASETS")

    print(f"\n[MERGE] TMDB movies: {len(tmdb_df)}")
    print(f"[MERGE] BOM movies: {len(bom_df)}")

    merge_key = '_bom_merge_key'

    if 'imdb_id' in tmdb_df.columns and 'bom_url' in bom_df.columns:
        # Temporary key on both sides, cast to object so the dtypes always agree
        left_key = tmdb_df['imdb_id'].map(
            lambda value: value.strip() if isinstance(value, str) else value
        ).astype(object)
        right_key = (
            bom_df['bom_url'].astype(str)
            .str.extract(r'/title/(tt\d+)', expand=False)
            .astype(object)
        )

        left = tmdb_df.assign(**{merge_key: left_key})
        # 'title' already comes from the TMDB side
        right = bom_df.drop(columns=['title'], errors='ignore').assign(**{merge_key: right_key})
        # Null keys would match each other, duplicate keys would multiply rows
        right = right.dropna(subset=[merge_key]).drop_duplicates(subset=[merge_key], keep='first')

        merged = left.merge(right, on=merge_key, how='left').drop(columns=[merge_key])
    elif 'title' in bom_df.columns:
        # Fallback: title-based merge
        merged = pd.merge(tmdb_df, bom_df, on='title', how='left')
    else:
        # Nothing was scraped (frame without columns): keep the TMDB data as is
        merged = tmdb_df.copy()

    # Count movies with BOM data
    if 'domestic_total_gross' in merged.columns:
        movies_with_bom = int(merged['domestic_total_gross'].notna().sum())
    else:
        movies_with_bom = 0
    match_percentage = (movies_with_bom / len(tmdb_df)) * 100 if len(tmdb_df) else 0.0

    print(f"[MERGE] ✓ {movies_with_bom} movies have BOM data")
    print(f"[MERGE] ✓ Match rate: {match_percentage:.1f}%")
    print(f"[MERGE] ✓ Total features: {len(merged.columns)}\n")

    return merged


## BOM PIPELINE


In [None]:
# Set to an integer to scrape only the first N movies (handy for test runs)
LIMIT_ROWS = None
print("BOX OFFICE MOJO SCRAPER")

# Load TMDB data
print(f"[LOAD] Loading TMDB data from '{TMDB_CSV_FILE}'...")

# Only the load is guarded: a FileNotFoundError raised later (for example
# while writing the output) must not be reported as a missing TMDB file.
try:
    tmdb_data = pd.read_csv(TMDB_CSV_FILE)
except FileNotFoundError:
    print(f"[ERROR] File '{TMDB_CSV_FILE}' not found!")
    print("[ERROR] Please run Step 1 first to generate the TMDB data.")
else:
    print(f"[LOAD] ✓ Loaded {len(tmdb_data)} movies!\n")

    # Filter by release year (rows with an unparseable date are dropped too)
    if MIN_RELEASE_YEAR:
        tmdb_data['release_year'] = pd.to_datetime(tmdb_data['release_date'], errors='coerce').dt.year
        before = len(tmdb_data)  # row count before filtering, kept for inspection
        # Reset the index so row labels are contiguous again after filtering
        tmdb_data = tmdb_data[tmdb_data['release_year'] >= MIN_RELEASE_YEAR].reset_index(drop=True)
        print(f"[LOAD] ⚠ Filtered to movies from {MIN_RELEASE_YEAR}+: {len(tmdb_data)} movies\n")

    # Limit rows if specified
    if LIMIT_ROWS:
        tmdb_data = tmdb_data.head(LIMIT_ROWS)
        print(f"[LOAD] ⚠ Limited to first {LIMIT_ROWS} movies for testing\n")

    # Scrape Box Office Mojo
    bom_data = scrape_bom_for_all_movies(tmdb_data)

    # Merge datasets
    final_data = merge_datasets(tmdb_data, bom_data)

    # Save to CSV
    output_file = 'movie_dataset_with_bom.csv'
    final_data.to_csv(output_file, index=False)

    print("="*60)
    print("✓ SCRAPING COMPLETE!")
    print("="*60)
    print(f"\nTotal movies: {len(final_data)}")
    print(f"Movies with BOM data: {final_data['domestic_total_gross'].notna().sum()}")
    print(f"Total features: {len(final_data.columns)}")
    print(f"Saved to: {output_file}\n")

    # Show sample
    print("Sample of the data:")
    print(final_data[['title', 'release_date', 'domestic_total_gross', 'opening_weekend_revenue']].head(5))

BOX OFFICE MOJO SCRAPER
[LOAD] Loading TMDB data from 'tmdb_data_17_features.csv'...
[LOAD] ✓ Loaded 10000 movies!

[LOAD] ⚠ Filtered to movies from 2000+: 7149 movies

SCRAPING BOX OFFICE MOJO

[BOM] Scraping movies...
  [░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 1.4% | Success: 94 | Failed: 0
  ✓ 100 movies processed! (Success: 94, Failed: 0)
  [█░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 2.8% | Success: 180 | Failed: 0
  ✓ 200 movies processed! (Success: 180, Failed: 0)
  [█░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 4.2% | Success: 261 | Failed: 0
  ✓ 300 movies processed! (Success: 261, Failed: 0)
  [██░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 7.0% | Success: 421 | Failed: 0
  ✓ 500 movies processed! (Success: 421, Failed: 0)
  [███░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 8.4% | Success: 499 | Failed: 0
  ✓ 600 movies processed! (Success: 499, Failed: 0)
  [███░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 9.8% | Success: 578 | Failed: 0
  ✓ 700 movies processed! (Success: 578, Failed: 0)
  [████░

Printing the shape of the new dataset

In [None]:
# Reload the merged dataset and print its (rows, columns) shape.
# NOTE(review): the absolute '/content/' path presumably targets Google Colab,
# while the file was written with a relative path — confirm the working directory.
df = pd.read_csv('/content/movie_dataset_with_bom.csv')
print(df.shape)

(7385, 30)


## LOADING DATA AND DATA CLEANING

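This function loads the merged dataset, removes columns that would leak the target, drops rows without a target value, and fills the remaining missing values.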




In [None]:
def load_and_clean_data(filepath, target_col='opening_week_revenue'):
    """Load the merged movie dataset and fill or drop missing values.

    Steps: drop columns that reveal the outcome (data leakage), drop rows
    without a target, de-duplicate on ``tmdb_id``, impute numeric columns
    with their median, fill categorical columns with fixed defaults, and
    drop rows that still have no release date.

    Args:
        filepath: Path of the CSV file to load.
        target_col: Name of the target column. It is never removed, even if
            it appears in the leakage list.

    Returns:
        The cleaned DataFrame.

    Raises:
        KeyError: If ``target_col`` or ``tmdb_id`` is missing from the file.
    """
    df = pd.read_csv(filepath)

    # Remove columns that contain future information (prevent data leakage)
    leakage_columns = [
        'revenue_tmdb',
        'domestic_total_gross',
        'international_gross',
        'worldwide_gross',
        'opening_weekend_revenue',
        'percent_of_total'
    ]
    df = df.drop(
        columns=[col for col in leakage_columns if col != target_col],
        errors='ignore',
    )

    # Drop rows where target is missing
    df = df[df[target_col].notna()]

    # Drop duplicate movies; copy so the assignments below act on an
    # independent frame instead of a slice of the loaded one
    df = df.drop_duplicates(subset=['tmdb_id'], keep='first').copy()

    # Impute numeric features using median
    numeric_features = ['budget', 'runtime', 'popularity', 'vote_average']
    for col in numeric_features:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].median())

    # Binary feature: most frequent value (skipped when the column has no
    # values at all, in which case there is no mode to take)
    if 'is_franchise' in df.columns:
        franchise_modes = df['is_franchise'].mode()
        if not franchise_modes.empty:
            df['is_franchise'] = df['is_franchise'].fillna(franchise_modes[0])

    # Categorical features: fixed default per column
    categorical_defaults = {
        'country': 'US',
        'mpaa_rating': 'NR',
        'genres': 'Drama',
        'director': 'Unknown Director',
        'actors': '',
        'production_companies': 'Independent',
        'distributor_bom': 'Independent',
    }
    for col, default in categorical_defaults.items():
        if col in df.columns:
            df[col] = df[col].fillna(default)

    if 'release_date' in df.columns:
        if 'release_year' in df.columns:
            # Use mid-year as the date when only the year is known
            needs_date = df['release_date'].isna() & df['release_year'].notna()
            if needs_date.any():
                # The year is read back as a float (e.g. 2005.0); cast to int
                # so the result is '2005-07-01', not '2005.0-07-01'
                years = df.loc[needs_date, 'release_year'].astype(float).astype(int).astype(str)
                df['release_date'] = df['release_date'].astype(object)
                df.loc[needs_date, 'release_date'] = years + '-07-01'
        df = df[df['release_date'].notna()]

    return df


## FEATURE ENGINEERING


In [None]:
def create_basic_features(df):
    """Derive calendar, genre, cast, studio, director and budget features.

    The input frame is left untouched; a copy with the added columns is
    returned. Feature groups tied to a source column are only created when
    that column exists.
    """
    frame = df.copy()
    available = frame.columns

    def _pipe_items(default):
        # Counts '|'-separated entries; non-strings and '' give `default`
        return lambda value: len(value.split('|')) if isinstance(value, str) and value else default

    # --- Calendar features -------------------------------------------------
    parsed_dates = pd.to_datetime(frame['release_date'], errors='coerce')
    frame['release_date'] = parsed_dates
    frame['release_month'] = parsed_dates.dt.month
    frame['release_day'] = parsed_dates.dt.day
    frame['release_day_of_week'] = parsed_dates.dt.dayofweek
    frame['release_quarter'] = parsed_dates.dt.quarter

    # Seasonal flags: May-August, November-December, Friday/Saturday release
    frame['is_summer'] = frame['release_month'].isin([5, 6, 7, 8]).astype(int)
    frame['is_holiday_season'] = frame['release_month'].isin([11, 12]).astype(int)
    frame['is_weekend_release'] = frame['release_day_of_week'].isin([4, 5]).astype(int)

    # --- Genre flags (substring match on the pipe-joined string) ------------
    if 'genres' in available:
        for genre_name in ('Action', 'Adventure', 'Comedy', 'Drama', 'Thriller',
                           'Horror', 'Science Fiction', 'Fantasy', 'Animation'):
            flag_column = 'genre_' + genre_name.lower().replace(" ", "_")
            frame[flag_column] = frame['genres'].apply(
                lambda text, wanted=genre_name: int(isinstance(text, str) and wanted in text)
            )

        frame['genre_count'] = frame['genres'].apply(_pipe_items(1))

    # --- Number of listed actors -------------------------------------------
    if 'actors' in available:
        frame['actor_count'] = frame['actors'].apply(_pipe_items(0))

    # --- Major-studio flag --------------------------------------------------
    if 'production_companies' in available:
        studio_markers = (
            'Warner Bros', 'Universal', 'Disney', 'Paramount',
            'Sony', '20th Century', 'Columbia', 'Marvel', 'Lucasfilm'
        )

        def _has_major_studio(companies):
            if not isinstance(companies, str):
                return 0
            return int(any(marker in companies for marker in studio_markers))

        frame['is_major_studio'] = frame['production_companies'].apply(_has_major_studio)

    # --- How often each director appears in this frame ----------------------
    if 'director' in available:
        appearances = frame['director'].value_counts()
        frame['director_movie_count'] = frame['director'].map(appearances).fillna(1)

    # --- Budget tiers ($100M and $200M thresholds) ---------------------------
    if 'budget' in available:
        frame['is_blockbuster'] = (frame['budget'] >= 100000000).astype(int)
        frame['is_mega_blockbuster'] = (frame['budget'] >= 200000000).astype(int)

    return frame


## ADVANCED FEATURE ENGINEERING


In [None]:
def create_advanced_features(df):
    """Add log, interaction, polynomial and ratio features to a copy of ``df``.

    Each feature is created only when all of its source columns are present;
    the input frame itself is not modified.
    """
    frame = df.copy()
    available = set(frame.columns)

    # (required source columns, new column name, builder) in creation order
    recipes = [
        # Log transformations
        (('budget',), 'log_budget', lambda d: np.log10(d['budget'] + 1)),
        (('popularity',), 'log_popularity', lambda d: np.log10(d['popularity'] + 1)),
        (('runtime',), 'log_runtime', lambda d: np.log10(d['runtime'] + 1)),
        # Budget interactions with genre and season flags
        (('budget', 'genre_action'), 'budget_action_ratio',
         lambda d: d['budget'] * d['genre_action']),
        (('budget', 'genre_science_fiction'), 'budget_scifi_ratio',
         lambda d: d['budget'] * d['genre_science_fiction']),
        (('budget', 'is_summer'), 'budget_summer_boost',
         lambda d: d['budget'] * d['is_summer']),
        (('budget', 'is_holiday_season'), 'budget_holiday_boost',
         lambda d: d['budget'] * d['is_holiday_season']),
        # Franchise × budget
        (('is_franchise', 'budget'), 'franchise_budget',
         lambda d: d['is_franchise'] * d['budget']),
        # Genre combinations
        (('genre_action', 'genre_science_fiction'), 'action_scifi',
         lambda d: d['genre_action'] * d['genre_science_fiction']),
        (('genre_action', 'genre_adventure'), 'action_adventure',
         lambda d: d['genre_action'] * d['genre_adventure']),
        # Polynomial terms
        (('budget',), 'budget_squared', lambda d: d['budget'] ** 2),
        (('runtime',), 'runtime_squared', lambda d: d['runtime'] ** 2),
        # Ratios (+1 in the denominator avoids division by zero)
        (('budget', 'genre_count'), 'budget_per_genre',
         lambda d: d['budget'] / (d['genre_count'] + 1)),
        (('budget', 'runtime'), 'budget_per_minute',
         lambda d: d['budget'] / (d['runtime'] + 1)),
        (('popularity', 'vote_average'), 'popularity_per_rating',
         lambda d: d['popularity'] / (d['vote_average'] + 1)),
    ]

    for required, new_column, build in recipes:
        if available.issuperset(required):
            frame[new_column] = build(frame)

    return frame


## ENCODING FEATURES


In [None]:
def encode_categorical_features(df, target_encodings=None, target='opening_week_revenue'):
    """Turn the categorical columns into numbers and drop text/ID columns.

    The MPAA rating is mapped to an ordinal scale (unknown ratings become 0).
    Country and distributor are target-encoded: each category is replaced by
    its mean target value. Encodings are learned from ``df`` only when they
    are not already in ``target_encodings`` and ``df`` contains ``target``;
    categories without an encoding get the mean of the stored category means.

    Returns:
        A tuple ``(encoded_frame, target_encodings)`` so the encodings can be
        reused on data that lacks the target column.
    """
    frame = df.copy()
    encodings = {} if target_encodings is None else target_encodings

    # Ordinal scale for the MPAA rating
    if 'mpaa_rating' in frame.columns:
        rating_scale = {'G': 1, 'PG': 2, 'PG-13': 3, 'R': 4, 'NC-17': 5, 'NR': 0}
        frame['mpaa_rating_encoded'] = frame['mpaa_rating'].map(rating_scale).fillna(0)

    # Target encoding: (source column, key in the encodings dict, output column)
    encoding_plan = (
        ('country', 'country', 'country_encoded'),
        ('distributor_bom', 'distributor', 'distributor_encoded'),
    )
    for source_column, encoding_key, encoded_column in encoding_plan:
        if source_column not in frame.columns:
            continue

        # Learn the per-category means only when none were supplied
        if encoding_key not in encodings and target in frame.columns:
            encodings[encoding_key] = frame.groupby(source_column)[target].mean()

        if encoding_key in encodings:
            category_means = encodings[encoding_key]
            frame[encoded_column] = frame[source_column].map(category_means)
            frame[encoded_column] = frame[encoded_column].fillna(category_means.mean())

    # Text and identifier columns are of no use to the models
    removable = (
        'title', 'genres', 'actors', 'director', 'production_companies',
        'release_date', 'mpaa_rating', 'country', 'distributor_bom',
        'tmdb_id', 'imdb_id', 'distributor_tmdb', 'opening_date_bom',
        'mpaa_rating_bom', 'runtime_bom', 'genres_bom', 'bom_url', 'release_year'
    )
    frame = frame.drop(columns=[name for name in removable if name in frame.columns])

    return frame, encodings


## FEATURE PIPELINE


In [None]:
def apply_feature_pipeline(df, target_encodings=None, is_training=True):
    """Run basic features, advanced features and categorical encoding in order.

    Afterwards every numeric column that still contains missing values is
    filled with that column's median. ``is_training`` is accepted but not
    used by this function.

    Returns:
        A tuple ``(feature_frame, target_encodings)``.
    """
    # Feature engineering stages
    engineered = create_advanced_features(create_basic_features(df))
    engineered, target_encodings = encode_categorical_features(engineered, target_encodings)

    # Median-fill whatever numeric gaps remain
    for column_name in engineered.select_dtypes(include=[np.number]).columns:
        series = engineered[column_name]
        if series.isnull().sum() > 0:
            engineered[column_name] = series.fillna(series.median())

    return engineered, target_encodings


## MODEL INITIALIZATION


In [None]:
def initialize_all_models():
    """Build the regression models to compare, keyed by display name.

    Returns:
        A dict mapping model name to an unfitted estimator, in the order
        linear, tree, boosting, then the remaining models.
    """
    # Hyper-parameters shared by the three gradient-boosting libraries
    boosting_kwargs = dict(n_estimators=150, max_depth=8, learning_rate=0.1, random_state=42)

    models = {}

    # Linear baseline
    models['Linear Regression'] = LinearRegression()

    # Single tree
    models['Decision Tree'] = DecisionTreeRegressor(max_depth=10, random_state=42)

    # Boosting family
    models['Gradient Boosting'] = GradientBoostingRegressor(**boosting_kwargs)
    models['AdaBoost'] = AdaBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    models['XGBoost'] = XGBRegressor(n_jobs=-1, verbosity=0, **boosting_kwargs)
    models['LightGBM'] = LGBMRegressor(n_jobs=-1, verbose=-1, force_row_wise=True,
                                       **boosting_kwargs)

    # Kernel and neural models
    models['SVR'] = SVR(kernel='rbf', C=1.0)
    models['Neural Network'] = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500,
                                            random_state=42, early_stopping=True)

    return models

## MODEL TRAINING


In [None]:
def train_and_evaluate_all_models(X_train, X_test, y_train, y_test):
    """Fit every model from ``initialize_all_models`` and score it on the test split.

    Args:
        X_train, y_train: Training features and target.
        X_test, y_test: Held-out features and target used for scoring.

    Returns:
        A DataFrame with the columns Model, MAE, RMSE, R2 and model_object
        (the fitted estimator), sorted by R2 in descending order. A model
        that fails to train or predict is reported and left out; if every
        model fails, the frame is empty but still has these columns.
    """
    result_columns = ['Model', 'MAE', 'RMSE', 'R2', 'model_object']
    models = initialize_all_models()
    results = []

    for name, model in models.items():
        try:
            # Train model
            model.fit(X_train, y_train)

            # Predict
            y_pred = model.predict(X_test)

            # Evaluate
            mae = mean_absolute_error(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            r2 = r2_score(y_test, y_pred)
        except Exception as exc:
            # Keep comparing the remaining models, but say which one was
            # dropped and why instead of skipping it silently
            print(f"[WARN] {name} failed and was skipped: {exc}")
            continue

        results.append({
            'Model': name,
            'MAE': mae,
            'RMSE': rmse,
            'R2': r2,
            'model_object': model
        })

    # Explicit columns keep the sort valid even when no model succeeded
    results_df = pd.DataFrame(results, columns=result_columns)
    results_df = results_df.sort_values('R2', ascending=False)

    return results_df


## MODEL COMPARISON


In [None]:
def display_model_comparison(results_df):
    """Print a ranked table of model scores and return the top row.

    ``results_df`` is expected to be sorted already; rows are ranked in the
    order given, and the first row is reported and returned as best model.
    """
    print("MODEL COMPARISON (Ranked by R2)\n")
    print(f"{'Rank':<6} {'Model':<20} {'R2':<10} {'MAE':<15} {'RMSE':<15}")
    print("-" * 65)

    score_columns = zip(
        results_df['Model'], results_df['R2'], results_df['MAE'], results_df['RMSE']
    )
    for rank, (model_name, r2, mae, rmse) in enumerate(score_columns, start=1):
        line = (
            f"{rank:<6} "
            f"{model_name:<20} "
            f"{r2:<10.4f} "
            f"${mae:<14,.0f} "
            f"${rmse:<14,.0f}"
        )
        print(line)

    # The first row is the winner because the caller sorted by R2
    best_model = results_df.iloc[0]

    print("\nBest Model:")
    print(f"Model: {best_model['Model']}")
    print(f"R2: {best_model['R2']:.4f}")
    print(f"MAE: ${best_model['MAE']:,.0f}")
    print(f"RMSE: ${best_model['RMSE']:,.0f}")

    return best_model


## AVATAR DATA PREPARATION


In [None]:
def prepare_avatar_data(avatar_csv_path, target_encodings):
    """Read the Avatar feature CSV, fill gaps, and run the feature pipeline.

    Args:
        avatar_csv_path: Path of the CSV to read with ``pd.read_csv``.
        target_encodings: Encodings forwarded to ``apply_feature_pipeline``.

    Returns:
        The frame returned by ``apply_feature_pipeline(..., is_training=False)``.
    """
    frame = pd.read_csv(avatar_csv_path)

    # Numeric gaps are filled from the column's own median.
    # NOTE(review): on a single-row CSV a missing value has no other rows to
    # draw a median from, so this fill leaves NaN in place - confirm intended.
    for column in ('budget', 'runtime', 'popularity', 'vote_average'):
        if column not in frame.columns:
            continue
        if frame[column].isnull().any():
            frame[column] = frame[column].fillna(frame[column].median())

    # Fixed fallbacks for the categorical / flag columns, applied only to
    # columns that actually exist in the CSV.
    fallback_values = {
        'is_franchise': 1,
        'country': 'US',
        'mpaa_rating': 'PG-13',
        'genres': 'Action|Adventure',
    }
    for column, fallback in fallback_values.items():
        if column in frame.columns:
            frame[column] = frame[column].fillna(fallback)

    prepared, _ = apply_feature_pipeline(frame, target_encodings, is_training=False)
    return prepared


## Avatar: Fire and Ash PREDICTION


In [None]:
def predict_avatar_with_best_model(best_model_row, scaler, avatar_df, training_columns):
    """Predict with the best-ranked model on a frame aligned to the training columns.

    Args:
        best_model_row: Mapping/row whose ``'model_object'`` entry is the
            fitted estimator to use.
        scaler: Fitted scaler exposing ``transform``.
        avatar_df: Feature frame for the movie; it is not modified.
        training_columns: Column labels, in order, that the scaler and model
            were fitted on.

    Returns:
        The first prediction, floored at 0.
    """
    best_model = best_model_row['model_object']

    # reindex adds any missing training column as 0, drops extra columns and
    # enforces the training order in one step. It returns a new frame, so the
    # caller's avatar_df is left untouched (the old loop added columns to it).
    aligned = avatar_df.reindex(columns=training_columns, fill_value=0)

    # Scale and predict
    avatar_scaled = scaler.transform(aligned)
    prediction = best_model.predict(avatar_scaled)[0]

    # Negative revenue is meaningless, so clamp at zero.
    return max(prediction, 0)


In [None]:
# Path of the training CSV handed to load_and_clean_data below.
TRAINING_DATA = "movie_dataset_with_bom.csv"

## DATA PREPARATION FOR MODELS


In [None]:
# Load the cleaned training set and run it through the feature pipeline,
# keeping the encodings so the Avatar row can be transformed the same way.
df = load_and_clean_data(TRAINING_DATA)
df, target_encodings = apply_feature_pipeline(df)

# Target is opening-week revenue; every other column is a feature.
y = df["opening_week_revenue"]
X = df.drop(columns=["opening_week_revenue"])

# 80/20 split with a fixed seed so the model ranking is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Feature columns (and their order) of the matrix the scaler was fitted on;
# passed to predict_avatar_with_best_model to align the Avatar row.
training_columns = X.columns


## MODEL EVALUATION
Models are evaluated using R2, MAE, and RMSE to compare overall fit and prediction accuracy.

- R2 measures how well the model explains the variance in opening week revenue.
- MAE measures the average absolute prediction error.
- RMSE penalizes larger prediction errors more heavily.


In [None]:
# Train every candidate on the scaled training split and score it on the
# scaled test split.
results_df = train_and_evaluate_all_models(
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)

# Print the ranking and keep the top row (it carries the fitted model).
best_model_row = display_model_comparison(results_df=results_df)

MODEL COMPARISON (Ranked by R2)

Rank   Model                R2         MAE             RMSE           
-----------------------------------------------------------------
1      LightGBM             0.6507     $11,881,880     $22,726,189    
2      AdaBoost             0.6350     $14,267,580     $23,230,591    
3      XGBoost              0.6280     $11,857,236     $23,452,367    
4      Gradient Boosting    0.6108     $11,999,962     $23,988,197    
5      Linear Regression    0.5887     $14,078,720     $24,659,901    
6      Decision Tree        0.3305     $14,634,047     $31,462,747    
7      SVR                  -0.1596    $22,544,496     $41,406,088    
8      Neural Network       -0.3941    $24,141,041     $45,401,144    

Best Model:
Model: LightGBM
R2: 0.6507
MAE: $11,881,880
RMSE: $22,726,189


In [None]:
# Title and year used for the TMDB search of the movie to predict.
AVATAR_MOVIE_NAME = "Avatar: Fire and Ash"
AVATAR_RELEASE_YEAR = 2025
# CSV file the scraped feature row is written to.
OUTPUT_FILE = "avatar_fire_and_ash_features.csv"


## Avatar: Fire and Ash DATA SCRAPING


In [None]:
def search_movie_on_tmdb(movie_name, year=None):
    """Search TMDB by title and return the ID of the first result.

    Args:
        movie_name: Title to search for.
        year: Optional year, sent as the ``year`` query parameter when truthy.

    Returns:
        The ``id`` of the first search result, or ``None`` when the response
        contains no results.

    Raises:
        requests.HTTPError: If TMDB answers with an error status.
        requests.Timeout: If TMDB does not respond within the timeout.
    """
    # Banner follows the requested title rather than a hard-coded one.
    print(f"SEARCHING FOR {movie_name.upper()}")

    url = "https://api.themoviedb.org/3/search/movie"
    params = {
        "api_key": TMDB_API_KEY,
        "query": movie_name,
        "language": "en-US"
    }

    if year:
        params["year"] = year

    # requests has no default timeout; without one a stalled connection
    # would block the notebook indefinitely.
    response = requests.get(url, params=params, timeout=10)
    response.raise_for_status()
    data = response.json()

    if not data.get("results"):
        print("✗ No results found on TMDB")
        return None

    print(f" Found {len(data['results'])} results\n")
    for i, movie in enumerate(data["results"][:5], 1):
        print(f"  {i}. {movie.get('title')} ({movie.get('release_date')})")

    # The first result in the response order is taken as the match.
    chosen = data["results"][0]
    print(f"\n Using: {chosen.get('title')} (ID={chosen.get('id')})\n")

    return chosen["id"]


def get_movie_details(movie_id):
    """Fetch a movie's TMDB detail record, including credits and release dates.

    Args:
        movie_id: TMDB movie ID.

    Returns:
        The decoded JSON response.

    Raises:
        requests.HTTPError: If TMDB answers with an error status.
        requests.Timeout: If TMDB does not respond within the timeout.
    """
    url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    params = {
        "api_key": TMDB_API_KEY,
        # Pull credits and release dates in the same round trip.
        "append_to_response": "credits,release_dates"
    }

    # requests has no default timeout; set one so a stalled connection
    # cannot hang the notebook.
    response = requests.get(url, params=params, timeout=10)
    response.raise_for_status()
    return response.json()


def extract_mpaa_rating(release_dates):
    """Return the first non-empty US certification, or ``"NR"`` if none exists.

    ``release_dates`` is expected to be a mapping with a ``"results"`` list of
    per-country entries, each carrying ``"iso_3166_1"`` and a
    ``"release_dates"`` list. A falsy argument yields ``"NR"``.
    """
    if not release_dates:
        return "NR"

    # Lazily walk US entries in order and stop at the first certification.
    us_certifications = (
        entry["certification"]
        for region in release_dates.get("results", [])
        if region.get("iso_3166_1") == "US"
        for entry in region.get("release_dates", [])
        if entry.get("certification")
    )
    return next(us_certifications, "NR")



def scrape_avatar_data(movie_name, year=None):
    """Look a movie up on TMDB and flatten its details into one feature row.

    Args:
        movie_name: Title passed to ``search_movie_on_tmdb``.
        year: Optional year passed to ``search_movie_on_tmdb``.

    Returns:
        A dict of features (title, budget, runtime, genres, country, rating,
        popularity, franchise flag, director, top-5 cast, companies), or
        ``None`` when the search finds no movie.
    """
    movie_id = search_movie_on_tmdb(movie_name, year)
    if not movie_id:
        return None

    details = get_movie_details(movie_id)

    genres = details.get("genres", [])
    genre_names = [g["name"] for g in genres]

    cast = details.get("credits", {}).get("cast", [])
    crew = details.get("credits", {}).get("crew", [])

    directors = [c["name"] for c in crew if c.get("job") == "Director"]

    prod_companies = details.get("production_companies", [])

    # The key can be present with an empty list, so a .get() default alone
    # does not protect the [0] lookup; fall back to a blank entry instead.
    countries = details.get("production_countries") or [{}]
    country = countries[0].get("iso_3166_1", "US")

    avatar_data = {
        "title": details.get("title", ""),
        "release_date": details.get("release_date", ""),
        "tmdb_id": details.get("id"),
        "imdb_id": details.get("imdb_id", ""),
        "budget": details.get("budget", 0),
        "runtime": details.get("runtime", 0),
        "genres": "|".join(genre_names),
        "country": country,
        "mpaa_rating": extract_mpaa_rating(details.get("release_dates")),
        "popularity": round(details.get("popularity", 0), 2),
        "vote_average": details.get("vote_average", 0),
        "is_franchise": 1 if details.get("belongs_to_collection") else 0,
        "director": directors[0] if directors else "Unknown Director",
        "actors": "|".join([a["name"] for a in cast[:5]]),
        "production_companies": "|".join([p["name"] for p in prod_companies]),
        # The first production company stands in for the distributor.
        "distributor_bom": prod_companies[0]["name"] if prod_companies else "Independent"
    }

    print("AVATAR DATA EXTRACTED")

    # The long pipe-joined fields are left out of the printed summary.
    for k, v in avatar_data.items():
        if k not in ["actors", "production_companies", "genres"]:
            print(f"{k:25s}: {v}")

    return avatar_data






# Scrape the Avatar feature row from TMDB and persist it as a one-row CSV.
avatar_data = scrape_avatar_data(AVATAR_MOVIE_NAME, AVATAR_RELEASE_YEAR)

if avatar_data:
    # Use a dedicated name so the training frame `df` built in the data
    # preparation cell is not overwritten by this one-row frame.
    avatar_features_df = pd.DataFrame([avatar_data])
    avatar_features_df.to_csv(OUTPUT_FILE, index=False)

    print(f"Data saved to: {OUTPUT_FILE}")
    print(f"Total features: {avatar_features_df.shape[1]}")

else:
    # "\F" was an invalid escape sequence; a leading newline was intended.
    print("\nFailed to scrape Avatar data")


SEARCHING FOR AVATAR: FIRE AND ASH
 Found 1 results

  1. Avatar: Fire and Ash (2025-12-17)

 Using: Avatar: Fire and Ash (ID=83533)

AVATAR DATA EXTRACTED
title                    : Avatar: Fire and Ash
release_date             : 2025-12-17
tmdb_id                  : 83533
imdb_id                  : tt1757678
budget                   : 400000000
runtime                  : 195
country                  : US
mpaa_rating              : PG-13
popularity               : 121.0
vote_average             : 0.0
is_franchise             : 1
director                 : James Cameron
distributor_bom          : 20th Century Studios
Data saved to: avatar_fire_and_ash_features.csv
Total features: 16


## FINAL PREDICTION


In [None]:
# NOTE(review): AVATAR_DATA is not defined in the cells visible here; the
# scraping cell writes its CSV to OUTPUT_FILE - confirm both name the same file.
avatar_df = prepare_avatar_data(AVATAR_DATA, target_encodings)

# Score the prepared Avatar row with the top-ranked model from the comparison.
prediction = predict_avatar_with_best_model(
    best_model_row=best_model_row,
    scaler=scaler,
    avatar_df=avatar_df,
    training_columns=training_columns,
)

print("Avatar: Fire and Ash – Opening Week Prediction")
print(f"Predicted opening week revenue: ${prediction:,.0f}")

Avatar: Fire and Ash – Opening Week Prediction
Predicted opening week revenue: $183,641,693
