# Additional Features for Dimension Table
This notebook enriches the dimension table with budget, cast, sequel/collection, MPAA rating, and poster information fetched from the TMDb API.

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import requests
import time
import os
import sys
import subprocess

# Loading Dataset

In [None]:
# 1. AUTO-INSTALL GDOWN (If missing)
# gdown is only used by this cell to pull the input CSV from Google Drive, so it is
# installed on demand into the environment of the running kernel (sys.executable).
try:
    import gdown
except ImportError:
    print("gdown not found. Installing...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "gdown"])
    import gdown

# 2. FILE MAPPING
# Map each local filename to its Google Drive share link
file = {
    "dimension_table_v1.csv": "https://drive.google.com/file/d/1WyICY140G7DEtJ9oCjStAdFyKnIxsXr-/view?usp=share_link"}

# 3. DOWNLOADER LOOP
for filename, drive_link in file.items():
    if os.path.exists(filename):
        print(f"Found {filename} locally. Skipping download.")
        continue

    print(f"Downloading {filename}...")

    # Extract the file ID (the path segment after '/d/'). Only the parsing is guarded,
    # so an unrelated IndexError raised during the download is not misreported as a bad link.
    try:
        file_id = drive_link.split('/d/')[1].split('/')[0]
    except IndexError:
        print(f"Error: Could not parse ID for {filename}. Check the link.")
        continue

    url = f'https://drive.google.com/uc?id={file_id}'

    # Download (quiet=False shows the progress bar)
    gdown.download(url, filename, quiet=False)

# 4. LOAD DATA
print("\nLoading Dataframe...")

# Fail here, with a clear message, instead of leaving `df` undefined and letting a
# later cell crash with a NameError.
if not os.path.exists("dimension_table_v1.csv"):
    raise FileNotFoundError(
        "dimension_table_v1.csv is missing: the download failed or the Drive link is invalid."
    )

df = pd.read_csv("dimension_table_v1.csv")
print("Dimension table V1 loaded.")

Downloading dimension_table_v1.csv...


Downloading...
From: https://drive.google.com/uc?id=1WyICY140G7DEtJ9oCjStAdFyKnIxsXr-
To: /content/dimension_table_v1.csv
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 544k/544k [00:00<00:00, 8.02MB/s]


Loading Dataframe...
Dimension table V1 loaded.





In [None]:
# Preview the first rows of the loaded dimension table
df.head()

Unnamed: 0,tconst,Release,Opening_Week_Revenue,Release_Date,Theaters,Release_Year,Distributor,genres,runtimeMinutes,directors,Special_Day
0,tt1034303,Defiance,195622,2009-01-01,2,2009,Paramount Vantage,"Action,Drama,History",137,nm0001880,New Year's Day
1,tt0959337,Revolutionary Road,76319,2009-01-01,3,2009,Paramount Vantage,"Drama,Romance",119,nm0005222,New Year's Day
2,tt0841925,Silent Light,19633,2009-01-07,1,2009,Palisades Tartan,"Drama,Romance",136,nm1196161,
3,tt0795438,Not Easily Broken,6143981,2009-01-09,724,2009,Screen Gems,"Drama,Romance",99,nm0004886,
4,tt0901476,Bride Wars,25827125,2009-01-09,3226,2009,Twentieth Century Fox,"Comedy,Romance",89,nm0935095,


In [None]:
# CONFIGURATION

# SECURITY: the TMDb read access token is a credential, so it is read from the
# environment instead of being stored in the notebook. Set it before running, e.g.
#   export TMDB_READ_ACCESS_TOKEN="<token>"              (shell, before starting the kernel)
#   os.environ["TMDB_READ_ACCESS_TOKEN"] = "<token>"     (ad-hoc cell; do not commit it)
# Any token that was ever saved in plain text in this notebook should be revoked
# and regenerated.
TMDB_READ_ACCESS_TOKEN = os.environ.get('TMDB_READ_ACCESS_TOKEN', '').strip()
if not TMDB_READ_ACCESS_TOKEN:
    print(
        "WARNING: TMDB_READ_ACCESS_TOKEN is not set; TMDb requests will be rejected as unauthorized.",
        file=sys.stderr,
    )

# CSV used to persist enrichment progress so that a later run can resume from it
CHECKPOINT_FILE = 'dimension_table_checkpoint.csv'

TMDB_API_BASE_URL = 'https://api.themoviedb.org/3'

# Headers sent with every TMDb request (JSON response, bearer-token authentication)
HEADERS = {
    "accept": "application/json",
    "Authorization": f"Bearer {TMDB_READ_ACCESS_TOKEN}"
}

# ========================================
# CORE FUNCTIONS
# ========================================

def _tmdb_get_json(url, params=None, max_attempts=3):
    """GET a TMDb endpoint and return the decoded JSON body.

    Responses with HTTP 429 or a 5xx status are retried up to `max_attempts` times.
    Any other error status, or a retryable status on the last attempt, raises via
    `raise_for_status()`.
    """
    for attempt in range(1, max_attempts + 1):
        response = requests.get(url, headers=HEADERS, params=params, timeout=10)
        transient = response.status_code == 429 or response.status_code >= 500
        if transient and attempt < max_attempts:
            # Use Retry-After when it is given in whole seconds, otherwise back off
            # exponentially (0.5s, 1s, ...). Capped so one movie cannot stall the run.
            retry_after = response.headers.get('Retry-After', '')
            delay = float(retry_after) if retry_after.isdigit() else 0.5 * 2 ** (attempt - 1)
            time.sleep(min(delay, 10.0))
            continue
        response.raise_for_status()
        return response.json()


def _parse_movie_details(data):
    """Turn a TMDb movie-details payload (with credits and release_dates) into the result dict."""
    # ===== PARSE BUDGET =====
    # `or 0` also covers an explicit null; a non-positive budget is stored as missing
    budget = data.get('budget') or 0
    budget = budget if budget > 0 else np.nan

    # ===== PARSE POSTER =====
    poster_path = data.get('poster_path')
    poster_url = f"https://image.tmdb.org/t/p/original{poster_path}" if poster_path else np.nan

    # ===== PARSE CAST =====
    cast = (data.get('credits') or {}).get('cast') or []
    # Names of the first 15 cast entries; entries without a name are skipped
    top_15_actors = [actor['name'] for actor in cast[:15] if actor.get('name')]
    actors_str = ', '.join(top_15_actors) if top_15_actors else np.nan
    cast_count = len(cast)

    # ===== PARSE COLLECTION INFO (Sequel Detection) =====
    # Membership in a collection is used as a proxy for "sequel/franchise movie"
    belongs_to_collection = data.get('belongs_to_collection')
    is_sequel = 0
    collection_id = np.nan
    if belongs_to_collection:
        is_sequel = 1
        raw_collection_id = belongs_to_collection.get('id')
        collection_id = np.nan if raw_collection_id is None else raw_collection_id

    # ===== PARSE MPAA RATING =====
    # Use the first non-empty certification of the first US entry
    mpaa_rating = np.nan
    release_dates_data = (data.get('release_dates') or {}).get('results') or []
    for country_data in release_dates_data:
        if country_data.get('iso_3166_1') != 'US':
            continue
        for release in country_data.get('release_dates') or []:
            # `or ''` guards against an explicit null certification
            certification = (release.get('certification') or '').strip()
            if certification:
                mpaa_rating = certification
                break
        break

    return {
        'Budget': budget,
        'actors': actors_str,
        'cast_count': cast_count,
        'poster_url': poster_url,
        'is_sequel': is_sequel,
        'collection_id': collection_id,
        'mpaa_rating': mpaa_rating
    }


def get_comprehensive_movie_data(imdb_id):
    """
    Fetches Budget, Cast, Sequel Info, Collection, MPAA Rating, and Poster in just 2 API calls.

    Call 1 resolves the IMDb ID to a TMDb ID via /find; call 2 fetches the movie
    details with credits and release dates appended.

    Args:
        imdb_id: IMDb title ID (e.g. 'tt1034303'). Missing values (None/NaN), blank
            strings and the literals 'nan'/'none' are treated as "no ID".

    Returns:
        dict with keys 'Budget', 'actors', 'cast_count', 'poster_url', 'is_sequel',
        'collection_id', 'mpaa_rating'. When there is no ID, no TMDb match, or the
        request fails, the defaults are returned (NaN for values, 0 for counts/flags).
        Errors are printed rather than raised, so callers cannot tell "request
        failed" from "not found".
    """
    # Default empty return structure
    empty_result = {
        'Budget': np.nan,
        'actors': np.nan,
        'cast_count': 0,
        'poster_url': np.nan,
        'is_sequel': 0,
        'collection_id': np.nan,
        'mpaa_rating': np.nan
    }

    if pd.isna(imdb_id) or not imdb_id:
        return empty_result

    imdb_id = str(imdb_id).strip()

    # A missing ID that went through `.astype(str)` arrives as the literal 'nan';
    # do not spend API calls on it (or on a whitespace-only ID).
    if not imdb_id or imdb_id.lower() in ('nan', 'none'):
        return empty_result

    try:
        # --- CALL 1: Find TMDB ID from IMDb ID ---
        found = _tmdb_get_json(
            f"{TMDB_API_BASE_URL}/find/{imdb_id}",
            params={"external_source": "imdb_id"},
        )
        results = found.get('movie_results') or []
        if not results:
            return empty_result

        tmdb_id = results[0]['id']

        # --- CALL 2: Details, credits and release dates in one request ---
        data = _tmdb_get_json(
            f"{TMDB_API_BASE_URL}/movie/{tmdb_id}",
            params={"append_to_response": "credits,release_dates"},
        )
        return _parse_movie_details(data)

    except Exception as e:
        # Best effort: one bad movie must not abort the whole enrichment run
        print(f"⚠️ Error on {imdb_id}: {e}")
        return empty_result
    finally:
        time.sleep(0.05)  # Rate limiting (safety buffer)

# ========================================
# MAIN EXECUTION
# ========================================

print("=" * 60)
print("🎬 COMPREHENSIVE MOVIE DATA ENRICHMENT")
print("=" * 60)

# Columns that receive strings. They must have object dtype: a column created from
# np.nan alone is float64, and writing a string into it triggers pandas'
# "incompatible dtype" FutureWarning (an error in future pandas versions).
TEXT_COLUMNS = ['actors', 'poster_url', 'mpaa_rating']
# Columns copied verbatim from each fetched record into the dataframe
FETCHED_COLUMNS = ['Budget', 'actors', 'cast_count', 'poster_url',
                   'is_sequel', 'collection_id', 'mpaa_rating']
PROGRESS_EVERY = 50      # print a progress line every N processed movies
CHECKPOINT_EVERY = 500   # write the checkpoint CSV every N processed movies


def _save_checkpoint(frame, path):
    """Write `frame` to `path` via a temp file + rename, so a crash mid-write cannot corrupt it."""
    tmp_path = f"{path}.tmp"
    frame.to_csv(tmp_path, index=False)
    os.replace(tmp_path, path)


# 1. LOAD DATA (Resume Logic)
if os.path.exists(CHECKPOINT_FILE):
    print(f"🔄 Checkpoint found! Resuming from: {CHECKPOINT_FILE}")
    df = pd.read_csv(CHECKPOINT_FILE)
    df['tconst'] = df['tconst'].astype(str)
    # read_csv infers float64 for a text column that is still completely empty
    df = df.astype({col: object for col in TEXT_COLUMNS})
else:
    print("🆕 Starting fresh with your dataframe (df)")
    # df is already loaded in your environment - just ensure tconst is string
    df = df.copy()  # Make a copy to avoid modifying original
    df['tconst'] = df['tconst'].astype(str)

    # Initialize new columns
    new_cols = {
        'Budget': np.nan,
        'actors': np.nan,
        'cast_count': 0,
        'poster_url': np.nan,
        'is_sequel': 0,
        'collection_id': np.nan,
        'mpaa_rating': np.nan,
        'data_fetched': 0
    }

    for col, default_val in new_cols.items():
        if col in TEXT_COLUMNS:
            df[col] = pd.Series(default_val, index=df.index, dtype=object)
        else:
            df[col] = default_val

# 2. IDENTIFY REMAINING WORK
remaining_indices = df[df['data_fetched'] != 1].index
total_to_process = len(remaining_indices)

print(f"Movies already processed: {len(df) - total_to_process}")
print(f"Movies remaining: {total_to_process}")
print("-" * 60)

# Refuse to start without credentials: every request would fail, and the rows would
# still be marked as fetched and written to the checkpoint.
if total_to_process and not TMDB_READ_ACCESS_TOKEN:
    raise RuntimeError("TMDB_READ_ACCESS_TOKEN is empty; set it before running the enrichment.")

# 3. PROCESSING LOOP
start_time = time.time()
count = 0

try:
    for idx in remaining_indices:
        # Fetch Data
        movie = get_comprehensive_movie_data(df.at[idx, 'tconst'])

        # Update Row
        for col in FETCHED_COLUMNS:
            df.at[idx, col] = movie[col]

        # Mark as done
        # NOTE(review): the fetcher returns the same empty record for "request failed"
        # and "not on TMDb", so failed rows are also marked done and are not retried
        # when resuming from the checkpoint.
        df.at[idx, 'data_fetched'] = 1

        count += 1

        # Progress Update
        if count % PROGRESS_EVERY == 0:
            elapsed = time.time() - start_time
            rate = count / elapsed if elapsed > 0 else 0
            remaining_est = (total_to_process - count) / rate if rate > 0 else 0
            print(f"⚡ Processed {count}/{total_to_process} | Rate: {rate:.1f} movies/sec | ETA: {remaining_est/60:.1f} min")

        # Checkpoint Save
        if count % CHECKPOINT_EVERY == 0:
            _save_checkpoint(df, CHECKPOINT_FILE)
            print(f"💾 Checkpoint saved at {count} processed movies")
finally:
    # Runs on normal completion AND on interrupt/error, so at most the row in flight
    # is lost instead of everything since the last periodic checkpoint.
    _save_checkpoint(df, CHECKPOINT_FILE)

# 4. FINAL SAVES
print("-" * 60)
print("💾 Final checkpoint saved")

# Save final output (drop the tracking column)
df_final = df.drop(columns=['data_fetched'])

# ========================================
# STATISTICS & SUMMARY
# ========================================

total_time = time.time() - start_time
n_movies = len(df)


def _pct(n):
    """Share of all movies in percent; 0.0 for an empty table instead of dividing by zero."""
    return n / n_movies * 100 if n_movies else 0.0


avg_rate = count / total_time if total_time > 0 else 0.0

print(f"\n⏱️ Processing Time: {total_time/60:.1f} minutes")
print(f"⚡ Average Rate: {avg_rate:.1f} movies/second")

n_budget = df['Budget'].notna().sum()
print("\n💰 Budget Data:")
print(f"   Movies with budget: {n_budget} ({_pct(n_budget):.1f}%)")
if n_budget:
    print(f"   Average budget: ${df['Budget'].mean():,.0f}")
    print(f"   Median budget: ${df['Budget'].median():,.0f}")

n_cast = df['actors'].notna().sum()
print("\n🎭 Cast Data:")
print(f"   Movies with cast: {n_cast} ({_pct(n_cast):.1f}%)")
print(f"   Average cast size: {df['cast_count'].mean():.1f} actors")

n_posters = df['poster_url'].notna().sum()
print("\n🎬 Poster Data:")
print(f"   Movies with posters: {n_posters} ({_pct(n_posters):.1f}%)")

n_sequels = df['is_sequel'].sum()
print("\n🎥 Sequel/Franchise Data:")
print(f"   Movies that are sequels/franchises: {n_sequels} ({_pct(n_sequels):.1f}%)")
print(f"   Unique collections: {df['collection_id'].nunique()}")

n_rated = df['mpaa_rating'].notna().sum()
print("\n🔞 MPAA Rating Data:")
print(f"   Movies with MPAA rating: {n_rated} ({_pct(n_rated):.1f}%)")
if n_rated:
    print("\n   Rating Distribution:")
    rating_counts = df['mpaa_rating'].value_counts()
    # Separate loop variable: reusing `count` here would clobber the processed counter
    for rating, rating_total in rating_counts.items():
        print(f"      {rating}: {rating_total} ({_pct(rating_total):.1f}%)")

# Sample results
print("\n" + "=" * 60)
print("SAMPLE RESULTS (First 5 movies with data)")
print("=" * 60)
sample = df[df['actors'].notna()].head(5)
sample_cols = ['Release', 'Budget', 'cast_count', 'is_sequel', 'collection_id', 'mpaa_rating']
print(sample[sample_cols].to_string(index=False))

print("\n" + "=" * 60)

üé¨ COMPREHENSIVE MOVIE DATA ENRICHMENT
üÜï Starting fresh with your dataframe (df)
Movies already processed: 0
Movies remaining: 4840
------------------------------------------------------------


  df.at[idx, 'actors'] = data['actors']
  df.at[idx, 'poster_url'] = data['poster_url']
  df.at[idx, 'mpaa_rating'] = data['mpaa_rating']


‚ö° Processed 50/4840 | Rate: 5.4 movies/sec | ETA: 14.7 min
‚ö° Processed 100/4840 | Rate: 5.3 movies/sec | ETA: 14.9 min
‚ö° Processed 150/4840 | Rate: 5.3 movies/sec | ETA: 14.7 min
‚ö° Processed 200/4840 | Rate: 5.3 movies/sec | ETA: 14.7 min
‚ö° Processed 250/4840 | Rate: 5.3 movies/sec | ETA: 14.4 min
‚ö° Processed 300/4840 | Rate: 5.3 movies/sec | ETA: 14.3 min
‚ö° Processed 350/4840 | Rate: 5.3 movies/sec | ETA: 14.1 min
‚ö° Processed 400/4840 | Rate: 5.3 movies/sec | ETA: 13.9 min
‚ö° Processed 450/4840 | Rate: 5.3 movies/sec | ETA: 13.8 min
‚ö° Processed 500/4840 | Rate: 5.3 movies/sec | ETA: 13.6 min
üíæ Checkpoint saved at 500 processed movies
‚ö° Processed 550/4840 | Rate: 5.3 movies/sec | ETA: 13.5 min
‚ö° Processed 600/4840 | Rate: 5.3 movies/sec | ETA: 13.4 min
‚ö° Processed 650/4840 | Rate: 5.3 movies/sec | ETA: 13.3 min
‚ö° Processed 700/4840 | Rate: 5.2 movies/sec | ETA: 13.2 min
‚ö° Processed 750/4840 | Rate: 5.2 movies/sec | ETA: 13.0 min
‚ö° Processed 800/4840 | 

In [None]:
# Preview the enriched table (the 'data_fetched' tracking column has been dropped)
df_final.head()

Unnamed: 0,tconst,Release,Opening_Week_Revenue,Release_Date,Theaters,Release_Year,Distributor,genres,runtimeMinutes,directors,Special_Day,Budget,actors,cast_count,poster_url,is_sequel,collection_id,mpaa_rating
0,tt1034303,Defiance,195622,2009-01-01,2,2009,Paramount Vantage,"Action,Drama,History",137,nm0001880,New Year's Day,32000000.0,"Daniel Craig, Liev Schreiber, Jamie Bell, Alex...",73,https://image.tmdb.org/t/p/original/z6yNHcQOve...,0,,R
1,tt0959337,Revolutionary Road,76319,2009-01-01,3,2009,Paramount Vantage,"Drama,Romance",119,nm0005222,New Year's Day,35000000.0,"Leonardo DiCaprio, Kate Winslet, Kathy Bates, ...",60,https://image.tmdb.org/t/p/original/cvkD3yiVXL...,0,,R
2,tt0841925,Silent Light,19633,2009-01-07,1,2009,Palisades Tartan,"Drama,Romance",136,nm1196161,,,"Cornelio Wall, Miriam Toews, Maria Pankratz, P...",14,https://image.tmdb.org/t/p/original/kniOoQky3G...,0,,
3,tt0795438,Not Easily Broken,6143981,2009-01-09,724,2009,Screen Gems,"Drama,Romance",99,nm0004886,,,"Morris Chestnut, Taraji P. Henson, Maeve Quinl...",19,https://image.tmdb.org/t/p/original/4HJ39DnQyf...,0,,PG
4,tt0901476,Bride Wars,25827125,2009-01-09,3226,2009,Twentieth Century Fox,"Comedy,Romance",89,nm0935095,,30000000.0,"Kate Hudson, Anne Hathaway, Bryan Greenberg, C...",38,https://image.tmdb.org/t/p/original/eyXT8tmUoy...,0,,PG


In [None]:
# Number of movies for which a budget was retrieved (count() ignores missing values)
df_final['Budget'].count()

np.int64(2811)

In [None]:
# Write the enriched dimension table to CSV, omitting the DataFrame index
df_final.to_csv('dimension_table_final.csv', index=False)