In [None]:
import pandas as pd
import requests
from dotenv import load_dotenv
import os
from tqdm import tqdm

# Load environment variables (including the TMDb API key) from the dotenv file.
load_dotenv(dotenv_path='.env.example')
# TMDb API key; os.getenv returns None when the variable is not set.
TMDB_API_KEY = os.getenv('TMDB_API_KEY')

df = pd.read_csv('../data/data_extraccion.csv')
# Keep only rows that carry an IMDb id. The ids are stripped BEFORE the
# empty-string check so whitespace-only values are discarded too, and the
# explicit .copy() avoids assigning into a view of the original frame.
df = df[df['imdb_id'].notna()].copy()
df['imdb_id'] = df['imdb_id'].astype(str).str.strip()
df = df[df['imdb_id'] != '']
# Output file
extraction_data = 'dataset_total.csv'
# Per-batch accumulators for the fetched results
directors, top_actors, budgets, revenues, runtimes = [], [], [], [], []
def get_tmdb_data(imdb_id):
    """Query TMDb for the movie matching an IMDb id and return selected fields.

    Three endpoints are called in sequence: ``/find/{imdb_id}`` to resolve the
    TMDb id, ``/movie/{id}`` for budget/revenue/runtime, and
    ``/movie/{id}/credits`` for the crew and cast.

    Args:
        imdb_id: IMDb identifier used for the ``/find`` lookup.

    Returns:
        A tuple ``(director, actors, budget, revenue, runtime)`` where
        ``director`` is the name of the first crew member whose job is
        ``'Director'`` (or ``None``) and ``actors`` is a comma-separated
        string built from the first ten cast entries. Returns ``None`` when
        the lookup yields no movie results.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status from any of the three endpoints.
    """
    base_url = "https://api.themoviedb.org/3"
    request_timeout = 10  # seconds; prevents one stalled call from hanging the run

    def fetch_json(path, **query):
        """GET ``base_url + path`` with the API key and return the parsed JSON."""
        response = requests.get(
            f"{base_url}{path}",
            params={"api_key": TMDB_API_KEY, **query},
            timeout=request_timeout,
        )
        if not response.ok:
            # Raised by hand (instead of raise_for_status) so the message does
            # not embed the full URL, which contains the API key.
            raise requests.HTTPError(
                f"TMDb returned HTTP {response.status_code} for {path}",
                response=response,
            )
        return response.json()

    data = fetch_json(f"/find/{imdb_id}", external_source="imdb_id")
    movie_results = data.get('movie_results')
    if not movie_results:
        return None
    tmdb_id = movie_results[0]['id']

    # General details
    details = fetch_json(f"/movie/{tmdb_id}")
    budget = details.get('budget')
    revenue = details.get('revenue')
    runtime = details.get('runtime')

    # Credits (director and actors)
    credits = fetch_json(f"/movie/{tmdb_id}/credits")
    director = next(
        (
            crew_member.get('name')
            for crew_member in credits.get('crew', [])
            if crew_member.get('job') == 'Director'
        ),
        None,
    )
    cast_list = credits.get('cast', [])
    # Skip entries without a name so str.join never receives None.
    actor_names = (actor.get('name') for actor in cast_list[:10])
    actors = ', '.join(name for name in actor_names if name)
    return director, actors, budget, revenue, runtime
# Fetch TMDb data for every IMDb id (with a progress bar) and write the
# enriched rows to the output CSV one batch at a time.
batch_size = 1000
total_rows = len(df)
for processed, imdb_id in enumerate(
    tqdm(df['imdb_id'], desc="Procesando películas"), start=1
):
    # Start from "no data"; overwritten only when the lookup succeeds.
    director = actors = budget = revenue = runtime = None
    try:
        result = get_tmdb_data(imdb_id)
        if result:
            director, actors, budget, revenue, runtime = result
    except Exception as e:
        print(f"Error con ID {imdb_id}: {e}")
        director = actors = budget = revenue = runtime = None

    for bucket, value in zip(
        (directors, top_actors, budgets, revenues, runtimes),
        (director, actors, budget, revenue, runtime),
    ):
        bucket.append(value)

    # Keep accumulating until a batch is full or the last row has been handled.
    if processed % batch_size != 0 and processed != total_rows:
        continue

    # The pending results belong to the rows [first_row, processed) of df.
    first_row = processed - len(directors)
    temp_df = df.iloc[first_row:processed].assign(
        director=directors,
        top_actors=top_actors,
        budget_tmdb=budgets,
        revenue_tmdb=revenues,
        runtime_tmdb=runtimes,
    )

    # Append without a header when the file already exists; otherwise create
    # it and write the header row.
    already_on_disk = os.path.exists(extraction_data)
    temp_df.to_csv(
        extraction_data,
        mode='a' if already_on_disk else 'w',
        index=False,
        header=not already_on_disk,
    )

    # Empty the accumulators for the next batch.
    for bucket in (directors, top_actors, budgets, revenues, runtimes):
        bucket.clear()
print(f":marca_de_verificación_blanca: Datos guardados en {extraction_data}")