 --- Extracción de Datos de Artistas y Enriquecimiento con YouTube API ---

In [8]:
                             
import os
import pandas as pd
from sqlalchemy import create_engine, text
                                      
from sqlalchemy import Integer, String, Text, TIMESTAMP, Boolean, VARCHAR, Float, BigInteger                     
from dotenv import load_dotenv
import logging
import time
import numpy as np
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
from IPython.display import display
import warnings

In [9]:
                                
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 150)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning, module='pandas')

In [10]:
                                                           
logging.info("Cargando variables de entorno...")
dotenv_path = '/home/nicolas/Escritorio/workshops/workshop_2/env/.env'                         
load_dotenv(dotenv_path=dotenv_path)

                                       
POSTGRES_USER = os.getenv('POSTGRES_USER')
POSTGRES_PASSWORD = os.getenv('POSTGRES_PASSWORD')
POSTGRES_HOST = os.getenv('POSTGRES_HOST')
POSTGRES_PORT = os.getenv('POSTGRES_PORT')
POSTGRES_NAME = os.getenv('POSTGRES_DB')
TABLE_GRAMMY = 'the_grammy_awards_clean'
TABLE_SPOTIFY = 'spotify_dataset_clean'                                    
TABLE_YOUTUBE = 'youtube_stats'                                        

                                                 
YOUTUBE_CLIENT_ID = os.getenv("YOUTUBE_CLIENT_ID")
YOUTUBE_CLIENT_SECRET = os.getenv("YOUTUBE_CLIENT_SECRET")
YOUTUBE_PROJECT_ID = os.getenv("YOUTUBE_PROJECT_ID")
YOUTUBE_AUTH_URI = os.getenv("YOUTUBE_AUTH_URI")
YOUTUBE_TOKEN_URI = os.getenv("YOUTUBE_TOKEN_URI")
YOUTUBE_REDIRECT_URIS= os.getenv("YOUTUBE_REDIRECT_URIS")

                              
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
YOUTUBE_SCOPES = ["https://www.googleapis.com/auth/youtube.readonly"]

                             
PROGRESS_CSV_PATH = '/home/nicolas/Escritorio/workshops/workshop_2/data/youtube_stats.csv'                                 
CHUNK_SIZE_UPLOAD = 40000                             


                                
                                                  
if not all([POSTGRES_USER, POSTGRES_PASSWORD, POSTGRES_HOST, POSTGRES_PORT, POSTGRES_NAME]):
    logging.error("Faltan variables de entorno para la base de datos en " + dotenv_path)
    raise ValueError("Variables de entorno de DB incompletas.")
if not YOUTUBE_CLIENT_ID or not YOUTUBE_CLIENT_SECRET:
    logging.error("Faltan YOUTUBE_CLIENT_ID y/o YOUTUBE_CLIENT_SECRET en " + dotenv_path)
    raise ValueError("Variables de entorno de YouTube incompletas.")

logging.info("Variables de entorno cargadas.")

                                
if not all([POSTGRES_USER, POSTGRES_PASSWORD, POSTGRES_HOST, POSTGRES_PORT, POSTGRES_NAME]):
    logging.error("Faltan variables de entorno para la base de datos en " + dotenv_path)
    raise ValueError("Variables de entorno de DB incompletas.")
if not YOUTUBE_CLIENT_ID or not YOUTUBE_CLIENT_SECRET:
    logging.error("Faltan YOUTUBE_CLIENT_ID y/o YOUTUBE_CLIENT_SECRET en " + dotenv_path)
    raise ValueError("Variables de entorno de YouTube incompletas.")

logging.info("Variables de entorno cargadas.")

2025-04-11 12:02:51,030 - INFO - Cargando variables de entorno...
2025-04-11 12:02:51,033 - INFO - Variables de entorno cargadas.
2025-04-11 12:02:51,034 - INFO - Variables de entorno cargadas.


In [11]:
                                
engine = None
try:
    logging.info(f"Creando motor SQLAlchemy para la base de datos '{POSTGRES_NAME}'...")
    db_url = f'postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_NAME}'
    engine = create_engine(db_url)
    logging.info(f"Motor SQLAlchemy creado exitosamente.")
except Exception as e:
    logging.error(f"Error al crear el motor SQLAlchemy: {e}")
    raise


2025-04-11 12:02:51,040 - INFO - Creando motor SQLAlchemy para la base de datos 'artists'...
2025-04-11 12:02:51,041 - INFO - Motor SQLAlchemy creado exitosamente.


In [12]:
                                                           
df_artists_spotify = None
df_artists_grammy = None
unique_artists_full_list = []

if engine:
    try:
        logging.info(f"Extrayendo artistas únicos de '{TABLE_SPOTIFY}'...")
        query_spotify = f'SELECT DISTINCT artists AS artist_name FROM "{TABLE_SPOTIFY}" WHERE artists IS NOT NULL'
        df_artists_spotify = pd.read_sql_query(query_spotify, con=engine)
        logging.info(f"Artistas extraídos de Spotify: {len(df_artists_spotify)}")

        logging.info(f"Extrayendo artistas únicos de '{TABLE_GRAMMY}'...")
        query_grammy = f'SELECT DISTINCT artist AS artist_name FROM "{TABLE_GRAMMY}" WHERE artist IS NOT NULL AND artist != \'No Especificado\''                      
        df_artists_grammy = pd.read_sql_query(query_grammy, con=engine)
        logging.info(f"Artistas extraídos de Grammys: {len(df_artists_grammy)}")

                                                      
        df_artists_spotify.rename(columns={'artists': 'artist_name'}, inplace=True)
        df_artists_grammy.rename(columns={'artist': 'artist_name'}, inplace=True)
                                   
        df_combined_artists = pd.concat([df_artists_spotify, df_artists_grammy], ignore_index=True)
                                                                      
        unique_artists_series = df_combined_artists['artist_name'].drop_duplicates()
                                                          
        excluded_names = ['Various Artists', '(Various Artists)', 'No Especificado', None, np.nan]
                                                              
        unique_artists_series = unique_artists_series.astype(str)
        unique_artists_series = unique_artists_series[~unique_artists_series.isin(excluded_names)]
                                             
        unique_artists_full_list = unique_artists_series.str.strip().unique().tolist()
                                                                  
        unique_artists_full_list = [artist for artist in unique_artists_full_list if artist]

        logging.info(f"Lista completa de artistas únicos a procesar: {len(unique_artists_full_list)}")
                                                                                                 

    except Exception as e:
        logging.error(f"Error al extraer artistas de la base de datos: {e}")
        unique_artists_full_list = []

else:
    logging.error("Engine no disponible. No se pueden extraer artistas.")

2025-04-11 12:02:51,065 - INFO - Extrayendo artistas únicos de 'spotify_dataset_clean'...
2025-04-11 12:02:51,189 - INFO - Artistas extraídos de Spotify: 31437
2025-04-11 12:02:51,189 - INFO - Extrayendo artistas únicos de 'the_grammy_awards_clean'...
2025-04-11 12:02:51,193 - INFO - Artistas extraídos de Grammys: 1658
2025-04-11 12:02:51,209 - INFO - Lista completa de artistas únicos a procesar: 32690


In [13]:
                                                                    
df_processed_so_far = pd.DataFrame()                                     
artists_already_processed = set()

try:
    logging.info(f"Intentando cargar progreso anterior desde: {PROGRESS_CSV_PATH}")
    df_processed_so_far = pd.read_csv(PROGRESS_CSV_PATH)
                                                                                          
    if 'artist_query' in df_processed_so_far.columns:
        artists_already_processed = set(df_processed_so_far['artist_query'].dropna().unique())
        logging.info(f"Progreso cargado. {len(artists_already_processed)} artistas ya procesados.")
    else:
        logging.warning(f"El archivo CSV de progreso no contiene la columna 'artist_query'. Se procesarán todos.")
        df_processed_so_far = pd.DataFrame()                                       

except FileNotFoundError:
    logging.info("No se encontró archivo de progreso previo. Se iniciará desde cero.")
except Exception as e:
    logging.error(f"Error al cargar el archivo de progreso CSV: {e}. Se iniciará desde cero.")
    df_processed_so_far = pd.DataFrame()                                       

                                                            
artists_to_process = [artist for artist in unique_artists_full_list if artist not in artists_already_processed]
logging.info(f"Artistas pendientes de procesar en esta ejecución: {len(artists_to_process)}")

2025-04-11 12:02:51,216 - INFO - Intentando cargar progreso anterior desde: /home/nicolas/Escritorio/workshops/workshop_2/data/youtube_stats.csv
2025-04-11 12:02:51,219 - INFO - Progreso cargado. 98 artistas ya procesados.
2025-04-11 12:02:51,234 - INFO - Artistas pendientes de procesar en esta ejecución: 32592


In [14]:
                                                                             
youtube = None
if artists_to_process:                                           
    logging.info("Iniciando autenticación con YouTube API...")
    try:
        os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
        redirect_uris = [uri.strip() for uri in YOUTUBE_REDIRECT_URIS.split(',') if uri.strip()]
        client_config = { "installed": {                                              
                 "client_id": YOUTUBE_CLIENT_ID, "client_secret": YOUTUBE_CLIENT_SECRET,
                 "project_id": YOUTUBE_PROJECT_ID, "auth_uri": YOUTUBE_AUTH_URI,
                 "token_uri": YOUTUBE_TOKEN_URI,
                 "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
                 "redirect_uris": redirect_uris }}

        flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_config(client_config, YOUTUBE_SCOPES)
        credentials = flow.run_local_server(port=0)
        youtube = googleapiclient.discovery.build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, credentials=credentials)
        logging.info("Autenticación con YouTube API exitosa.")
    except Exception as e:
        logging.error(f"Error durante la autenticación con YouTube API: {e}")
        youtube = None
else:
    logging.info("No hay nuevos artistas para procesar en esta ejecución.")

2025-04-11 12:02:51,243 - INFO - Iniciando autenticación con YouTube API...


Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=571313439711-2tf8le0cpicfl5bcsc98ot6g4de5b4tf.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A42151%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fyoutube.readonly&state=2VuTuwB84pwxweUPH1GNsLScuzx8uq&access_type=offline
Abriendo en una sesión existente del navegador


2025-04-11 12:02:57,927 - INFO - "GET /?state=2VuTuwB84pwxweUPH1GNsLScuzx8uq&code=4/0Ab_5qllpE2_u3B8DRvjUAYY0KXal07jUhlDRo5f5gAIljwlsaooJrAxiL_8artWb39cgSQ&scope=https://www.googleapis.com/auth/youtube.readonly HTTP/1.1" 200 65
2025-04-11 12:02:58,297 - INFO - file_cache is only supported with oauth2client<4.0.0
2025-04-11 12:02:58,300 - INFO - Autenticación con YouTube API exitosa.


In [15]:
                                                                        
                                                                         
                                                            
                                                                       
                                                                         

def search_for_channel(youtube_service, artist_name):
    """Busca un canal por nombre y devuelve el ID del primer resultado."""
    try:
                                                                                      
        search_query = f"{artist_name} Topic"
        logging.info(f"  Buscando canal para: '{artist_name}' (Query: '{search_query}')...")
        request = youtube_service.search().list(
            part="snippet",
            q=search_query,
            type="channel",
            maxResults=1
        )
        response = request.execute()
        if response.get('items'):
            channel_id = response['items'][0]['id']['channelId']
            channel_title = response['items'][0]['snippet']['title']
            logging.info(f"    Canal encontrado: '{channel_title}' (ID: {channel_id})")
            return channel_id
        else:
                                                             
            logging.info(f"  No se encontró canal con 'Topic'. Intentando búsqueda directa para '{artist_name}'...")
            request = youtube_service.search().list(
                part="snippet",
                q=artist_name,
                type="channel",
                maxResults=1
            )
            response = request.execute()
            if response.get('items'):
                channel_id = response['items'][0]['id']['channelId']
                channel_title = response['items'][0]['snippet']['title']
                logging.info(f"    Canal encontrado: '{channel_title}' (ID: {channel_id})")
                return channel_id
            else:
                logging.warning(f"  No se encontró canal para '{artist_name}' en búsqueda directa tampoco.")
                return None
    except googleapiclient.errors.HttpError as e:
        if 'quotaExceeded' in str(e):
             raise e                                                  
        logging.error(f"  Error HTTP buscando canal para '{artist_name}': {e}")
        return None
    except Exception as e:
        logging.error(f"  Error inesperado buscando canal para '{artist_name}': {e}")
        return None

def get_channel_stats(youtube_service, channel_id):
    """Obtiene estadísticas de un canal por su ID."""
    if not channel_id: return None
    try:
        logging.info(f"    Obteniendo estadísticas para Channel ID: {channel_id}")
        request = youtube_service.channels().list(
            part="statistics,snippet",
            id=channel_id
        )
        response = request.execute()
        if response.get('items'):
            item = response['items'][0]
            stats = item.get('statistics', {})
            snippet = item.get('snippet', {})
                                                                           
            try: subscriber_count = int(stats.get('subscriberCount', 0))
            except (ValueError, TypeError): subscriber_count = 0
            try: view_count = int(stats.get('viewCount', 0))
            except (ValueError, TypeError): view_count = 0
            try: video_count = int(stats.get('videoCount', 0))
            except (ValueError, TypeError): video_count = 0

            return {
                'channel_title_verified': snippet.get('title'),
                'subscriber_count': subscriber_count if not stats.get('hiddenSubscriberCount', False) else -1,                           
                'view_count': view_count,
                'video_count': video_count,
            }
        else:
            logging.warning(f"    No se encontraron estadísticas para el channel ID: {channel_id}")
            return None
    except googleapiclient.errors.HttpError as e:
        if 'quotaExceeded' in str(e): raise e           
        logging.error(f"    Error HTTP obteniendo estadísticas para {channel_id}: {e}")
        return None
    except Exception as e:
        logging.error(f"    Error inesperado obteniendo estadísticas para {channel_id}: {e}")
        return None

def search_top_videos(youtube_service, channel_id):
    """Busca los top 10 videos musicales más vistos de un canal."""
    video_ids = []
    if not channel_id: return []
    try:
        logging.info(f"    Buscando top 10 videos musicales para Channel ID: {channel_id}")
        request = youtube_service.search().list(
            part="id",
            channelId=channel_id,
            order="viewCount",
            type="video",
            videoCategoryId="10",         
            maxResults=10
        )
        response = request.execute()
        for item in response.get('items', []):
            video_ids.append(item['id']['videoId'])
        logging.info(f"      Se encontraron {len(video_ids)} IDs de videos musicales top.")
        return video_ids
    except googleapiclient.errors.HttpError as e:
        if 'quotaExceeded' in str(e): raise e           
                                                                                   
        if 'videoCategoryId filter is not supported' in str(e):
             logging.warning(f"      Búsqueda por categoría musical no soportada para {channel_id}. Intentando sin categoría...")
             try:
                 request = youtube_service.search().list(
                     part="id", channelId=channel_id, order="viewCount", type="video", maxResults=10)
                 response = request.execute()
                 for item in response.get('items', []): video_ids.append(item['id']['videoId'])
                 logging.info(f"      Se encontraron {len(video_ids)} IDs de videos top (sin filtro de categoría).")
                 return video_ids
             except Exception as inner_e:
                 logging.error(f"      Error en búsqueda sin categoría para {channel_id}: {inner_e}")
                 return []
        else:
            logging.error(f"    Error HTTP buscando videos top para {channel_id}: {e}")
            return []
    except Exception as e:
        logging.error(f"    Error inesperado buscando videos top para {channel_id}: {e}")
        return []

def get_video_likes(youtube_service, video_ids_list):
    """Obtiene el conteo de likes para una lista de IDs de video."""
    total_likes = 0
    if not video_ids_list:
        logging.info("      No hay IDs de video para buscar likes.")
        return 0

    logging.info(f"    Obteniendo likes para {len(video_ids_list)} videos top...")
    batch_size = 50                  
    likes_found_count = 0
    processed_ids = 0
    for i in range(0, len(video_ids_list), batch_size):
        batch_ids = video_ids_list[i:i + batch_size]
        processed_ids += len(batch_ids)
        try:
            request = youtube_service.videos().list(
                part="statistics",
                id=",".join(batch_ids)
            )
            response = request.execute()
            for item in response.get('items', []):
                likes = item.get('statistics', {}).get('likeCount')
                if likes is not None:
                    try:
                        total_likes += int(likes)
                        likes_found_count += 1
                    except (ValueError, TypeError):
                         logging.warning(f"      Valor de like no numérico encontrado para video {item.get('id')}: {likes}")

        except googleapiclient.errors.HttpError as e:
            if 'quotaExceeded' in str(e): raise e           
            logging.error(f"    Error HTTP obteniendo likes para lote ({len(batch_ids)} IDs): {e}")
                                                                                                    
            break                                        
        except Exception as e:
            logging.error(f"    Error inesperado obteniendo likes para lote: {e}")
            break                           

    logging.info(f"      Likes sumados de {likes_found_count}/{len(video_ids_list)} videos.")
    return total_likes

In [16]:
                                                                  
current_run_data = []                                                      
processed_count_this_run = 0
quota_exceeded = False

if youtube and artists_to_process:
    logging.info(f"Iniciando procesamiento de {len(artists_to_process)} artistas pendientes...")
    processing_start_time = time.time()

    for artist in artists_to_process:
        processed_count_this_run += 1
        logging.info(f"Procesando artista {processed_count_this_run}/{len(artists_to_process)}: {artist}")
        artist_info = {'artist_query': artist}                                     

        try:
                                
            channel_id = search_for_channel(youtube, artist)
            artist_info['channel_id_found'] = channel_id
            time.sleep(0.4)                                  

            if channel_id:
                                                   
                stats = get_channel_stats(youtube, channel_id)
                time.sleep(0.4)                                         

                if stats:
                    artist_info.update(stats)

                                                           
                    top_video_ids = search_top_videos(youtube, channel_id)
                    time.sleep(0.4)                                  

                                                             
                    total_top10_likes = get_video_likes(youtube, top_video_ids)
                    artist_info['total_top10_video_likes'] = total_top10_likes
                    time.sleep(0.4)                                       
                else:
                                                                               
                    artist_info.update({
                        'channel_title_verified': None, 'subscriber_count': None,
                        'view_count': None, 'video_count': None,
                        'total_top10_video_likes': None
                    })
            else:
                                                         
                artist_info.update({
                    'channel_title_verified': None, 'subscriber_count': None,
                    'view_count': None, 'video_count': None,
                    'total_top10_video_likes': None
                })

            current_run_data.append(artist_info)

        except googleapiclient.errors.HttpError as e:
            if 'quotaExceeded' in str(e):
                logging.error(f"¡CUOTA EXCEDIDA procesando a {artist}! Deteniendo esta ejecución.")
                quota_exceeded = True
                                                                                     
                if 'channel_id_found' not in artist_info: artist_info['channel_id_found'] = None
                if 'channel_title_verified' not in artist_info: artist_info['channel_title_verified'] = None
                if 'subscriber_count' not in artist_info: artist_info['subscriber_count'] = None
                if 'view_count' not in artist_info: artist_info['view_count'] = None
                if 'video_count' not in artist_info: artist_info['video_count'] = None
                if 'total_top10_video_likes' not in artist_info: artist_info['total_top10_video_likes'] = None
                current_run_data.append(artist_info)                                     
                break                      
            else:
                logging.error(f"Error HTTP inesperado procesando {artist}: {e}")
                                               
                if 'channel_id_found' not in artist_info: artist_info['channel_id_found'] = f"Error: {e}"
                current_run_data.append(artist_info)

        except Exception as e:
             logging.error(f"Error general inesperado procesando {artist}: {e}")
                                            
             if 'channel_id_found' not in artist_info: artist_info['channel_id_found'] = f"Error: {e}"
             current_run_data.append(artist_info)

                                                             
        time.sleep(1)

    processing_end_time = time.time()
    logging.info(f"Procesamiento de {processed_count_this_run} artistas finalizado en {processing_end_time - processing_start_time:.4f} segundos.")

elif not youtube:
    logging.error("El objeto de servicio de YouTube no está disponible (falló la autenticación).")
elif not artists_to_process:
    logging.info("No hay artistas pendientes para procesar.")
else:
     logging.warning("No se procesaron artistas por una razón desconocida.")

2025-04-11 12:02:58,400 - INFO - Iniciando procesamiento de 32592 artistas pendientes...
2025-04-11 12:02:58,402 - INFO - Procesando artista 1/32592: Dabzee;Simhakutty;Abby


2025-04-11 12:02:58,402 - INFO -   Buscando canal para: 'Dabzee;Simhakutty;Abby' (Query: 'Dabzee;Simhakutty;Abby Topic')...
2025-04-11 12:02:59,237 - ERROR - ¡CUOTA EXCEDIDA procesando a Dabzee;Simhakutty;Abby! Deteniendo esta ejecución.
2025-04-11 12:02:59,238 - INFO - Procesamiento de 1 artistas finalizado en 0.8358 segundos.


In [17]:
                                                
df_combined_results = pd.DataFrame()

if current_run_data:                                                      
    df_current_run = pd.DataFrame(current_run_data)
    logging.info(f"Se creará/actualizará el CSV con {len(df_current_run)} nuevos registros procesados.")
                                                                    
    df_combined_results = pd.concat([df_processed_so_far, df_current_run], ignore_index=True)
                                                                        
    df_combined_results.drop_duplicates(subset=['artist_query'], keep='last', inplace=True)

elif not df_processed_so_far.empty:
     logging.info("No se procesaron nuevos artistas en esta ejecución. Se guardará el estado anterior.")
     df_combined_results = df_processed_so_far                                
else:
     logging.info("No se procesaron artistas en esta ejecución y no había datos previos.")


                                                             
if not df_combined_results.empty:
    try:
        logging.info(f"Guardando {len(df_combined_results)} registros combinados en {PROGRESS_CSV_PATH}...")
        df_combined_results.to_csv(PROGRESS_CSV_PATH, index=False)
        logging.info(f"Resultados combinados guardados exitosamente.")
        print("\n--- Primeras filas del archivo de resultados guardado ---")
        display(df_combined_results.head())
        print("\n--- Últimas filas del archivo de resultados guardado ---")
        display(df_combined_results.tail())
    except Exception as e:
        logging.error(f"Error al guardar los resultados combinados en CSV: {e}")
elif quota_exceeded:
    logging.warning("La cuota fue excedida y no se generaron nuevos datos para guardar.")
else:
    logging.info("No hay resultados para guardar (ni previos ni de esta ejecución).")


logging.info("--- Script de Enriquecimiento con YouTube API Finalizado ---")

2025-04-11 12:02:59,246 - INFO - Se creará/actualizará el CSV con 1 nuevos registros procesados.
2025-04-11 12:02:59,248 - INFO - Guardando 99 registros combinados en /home/nicolas/Escritorio/workshops/workshop_2/data/youtube_stats.csv...
2025-04-11 12:02:59,257 - INFO - Resultados combinados guardados exitosamente.



--- Primeras filas del archivo de resultados guardado ---


Unnamed: 0,artist_query,channel_id_found,channel_title_verified,subscriber_count,view_count,video_count,total_top10_video_likes
0,Nalan,UC_zzCBiTkpQwP8lwHgQ7M3Q,Nalan - Topic,19400.0,53723640.0,187.0,163164.0
1,Grupo Sensação,UCKiMawhTZ5z1S8i1_fezSAw,Grupo Sensação,55800.0,8161696.0,24.0,0.0
2,Gorillaz;Beck,UCNIV5B_aJnLrKDSnW_MOmcQ,Gorillaz - Topic,43100.0,1533592000.0,560.0,9354631.0
3,Parcels,UC2as7PrmUgmdZAkMIWNY6EQ,Parcels,306000.0,180354800.0,236.0,745054.0
4,Klingande,UCOX8OMkI7ULP7K8bfB_HTHA,Klingande,83500.0,64798540.0,44.0,163822.0



--- Últimas filas del archivo de resultados guardado ---


Unnamed: 0,artist_query,channel_id_found,channel_title_verified,subscriber_count,view_count,video_count,total_top10_video_likes
94,Too Much Joy,UCaS6hAeJPwHqSpp9Q3stclg,Too Much Joy - Topic,434.0,210069.0,226.0,992.0
95,張偉文;胡美儀,UCjp4hbsLF7OxCJrgWmE9CWQ,粤调粿粿,19.0,8777.0,39.0,0.0
96,Daniel Adams-Ray;Erik Lundin,,,,,,
97,Foster;Chelsea Collins,,,,,,
98,Dabzee;Simhakutty;Abby,,,,,,


2025-04-11 12:02:59,274 - INFO - --- Script de Enriquecimiento con YouTube API Finalizado ---


In [18]:
                    
                                                              
if os.path.exists(PROGRESS_CSV_PATH) and engine is not None:
    logging.info(f"\n--- Cargando datos finales desde {PROGRESS_CSV_PATH} a la tabla '{TABLE_YOUTUBE}' ---")
    try:
                                                              
        df_to_upload = pd.read_csv(PROGRESS_CSV_PATH)
        logging.info(f"Leído {len(df_to_upload)} filas desde {PROGRESS_CSV_PATH} para cargar a DB.")

                                                                    
        sql_types = {
            'artist_query': Text(),                           
            'channel_id_found': Text(),                                              
            'channel_title_verified': Text(),                                        
            'subscriber_count': BigInteger(),                                                      
            'view_count': BigInteger(),                                         
            'video_count': Integer(),                         
            'total_top10_video_likes': BigInteger()                               
        }
                                                                 

        logging.info(f"Cargando DataFrame a la tabla '{TABLE_YOUTUBE}' en chunks de {CHUNK_SIZE_UPLOAD}...")
        start_upload_time = time.time()

        df_to_upload.to_sql(
            TABLE_YOUTUBE,
            con=engine,
            if_exists='replace',                                                       
            index=False,
            method='multi',
            dtype=sql_types,
            chunksize=CHUNK_SIZE_UPLOAD                  
        )

        end_upload_time = time.time()
        logging.info(f"DataFrame cargado a '{TABLE_YOUTUBE}' en {end_upload_time - start_upload_time:.2f} segundos.")

                                                
        logging.info(f"Verificando número de filas en la tabla '{TABLE_YOUTUBE}'...")
        with engine.connect() as connection:
            query_count = text(f'SELECT COUNT(*) FROM "{TABLE_YOUTUBE}"')
            result = connection.execute(query_count)
            num_db_final_rows = result.scalar_one()

        logging.info(f"Número de filas en la tabla final '{TABLE_YOUTUBE}': {num_db_final_rows}")
        logging.info(f"Número de filas en el DataFrame cargado: {len(df_to_upload)}")

        if len(df_to_upload) == num_db_final_rows:
            logging.info("¡Verificación de carga final exitosa!")
        else:
            logging.warning("Discrepancia en el número de filas entre el CSV final y la tabla cargada.")

    except Exception as e:
        logging.error(f"Error al cargar el CSV final '{PROGRESS_CSV_PATH}' en la base de datos: {e}")

elif not os.path.exists(PROGRESS_CSV_PATH):
     logging.error(f"No se encontró el archivo {PROGRESS_CSV_PATH} para cargar a la base de datos.")
elif engine is None:
     logging.error("No se puede cargar a la base de datos porque la conexión (engine) no está definida.")

logging.info("--- Script Completo Finalizado ---")

2025-04-11 12:02:59,284 - INFO - 
--- Cargando datos finales desde /home/nicolas/Escritorio/workshops/workshop_2/data/youtube_stats.csv a la tabla 'youtube_stats' ---
2025-04-11 12:02:59,286 - INFO - Leído 99 filas desde /home/nicolas/Escritorio/workshops/workshop_2/data/youtube_stats.csv para cargar a DB.
2025-04-11 12:02:59,286 - INFO - Cargando DataFrame a la tabla 'youtube_stats' en chunks de 40000...
2025-04-11 12:02:59,305 - INFO - DataFrame cargado a 'youtube_stats' en 0.02 segundos.
2025-04-11 12:02:59,306 - INFO - Verificando número de filas en la tabla 'youtube_stats'...
2025-04-11 12:02:59,307 - INFO - Número de filas en la tabla final 'youtube_stats': 99
2025-04-11 12:02:59,307 - INFO - Número de filas en el DataFrame cargado: 99
2025-04-11 12:02:59,308 - INFO - ¡Verificación de carga final exitosa!
2025-04-11 12:02:59,308 - INFO - --- Script Completo Finalizado ---
