In [2]:
import pandas as pd
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
import json
from datetime import datetime
import os
import sys

# Assuming your src folder is one level up from where the notebook is running
# Adjust the path as needed
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
# Now try importing
from src.summarizer import summarize_with_openai

class YoutubeChannelSubtitles:
    def __init__(self, api_key):
        self.youtube = build('youtube', 'v3', developerKey=api_key)
        
    def get_channel_id(self, channel_name):
        """Gets the channel ID from the channel name"""
        request = self.youtube.search().list(
            q=channel_name,
            type='channel',
            part='id',
            maxResults=1
        )
        response = request.execute()
        
        if response['items']:
            return response['items'][0]['id']['channelId']
        return None

    def get_recent_videos(self, channel_id, max_results=50, days_back=7):
        """Gets videos from the channel published in the last X days"""
        # Request more videos than needed to ensure we have enough after filtering
        request = self.youtube.search().list(
            channelId=channel_id,
            order='date',  # Sort by date
            part='snippet',
            maxResults=max_results,  # Request more to filter afterward
            type='video'
        )
        response = request.execute()
        
        videos = []
        # Calculate cutoff date (7 days ago from now)
        cutoff_date = datetime.now() - timedelta(days=days_back)
        
        for item in response['items']:
            # Convert published date string to datetime object
            published_at = datetime.strptime(
                item['snippet']['publishedAt'], 
                '%Y-%m-%dT%H:%M:%SZ'
            )
            
            # Only include videos published after the cutoff date
            if published_at >= cutoff_date:
                video = {
                    'title': item['snippet']['title'],
                    'video_id': item['id']['videoId'],
                    'published_at': item['snippet']['publishedAt']
                }
                videos.append(video)
        
        return videos

    def download_subtitles(self, video_id, languages=['es', 'en'], output_dir='subtitles'):
        """Downloads subtitles for a video in the specified languages"""
        try:
            # Create directory if it doesn't exist
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)

            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            results = {}
            
            for language in languages:
                try:
                    transcript = transcript_list.find_transcript([language])
                    subtitles = transcript.fetch()
                    
                    # Save in JSON format
                    filename = f'{output_dir}/{video_id}_{language}.json'
                    with open(filename, 'w', encoding='utf-8') as f:
                        json.dump(subtitles, f, ensure_ascii=False, indent=2)
                    
                    results[language] = 'Success'
                    
                except Exception as e:
                    results[language] = f'Failed: {str(e)}'
            
            return results
            
        except Exception as e:
            return f'Failed to get subtitles: {str(e)}'

    def process_channel(self, channel_name, max_videos=50, languages=['es', 'en'], days_back=7):
        """Processes videos from a channel within the specified time frame"""
        # Get channel ID
        channel_id = self.get_channel_id(channel_name)
        if not channel_id:
            return f"Channel not found: {channel_name}"

        # Get recent videos from the last X days
        videos = self.get_recent_videos(channel_id, max_videos, days_back)
        
        if not videos:
            return f"No videos found in the last {days_back} days for channel: {channel_name}"
        
        results = []
        for video in videos:
            result = {
                'title': video['title'],
                'video_id': video['video_id'],
                'published_at': video['published_at'],
                'subtitles': self.download_subtitles(video['video_id'], languages)
            }
            results.append(result)
            
        # Save results to a log file
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        with open(f'results_{timestamp}.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
            
        return results

    def process_multiple_channels_to_csv(self, channel_names, max_videos=5, days_back=7):
        """
        Process videos from multiple channels and combine them into a single CSV
        
        Args:
            channel_names (list): List of channel names to process
            max_videos (int): Maximum videos to fetch per channel
            days_back (int): Only include videos from the last X days
            
        Returns:
            str: Filename of the created CSV
        """
        # List to store data from all channels
        all_data = []
        
        # Process each channel
        for channel_name in channel_names:
            print(f"Processing channel: {channel_name}")
            
            # Get channel ID
            channel_id = self.get_channel_id(channel_name)
            if not channel_id:
                print(f"Channel not found: {channel_name}")
                continue

            # Get videos from the last X days
            videos = self.get_recent_videos(channel_id, max_videos, days_back)
            
            if not videos:
                print(f"No videos found in the last {days_back} days for channel: {channel_name}")
                continue
            
            # Process each video
            for video in videos:
                video_id = video['video_id']
                title = video['title']
                date = datetime.strptime(video['published_at'], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d')
                video_url = f"https://www.youtube.com/watch?v={video_id}"
                
                try:
                    # Get subtitles
                    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
                    
                    # Try manual subtitles first
                    try:
                        transcript = transcript_list.find_manually_created_transcript()
                    except:
                        # If no manual subtitles, try with any available language
                        transcript = transcript_list.find_transcript(['en', 'es'])
                    
                    subtitles = transcript.fetch()
                    language = transcript.language
                    
                    # Get text from each subtitle entry
                    subtitle_texts = [entry['text'] for entry in subtitles]
                    
                    # Clean special characters from each text segment
                    cleaned_texts = []
                    for text in subtitle_texts:
                        # Replace newlines with spaces
                        cleaned_text = text.replace('\n', ' ')
                        # Replace escaped backslashes
                        cleaned_text = cleaned_text.replace('\\', '')
                        # Replace multiple spaces with a single space
                        import re
                        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
                        cleaned_texts.append(cleaned_text)
                    
                    # Join all cleaned text segments with spaces
                    full_text = ' '.join(cleaned_texts).strip()
                    
                    summary = "not summary yet" #summarize_with_openai(full_text)

                    # Add to data list with channel name
                    all_data.append({
                        'Title': title,
                        'Date': date,
                        'Link': video_url,
                        'Summary': summary,
                        'Source': channel_name,  # Add channel name
                    })
                    
                except Exception as e:
                    print(f"Error processing video {video_id}: {str(e)}")
        
        # If no data was collected
        if not all_data:
            return "No videos found for any of the specified channels"
            
        # Create DataFrame and save as CSV
        df = pd.DataFrame(all_data)
        df = df[[ 'Title', 'Date', 'Link', 'Summary', 'Source']]
        
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        csv_filename = f'multi_channel_subtitles_{timestamp}.csv'
        df.to_csv(csv_filename, index=False, encoding='utf-8')
        
        return df



In [2]:
# Ejemplo de uso
if __name__ == "__main__":
    API_KEY = 'AIzaSyDKckSI_0asveDz4lHuU_KMrmmU9zy0-18'
    yt = YoutubeChannelSubtitles(API_KEY)
    
    csv_file = yt.process_multiple_channels_to_csv(
        channel_names=["@la_inteligencia_artificial", "@dotcsv", "@gustavo-entrala"],
        max_videos=5,
        days_back=7
    )
    
    print(f"CSV generado: {csv_file}")



Processing channel: @la_inteligencia_artificial
Processing channel: @dotcsv
No videos found in the last 7 days for channel: @dotcsv
Processing channel: @gustavo-entrala
CSV generado:                                                Title        Date  \
0         ¡BRUTAL lo nuevo de OpenAI y es GRATIS! 🔥🤯  2025-03-26   
1  🤖 Consiguen reconectar el cerebro de una perso...  2025-03-26   
2  🤖 Lo primero y más importante es validar la id...  2025-03-25   
3  🤖 Existe una burbuja de valorizaciones de empr...  2025-03-25   
4  🤖 Ninguna sociedad está preparada para el camb...  2025-03-25   
5  El Chip Cuántico MAJORANA: el mayor INVENTO de...  2025-03-21   

                                          Link          Summary  \
0  https://www.youtube.com/watch?v=aOqgMPjJBf8  not summary yet   
1  https://www.youtube.com/watch?v=w9EUvKfsHPI  not summary yet   
2  https://www.youtube.com/watch?v=8WBS9-caOUA  not summary yet   
3  https://www.youtube.com/watch?v=CES1h_yZ83A  not summary yet   
4  ht

In [3]:
csv_file

Unnamed: 0,Title,Date,Link,Summary,Source
0,¡BRUTAL lo nuevo de OpenAI y es GRATIS! 🔥🤯,2025-03-26,https://www.youtube.com/watch?v=aOqgMPjJBf8,not summary yet,@la_inteligencia_artificial
1,🤖 Consiguen reconectar el cerebro de una perso...,2025-03-26,https://www.youtube.com/watch?v=w9EUvKfsHPI,not summary yet,@la_inteligencia_artificial
2,🤖 Lo primero y más importante es validar la id...,2025-03-25,https://www.youtube.com/watch?v=8WBS9-caOUA,not summary yet,@la_inteligencia_artificial
3,🤖 Existe una burbuja de valorizaciones de empr...,2025-03-25,https://www.youtube.com/watch?v=CES1h_yZ83A,not summary yet,@la_inteligencia_artificial
4,🤖 Ninguna sociedad está preparada para el camb...,2025-03-25,https://www.youtube.com/watch?v=Ad76pb-bb7I,not summary yet,@la_inteligencia_artificial
5,El Chip Cuántico MAJORANA: el mayor INVENTO de...,2025-03-21,https://www.youtube.com/watch?v=iCmgjF1p9CU,not summary yet,@gustavo-entrala


In [26]:
"""
Web scraping functionality for AI News Scraper
Contains functions to scrape articles from various news sources
"""

import logging
from datetime import datetime, timedelta
from src.summarizer import summarize_with_openai
from config.config import AI_NEWS_URL, MIT_NEWS_URL, STANFORD_NEWS_URL, YOUTUBE_API_KEY, YOUTUBE_CHANNELS
from src.email_sender import send_combined_email_report
from src.scraper import scrape_articles_AI_news, get_article_content, scrape_mit_articles, get_mit_article_content, scrape_stanford_articles, get_stanford_article_content
from src.youtube_scraper import process_youtube_channels
import csv

# Get logger
logger = logging.getLogger('ai_news_scraper.processor')

def save_to_csv(articles, filename) -> None:
    """
    Save articles to CSV file.
    
    Args:
        articles (List[Dict]): List of articles to save
        filename (str): Path to save the CSV file
    """
    try:
        keys = ['Title', 'Date', 'Link','Summary', 'Source']
        with open(filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=keys, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(articles)
        logger.info(f"Articles saved to CSV: {filename}")
    except Exception as e:
        logging.error(f"Error saving to CSV: {e}")
        raise

def save_to_html(articles, date_str, filename):
    """
    Save articles to a styled HTML file.
    
    Args:
        articles (list): List of processed articles
        date_str (str): Date range string for the title
        filename (str): Path to save the HTML file
    """
    try:
        import pandas as pd
        
        if not articles:
            logging.warning("No articles to save to HTML")
            return

        # Convert to DataFrame
        df = pd.DataFrame(articles)
        
        # Define function to make links clickable
        def make_clickable(val):
            return f'<a href="{val}" target="_blank">{val}</a>'

        # Define CSS styles
        styles = [
            # Header style
            dict(selector="th", props=[
                ("background-color", "#914048"),
                ("color", "white"),
                ("font-weight", "bold"),
                ("text-align", "center"),
                ("padding", "10px"),
                ("border", "1px solid #ddd")
            ]),
            # Cell style
            dict(selector="td", props=[
                ("border", "1px solid #ddd"),
                ("padding", "8px"),
                ("text-align", "left")
            ]),
            # Table style
            dict(selector="", props=[
                ("border-collapse", "collapse"),
                ("width", "100%"),
                ("margin", "20px 0"),
                ("font-family", "Arial, sans-serif")
            ]),
            # Source group style
            dict(selector=".source-header", props=[
                ("background-color", "#f5f5f5"),
                ("padding", "10px"),
                ("margin", "20px 0 10px 0"),
                ("font-size", "1.2em"),
                ("font-weight", "bold")
            ])
        ]

        # Create HTML template
        html_template = f"""
        <html>
        <head>
            <title>AI News Summary - {date_str}</title>
            <style>
                body {{
                    font-family: Arial, sans-serif;
                    margin: 20px;
                    background-color: #f8f9fa;
                }}
                .container {{
                    max-width: 1200px;
                    margin: 0 auto;
                    background-color: white;
                    padding: 20px;
                    box-shadow: 0 0 10px rgba(0,0,0,0.1);
                }}
                h1 {{
                    color: #333;
                    text-align: center;
                    padding-bottom: 20px;
                    border-bottom: 2px solid #914048;
                }}
                .source-section {{
                    margin-top: 30px;
                }}
                .source-header {{
                    background-color: #f5f5f5;
                    padding: 10px;
                    margin: 20px 0 10px 0;
                    font-size: 1.2em;
                    font-weight: bold;
                    border-left: 4px solid #914048;
                }}
            </style>
        </head>
        <body>
            <div class="container">
                <h1>AI News Summary - {date_str}</h1>
        """

        # Group articles by source
        sources = sorted(set(article['Source'] for article in articles))
        
        for source in sources:
            source_articles = [a for a in articles if a['Source'] == source]
            if source_articles:
                # Create DataFrame for this source
                df_source = pd.DataFrame(source_articles)
                
                # Select and reorder columns
                columns_to_display = ['Title', 'Date', 'Link', 'Summary']
                df_display = df_source[columns_to_display].copy()
                
                # Format the date column
                df_display['Date'] = df_display['Date'].apply(lambda x: x.strftime('%Y-%m-%d') if pd.notnull(x) else '')
                
                # Style the DataFrame
                df_styled = df_display.style\
                    .format({'Link': make_clickable})\
                    .set_table_styles(styles)
                
                # Add source section to HTML
                html_template += f"""
                    <div class="source-section">
                        <div class="source-header">{source}</div>
                        {df_styled.to_html(escape=False)}
                    </div>
                """

        html_template += """
            </div>
        </body>
        </html>
        """

        # Save to file
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html_template)
            
        logger.info(f"Articles saved to HTML: {filename}")
        
    except Exception as e:
        logger.error(f"Error saving to HTML: {e}")
        raise

def process_all_news(recipients, target_date=None):
    """
    Process news from all sources and send combined email.
    
    Args:
        recipients (str or list): Email recipient(s)
        target_date (str, optional): Target date in YYYY-MM-DD format
        
    Returns:
        None
    """
    try:
        # Set date range
        end_date = (datetime.strptime(target_date, '%Y-%m-%d').date() 
                   if target_date else datetime.now().date())
        start_date = end_date - timedelta(days=7)
        date_str = f"{start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"

        logger.info(f"Processing news for date range: {date_str}")

        # Get and process articles
        all_articles = []
        source_counts = {}  # Para llevar la cuenta de artículos por fuente

        source_configs = [
            ('AI News', scrape_articles_AI_news, get_article_content),
            ('MIT News', scrape_mit_articles, get_mit_article_content),
            ('Stanford News', scrape_stanford_articles, get_stanford_article_content)
        ]

        # Collect articles from all sources
        for source_name, scraper_func, content_func in source_configs:
            try:
                logger.info(f"Processing source: {source_name}")
                url = globals()[f"{source_name.upper().replace(' ', '_')}_URL"]
                articles = scraper_func(url)
                
                source_articles = []  # Artículos para esta fuente

                # Process each article
                for article in articles:
                    article_date = article.get('Date')
                    
                    # Check if article is within date range
                    if article_date and start_date <= article_date <= end_date:
                        # Get and summarize content
                        content = content_func(article['Link'])
                        if content:
                            article['Summary'] = "not summary yet" #summarize_with_openai(content)
                        article['Source'] = source_name
                        all_articles.append(article)
                        source_articles.append(article)

                source_counts[source_name] = len(source_articles)
                logger.info(f"Found {len(source_articles)} articles from {source_name}")

            except Exception as e:
                logger.error(f"Error processing {source_name}: {e}")
                source_counts[source_name] = 0
                continue

        # Process YouTube channels
        try:
            if YOUTUBE_API_KEY and YOUTUBE_CHANNELS:
                youtube_articles = process_youtube_channels(
                    YOUTUBE_API_KEY, 
                    YOUTUBE_CHANNELS,
                    max_videos=10, 
                    days_back=(end_date - start_date).days
                )
                
                # Filter by date range (should already be filtered, but double-check)
                youtube_articles = [
                    article for article in youtube_articles 
                    if article.get('Date') and start_date <= article.get('Date') <= end_date
                ]
                
                all_articles.append(youtube_articles)
                source_counts['YouTube'] = len(youtube_articles)
                logger.info(f"Added {len(youtube_articles)} YouTube videos to articles")
            else:
                logger.info("YouTube processing skipped: API key or channels not configured")
                source_counts['YouTube'] = 0
        except Exception as e:
            logger.error(f"Error processing YouTube channels: {e}")
            source_counts['YouTube'] = 0
        
        print(youtube_articles)
        # Log total counts
        logger.info(f"Total articles collected: {len(all_articles)}")
        for source, count in source_counts.items():
            logger.info(f"  - {source}: {count} articles")

        # if all_articles:
        #     # Create date string for filenames
        #     end_date_str = end_date.strftime('%Y-%m-%d')
            
        #     # Save to CSV
        #     csv_path = f"data/articles_week_{end_date_str}.csv"
        #     save_to_csv(all_articles, csv_path)
            
        #     # Save to HTML
        #     html_path = f"results/articles_week_{end_date_str}.html"
        #     save_to_html(all_articles, date_str, html_path)
            
        #     # Send email
        #     send_combined_email_report(all_articles, date_str, recipients)
        #     logger.info("Articles processed, saved, and email sent successfully!")
        # else:
        #     logger.info(f"No articles found for date range: {date_str}")

    except Exception as e:
        logger.error(f"Error in process_all_news: {e}")
        raise

In [27]:
EMAIL = os.getenv("EMAIL")
PASSWORD = os.getenv("PASSWORD")
IMAP_SERVER = os.getenv("IMAP_SERVER")
RECIPIENT_EMAIL = json.loads(os.getenv("RECIPIENT_EMAILS", "[]"))
if not RECIPIENT_EMAIL:
    raise ValueError("No recipient emails configured")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
target_date = '2025-03-16'
    
# Process news
process_all_news(RECIPIENT_EMAIL, target_date)


Error getting channel ID for '@la_inteligencia_artificial': 'str' object has no attribute 'search'
Channel not found: @la_inteligencia_artificial
Error getting channel ID for '@dotcsv': 'str' object has no attribute 'search'
Channel not found: @dotcsv
Error getting channel ID for '@gustavo-entrala': 'str' object has no attribute 'search'
Channel not found: @gustavo-entrala
No videos found for any of the specified channels
Error processing YouTube channels: 'str' object has no attribute 'get'


No videos found for any of the specified channels


In [4]:
import logging
import pandas as pd
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
import json
from datetime import datetime, timedelta
import os
import traceback

# Get logger
logger = logging.getLogger('ai_news_scraper.youtube')

# Resto de tus funciones de YouTube tal como las definiste...
def build_youtube_client(api_key):
    """
    Construye y devuelve un cliente de la API de YouTube.
    
    Args:
        api_key (str): Clave de API de YouTube
        
    Returns:
        object: Cliente de la API de YouTube
    """
    try:
        if not api_key or not isinstance(api_key, str):
            logger.error(f"Invalid YouTube API key: {api_key}")
            return None

        youtube = build('youtube', 'v3', developerKey=api_key)
        logger.info("YouTube API client created successfully")
        return youtube
    except Exception as e:
        logger.error(f"Failed to create YouTube API client: {str(e)}")
        raise


def get_channel_id(youtube, channel_name):
    """
    Obtiene el ID del canal a partir del nombre del canal.
    
    Args:
        youtube (object): Cliente de la API de YouTube
        channel_name (str): Nombre del canal
        
    Returns:
        str: ID del canal o None si no se encuentra
    """
    try:
        request = youtube.search().list(
            q=channel_name,
            type='channel',
            part='id',
            maxResults=1
        )
        response = request.execute()
        
        if response['items']:
            channel_id = response['items'][0]['id']['channelId']
            logger.info(f"Found channel ID for '{channel_name}': {channel_id}")
            return channel_id
        
        logger.warning(f"No channel found for name '{channel_name}'")
        return None
    except Exception as e:
        logger.error(f"Error getting channel ID for '{channel_name}': {str(e)}")
        return None


def get_recent_videos(youtube, channel_id, max_results=50, days_back=7):
    """
    Obtiene videos recientes del canal publicados en los últimos X días.
    
    Args:
        youtube (object): Cliente de la API de YouTube
        channel_id (str): ID del canal
        max_results (int): Número máximo de resultados
        days_back (int): Cuando contar los 7 dias
        
    Returns:
        list: Lista de videos con información
    """
    try:
        request = youtube.search().list(
            channelId=channel_id,
            order='date',
            part='snippet',
            maxResults=max_results,
            type='video'
        )
        response = request.execute()
        
        videos = []
        cutoff_date = datetime.now() - timedelta(days=7)
        
        for item in response['items']:
            published_at = datetime.strptime(
                item['snippet']['publishedAt'], 
                '%Y-%m-%dT%H:%M:%SZ'
            )
            
            if published_at >= cutoff_date:
                video = {
                    'title': item['snippet']['title'],
                    'video_id': item['id']['videoId'],
                    'published_at': item['snippet']['publishedAt']
                }
                videos.append(video)
        
        logger.info(f"Found {len(videos)} videos from the last {days_back} days for channel {channel_id}")
        return videos
    except Exception as e:
        logger.error(f"Error getting videos for channel {channel_id}: {str(e)}")
        return []


def download_subtitles(video_id, languages=['es', 'en'], output_dir='subtitles'):
    """
    Descarga subtítulos para un video en los idiomas especificados.
    
    Args:
        video_id (str): ID del video
        languages (list): Lista de códigos de idioma
        output_dir (str): Directorio de salida
        
    Returns:
        dict: Resultados para cada idioma
    """
    try:
        # Crear directorio si no existe
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            logger.info(f"Created directory: {output_dir}")

        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        results = {}
        
        for language in languages:
            try:
                transcript = transcript_list.find_transcript([language])
                subtitles = transcript.fetch()
                
                # Guardar en formato JSON
                filename = f'{output_dir}/{video_id}_{language}.json'
                with open(filename, 'w', encoding='utf-8') as f:
                    json.dump(subtitles, f, ensure_ascii=False, indent=2)
                
                results[language] = 'Success'
                logger.info(f"Downloaded {language} subtitles for video {video_id}")
                
            except Exception as e:
                results[language] = f'Failed: {str(e)}'
                logger.warning(f"Failed to download {language} subtitles for video {video_id}: {str(e)}")
        
        return results
        
    except Exception as e:
        error_msg = f"Failed to get any subtitles for video {video_id}: {str(e)}"
        logger.error(error_msg)
        return {'error': error_msg}


def process_channel(youtube, channel_name, max_videos=50, languages=['es', 'en'], days_back=7):
    """
    Procesa videos de un canal dentro del período de tiempo especificado.
    
    Args:
        youtube (object): Cliente de la API de YouTube
        channel_name (str): Nombre del canal
        max_videos (int): Número máximo de videos
        languages (list): Lista de idiomas para los subtítulos
        days_back (int): Días hacia atrás para filtrar
        
    Returns:
        list: Resultados del procesamiento
    """
    try:
        logger.info(f"Processing channel: {channel_name}")
        
        # Obtener ID del canal
        channel_id = get_channel_id(youtube, channel_name)
        if not channel_id:
            error_msg = f"Channel not found: {channel_name}"
            logger.warning(error_msg)
            return error_msg

        # Obtener videos recientes
        videos = get_recent_videos(youtube, channel_id, max_videos, days_back)
        
        if not videos:
            msg = f"No videos found in the last {days_back} days for channel: {channel_name}"
            logger.info(msg)
            return msg
        
        results = []
        for video in videos:
            try:
                result = {
                    'title': video['title'],
                    'video_id': video['video_id'],
                    'published_at': video['published_at'],
                    'subtitles': download_subtitles(video['video_id'], languages)
                }
                results.append(result)
            except Exception as e:
                logger.error(f"Error processing video {video['video_id']}: {str(e)}")
        
        # Guardar resultados en un archivo JSON
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        results_file = f'results_{timestamp}.json'
        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        
        logger.info(f"Saved processing results to {results_file}")
        return results
        
    except Exception as e:
        error_msg = f"Error processing channel {channel_name}: {str(e)}"
        logger.error(error_msg)
        logger.error(traceback.format_exc())
        return error_msg


def clean_subtitle_text(text):
    """
    Limpia el texto de los subtítulos.
    
    Args:
        text (str): Texto a limpiar
        
    Returns:
        str: Texto limpio
    """
    try:
        import re
        
        # Reemplazar saltos de línea con espacios
        cleaned_text = text.replace('\n', ' ')
        # Reemplazar barras invertidas escapadas
        cleaned_text = cleaned_text.replace('\\', '')
        # Reemplazar múltiples espacios con uno solo
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
        
        return cleaned_text.strip()
    except Exception as e:
        logger.error(f"Error cleaning subtitle text: {str(e)}")
        return text


def get_video_transcript(video_id):
    """
    Obtiene la transcripción completa de un video.
    
    Args:
        video_id (str): ID del video
        
    Returns:
        tuple: (texto completo, idioma) o (None, None) si falla
    """
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        
        # Intentar con subtítulos manuales primero
        try:
            transcript = transcript_list.find_manually_created_transcript()
        except:
            # Si no hay subtítulos manuales, probar con cualquier idioma disponible
            transcript = transcript_list.find_transcript(['en', 'es'])
        
        subtitles = transcript.fetch()
        language = transcript.language
        
        # Obtener texto de cada entrada de subtítulos
        subtitle_texts = [entry['text'] for entry in subtitles]
        
        # Limpiar cada segmento de texto
        cleaned_texts = [clean_subtitle_text(text) for text in subtitle_texts]
        
        # Unir todos los segmentos de texto limpios con espacios
        full_text = ' '.join(cleaned_texts).strip()
        
        logger.info(f"Successfully retrieved transcript for video {video_id} in {language}")
        return full_text, language
    
    except Exception as e:
        logger.error(f"Failed to get transcript for video {video_id}: {str(e)}")
        return None, None


def process_youtube_channels(api_key, channel_names, max_videos=5, days_back=7):
    """
    Procesa videos de múltiples canales y los combina en un solo CSV.
    
    Args:
        youtube (object): Cliente de la API de YouTube
        channel_names (list): Lista de nombres de canales
        max_videos (int): Máximo de videos por canal
        days_back (int): Solo incluir videos de los últimos X días
        
    Returns:
        DataFrame o str: DataFrame con los datos o mensaje de error
    """
    try:
        logger.info(f"Processing {len(channel_names)} channels: {', '.join(channel_names)}")
        
        youtube = build_youtube_client(api_key)

        # Lista para almacenar datos de todos los canales
        all_data = []
        
        # Procesar cada canal
        for channel_name in channel_names:
            try:
                logger.info(f"Processing channel: {channel_name}")
                
                # Obtener ID del canal
                channel_id = get_channel_id(youtube, channel_name)
                if not channel_id:
                    logger.warning(f"Channel not found: {channel_name}")
                    continue

                # Obtener videos de los últimos X días
                videos = get_recent_videos(youtube, channel_id, max_videos, days_back)
                
                if not videos:
                    logger.info(f"No videos found in the last 7 days for channel: {channel_name}")
                    continue
                
                # Procesar cada video
                for video in videos:
                    try:
                        video_id = video['video_id']
                        title = video['title']
                        date = datetime.strptime(video['published_at'], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d')
                        video_url = f"https://www.youtube.com/watch?v={video_id}"
                        
                        # Obtener transcripción
                        full_text, language = get_video_transcript(video_id)
                        
                        if full_text:
                            # Generar resumen (descomentado cuando se implemente)
                            try:
                                summary = "not summary yet"  # summarize_with_openai(full_text)
                                
                                # Añadir a la lista de datos con el nombre del canal
                                all_data.append({
                                    'Title': title,
                                    'Date': date,
                                    'Link': video_url,
                                    'Summary': summary,
                                    'Source': channel_name,
                                    'Language': language
                                })
                                
                                logger.info(f"Successfully processed video: {title}")
                            except Exception as e:
                                logger.error(f"Error generating summary for video {video_id}: {str(e)}")
                    except Exception as e:
                        logger.error(f"Error processing video {video.get('video_id', 'unknown')}: {str(e)}")
            
            except Exception as e:
                logger.error(f"Error processing channel {channel_name}: {str(e)}")
        
        # Si no se recopiló ningún dato
        if not all_data:
            msg = "No videos found for any of the specified channels"
            logger.warning(msg)
            return msg
                
        # # Crear DataFrame y guardar como CSV
        # df = pd.DataFrame(all_data)
        # df = df[['Title', 'Date', 'Link', 'Summary', 'Source', 'Language']]
        
        # timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # csv_filename = f'multi_channel_subtitles_{timestamp}.csv'
        # df.to_csv(csv_filename, index=False, encoding='utf-8')
        
        # logger.info(f"Successfully saved data to {csv_filename}. Processed {len(all_data)} videos.")
        return all_data
        
    except Exception as e:
        error_msg = f"Error processing multiple channels: {str(e)}"
        logger.error(error_msg)
        logger.error(traceback.format_exc())
        return error_msg


In [5]:
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
YOUTUBE_CHANNELS = ["@la_inteligencia_artificial", "@dotcsv", "@gustavo-entrala"]
a = process_youtube_channels(YOUTUBE_API_KEY, YOUTUBE_CHANNELS, max_videos=5)

Error getting channel ID for '@la_inteligencia_artificial': <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/search?q=%40la_inteligencia_artificial&type=channel&part=id&maxResults=1&key=AIzaSyDKckSI_0asveDz4lHuU_KMrmmU9zy0-18&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">
Channel not found: @la_inteligencia_artificial
Error getting channel ID for '@dotcsv': <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/search?q=%40dotcsv&type=channel&part=id&maxResults=1&key=AIzaSyDKckSI_0asveDz4lHuU_KMrmmU9zy0-18&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'m