In [3]:
import pandas as pd
import yt_dlp
import uuid
import os
import time

def _iter_video_entries(info):
    """Yield the video entries of a yt-dlp result, descending into nested playlists.

    ``None`` entries (left behind by ``ignoreerrors``) are skipped. An entry
    that carries its own ``entries`` value is treated as a nested playlist and
    expanded instead of being yielded itself.
    """
    for entry in info.get('entries') or []:
        if not entry:
            continue
        if entry.get('entries') is not None:
            yield from _iter_video_entries(entry)
        else:
            yield entry


def _entry_qualifies(entry, duration_limit, follower_count):
    """Return True when the video is short enough and has at least as many views as the channel has followers."""
    # A missing/None duration is treated as 0, like the ``.get('duration', 0)`` default.
    duration = entry.get('duration') or 0
    views = entry.get('view_count')
    if views is None or follower_count is None:
        # The popularity criterion cannot be verified without both numbers.
        return False
    return duration <= duration_limit and views >= follower_count


def _download_video(video_url, download_folder, video_id):
    """Download one video (capped at 720p) and return ``(downloaded, resolution)``."""
    if not video_url:
        print("Error downloading video: entry has no webpage_url")
        return False, None

    ydl_opts_download = {
        'quiet': True,
        'ignoreerrors': True,
        'format': 'bestvideo[height<=720]+bestaudio/best[height<=720]',
        'outtmpl': os.path.join(download_folder, f'{video_id}.%(ext)s'),
    }
    with yt_dlp.YoutubeDL(ydl_opts_download) as ydl_download:
        try:
            info_dict = ydl_download.extract_info(video_url, download=True)
        except Exception as download_error:
            print(f"Error downloading video {video_url}: {download_error}")
            return False, None

    if not info_dict:
        # With 'ignoreerrors' a failed download returns None instead of raising.
        print(f"Error downloading video {video_url}: no result returned")
        return False, None

    print(f"Downloaded video {video_url}")
    return True, info_dict.get('height')


# Function to scrape channel and video metadata and download videos
def scrape_channel_videos(
    channel_urls,
    duration_limit=98,
    download_folder='downloaded_videos',
    request_delay=5,
    cookiefile='/home/mhammed/Desktop/tech_projects/video_evaluator/www.youtube.com_cookies.txt',
):
    """Collect metadata for qualifying videos of each channel and download them.

    A video qualifies when its duration is at most ``duration_limit`` and its
    view count is at least the channel's follower count. Each qualifying video
    is downloaded into ``download_folder`` under a freshly generated UUID.

    Args:
        channel_urls: Iterable of channel URLs to process.
        duration_limit: Maximum ``duration`` value (as reported by yt-dlp) a
            video may have to be kept.
        download_folder: Directory for downloaded files; created if missing.
        request_delay: Seconds to sleep before each channel request and around
            each download, to throttle traffic.
        cookiefile: Path passed to yt-dlp as ``cookiefile`` for metadata
            extraction; ``None`` omits the option.

    Returns:
        A ``pandas.DataFrame`` with one row per qualifying video (video fields
        plus the channel fields). Empty when nothing qualified.
    """
    os.makedirs(download_folder, exist_ok=True)

    data = []

    channel_urls = list(channel_urls or [])
    if not channel_urls:
        # Nothing to scrape: skip creating a YoutubeDL session altogether.
        return pd.DataFrame(data)

    ydl_opts_metadata = {
        'quiet': True,
        'extract_flat': False,
        'skip_download': True,
        'ignoreerrors': True,
        'force_generic_extractor': False,
        'force_ipv4': True,
    }
    if cookiefile is not None:
        ydl_opts_metadata['cookiefile'] = cookiefile

    with yt_dlp.YoutubeDL(ydl_opts_metadata) as ydl:
        print("Starting channel scraping...")
        for channel_url in channel_urls:
            print(f"Processing channel: {channel_url}")
            # Best-effort boundary: a failure on one channel must not stop the others.
            try:
                time.sleep(request_delay)
                channel_info = ydl.extract_info(channel_url, download=False)
                if not channel_info:
                    # 'ignoreerrors' turns extraction failures into a None result.
                    print(f"Error processing {channel_url}: no metadata returned")
                    continue

                follower_count = channel_info.get('channel_follower_count')
                channel_data = {
                    'channel_name': channel_info.get('channel'),
                    'channel_url': channel_info.get('channel_url'),
                    'channel_description': channel_info.get('description'),
                    'channel_subscriber_count': follower_count,
                    'channel_join_date': channel_info.get('channel_created'),
                }
                print(f"Processing channel: {channel_info.get('channel')}")
                if follower_count is None:
                    print(f"No follower count for {channel_url}; no video can qualify")

                for entry in _iter_video_entries(channel_info):
                    print(f"Processing video: {entry.get('title')}")
                    if not _entry_qualifies(entry, duration_limit, follower_count):
                        # Metadata is already in memory, so skipped videos cost no delay.
                        continue

                    time.sleep(request_delay)
                    video_url = entry.get('webpage_url')
                    video_comments = [c.get('text') for c in entry.get('comments') or []]
                    video_id = str(uuid.uuid4())

                    print(f"Downloading video {video_url}")
                    time.sleep(request_delay)
                    downloaded, resolution = _download_video(video_url, download_folder, video_id)

                    video_data = {
                        'id': video_id,
                        'video_title': entry.get('title'),
                        'video_description': entry.get('description'),
                        'video_url': video_url,
                        'video_publish_date': entry.get('upload_date'),
                        'video_tags': entry.get('tags'),
                        'views': entry.get('view_count'),
                        'comments': video_comments,
                        'comment_count': entry.get('comment_count'),
                        'likes': entry.get('like_count'),
                        'dislikes': entry.get('dislike_count'),
                        'resolution': resolution,
                        'downloaded': downloaded,
                        **channel_data
                    }

                    data.append(video_data)
                    print(f"Processed video {video_url}")

            except Exception as e:
                print(f"Error processing {channel_url}: {e}")

    return pd.DataFrame(data)


# Channel IDs of the brands to scrape, keyed by YouTube handle
# (replace with your own channels).
_BRAND_CHANNEL_IDS = {
    '@Nike': 'UCNhxq7He5p-_FdBh0OaxcQg',
    '@adidas': 'UCuLUOxd7ezJ8c6NSLBNRRfg',
    '@McDonalds': 'UCbpUSVxiBSjc0CHm0ksuJgw',
    '@Apple': 'UCYFQ33UIPERYx8-ZHucZbDA',
    '@Samsung': 'UCnEdfCdbxJJ9ouWKLSRCRRw',
    '@redbull': 'UC8VddvuHJzIj__Ud0rY2_ww',
    '@GoPro': 'UC-WMwOzgFdvvGVLB1EZ-n-w',
    '@Uber': 'UC1xnncYc7586km_rIYQLtLQ',
    '@Netflix': 'UCGie8GMlUo3kBKIopdvumVQ',
    '@disney': 'UCIrgJInjLS2BhlHOMDW7v0g',
    '@intel': 'UC9G8DcGtPfHsVEfUTM_TjEw',
    '@amazon': 'UCxGq825hl0AHP18I9-JGKgg',
    '@Microsoft': 'UCnba_sSOe_umiHCpYYvRCqQ',
    '@Sony': 'UCDCIVTeg-_V-xmFV8kAkblg',
}

# Canonical channel URLs, in the same order as the mapping above.
channel_urls = [
    f'https://www.youtube.com/channel/{brand_channel_id}'
    for brand_channel_id in _BRAND_CHANNEL_IDS.values()
]

# Run the scrape (this also downloads the videos), then persist the metadata.
df = scrape_channel_videos(channel_urls)
df.to_csv('youtube_videos_metadata.csv', index=False)

# Last expression of the cell: preview the first rows.
df.head()


Starting channel scraping...
Processing channel: https://www.youtube.com/channel/UCuLUOxd7ezJ8c6NSLBNRRfg


KeyboardInterrupt: 

In [8]:
import re
import requests

def get_channel_id_from_handle(handle_url):
    """
    Resolve a YouTube handle page (e.g. https://www.youtube.com/@Nike) to its
    channel ID by scraping the page source.

    Returns the first ``UC...`` channel ID found in the page, or None when the
    request fails, the response status is not 200, or no ID is present.
    """
    try:
        page = requests.get(handle_url, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {handle_url}: {e}")
        return None

    if page.status_code != 200:
        return None

    # The page source is searched for a JSON fragment of the form
    # "channelId":"UCXXXX..."; the first occurrence wins.
    found = re.search(r'"channelId":"(UC[0-9A-Za-z_\-]+)"', page.text)
    return found.group(1) if found else None


# Brand handles whose channel IDs should be resolved.
_BRAND_HANDLES = (
    "Nike",
    "adidas",
    "CocaCola",
    "McDonalds",
    "Apple",
    "Samsung",
    "redbull",
    "GoPro",
    "Airbnb",
    "Uber",
    "Netflix",
    "disney",
    "starbucks",
    "Walmart",
    "intel",
    "amazon",
    "Microsoft",
    "Sony",
)

channel_urls = [f"https://www.youtube.com/@{handle}" for handle in _BRAND_HANDLES]

# Print every resolved channel as a ready-to-paste list entry:
#   '<channel URL>', #<handle URL>
# Handles that could not be resolved are skipped without output.
for url in channel_urls:
    channel_id = get_channel_id_from_handle(url)
    if not channel_id:
        continue
    print(f"'https://www.youtube.com/channel/{channel_id}', #{url}")


'https://www.youtube.com/channel/UCNhxq7He5p-_FdBh0OaxcQg', #https://www.youtube.com/@Nike
'https://www.youtube.com/channel/UCuLUOxd7ezJ8c6NSLBNRRfg', #https://www.youtube.com/@adidas
'https://www.youtube.com/channel/UCbpUSVxiBSjc0CHm0ksuJgw', #https://www.youtube.com/@McDonalds
'https://www.youtube.com/channel/UCYFQ33UIPERYx8-ZHucZbDA', #https://www.youtube.com/@Apple
'https://www.youtube.com/channel/UCnEdfCdbxJJ9ouWKLSRCRRw', #https://www.youtube.com/@Samsung
'https://www.youtube.com/channel/UC8VddvuHJzIj__Ud0rY2_ww', #https://www.youtube.com/@redbull
'https://www.youtube.com/channel/UC-WMwOzgFdvvGVLB1EZ-n-w', #https://www.youtube.com/@GoPro
'https://www.youtube.com/channel/UC1xnncYc7586km_rIYQLtLQ', #https://www.youtube.com/@Uber
'https://www.youtube.com/channel/UCGie8GMlUo3kBKIopdvumVQ', #https://www.youtube.com/@Netflix
'https://www.youtube.com/channel/UCIrgJInjLS2BhlHOMDW7v0g', #https://www.youtube.com/@disney
'https://www.youtube.com/channel/UC9G8DcGtPfHsVEfUTM_TjEw', #https://w