In [2]:
import requests
import pandas as pd
import time
import os
from urllib.parse import urlparse, unquote

# **Configuration:** Insert your YouTube Data API v3 key here
API_KEY = "AIzaSyARDSUjc-E8I5lDjAeut1Zy1lrPSb4OGqY"

# # **Input:** List of YouTube channel URLs to process
# channel_urls = [
#     'https://www.youtube.com/channel/UCNhxq7He5p-_FdBh0OaxcQg', #https://www.youtube.com/@Nike
#     'https://www.youtube.com/channel/UCuLUOxd7ezJ8c6NSLBNRRfg', #https://www.youtube.com/@adidas
#     'https://www.youtube.com/channel/UCbpUSVxiBSjc0CHm0ksuJgw', #https://www.youtube.com/@McDonalds
#     'https://www.youtube.com/channel/UCYFQ33UIPERYx8-ZHucZbDA', #https://www.youtube.com/@Apple
#     'https://www.youtube.com/channel/UCnEdfCdbxJJ9ouWKLSRCRRw', #https://www.youtube.com/@Samsung
#     'https://www.youtube.com/channel/UC8VddvuHJzIj__Ud0rY2_ww', #https://www.youtube.com/@redbull
#     'https://www.youtube.com/channel/UC-WMwOzgFdvvGVLB1EZ-n-w', #https://www.youtube.com/@GoPro
#     'https://www.youtube.com/channel/UC1xnncYc7586km_rIYQLtLQ', #https://www.youtube.com/@Uber
#     'https://www.youtube.com/channel/UCGie8GMlUo3kBKIopdvumVQ', #https://www.youtube.com/@Netflix
#     'https://www.youtube.com/channel/UCIrgJInjLS2BhlHOMDW7v0g', #https://www.youtube.com/@disney
#     'https://www.youtube.com/channel/UC9G8DcGtPfHsVEfUTM_TjEw', #https://www.youtube.com/@intel
#     'https://www.youtube.com/channel/UCxGq825hl0AHP18I9-JGKgg', #https://www.youtube.com/@amazon
#     'https://www.youtube.com/channel/UCnba_sSOe_umiHCpYYvRCqQ', #https://www.youtube.com/@Microsoft
#     'https://www.youtube.com/channel/UCDCIVTeg-_V-xmFV8kAkblg', #https://www.youtube.com/@Sony
#     ]


channel_urls = [
    "https://www.youtube.com/@Nike",
    "https://www.youtube.com/@adidas",
    "https://www.youtube.com/@Coca-Cola",
    "https://www.youtube.com/@McDonalds",
    "https://www.youtube.com/@Apple",
    "https://www.youtube.com/@Samsung",
    "https://www.youtube.com/@redbull",
    "https://www.youtube.com/@GoPro",
    "https://www.youtube.com/@Airbnb",
    "https://www.youtube.com/@Uber",
    "https://www.youtube.com/@Netflix",
    "https://www.youtube.com/@disney",
    "https://www.youtube.com/@starbucks",
    "https://www.youtube.com/@Walmart",
    "https://www.youtube.com/@intel",
    "https://www.youtube.com/@amazon",
    "https://www.youtube.com/@Microsoft",
    "https://www.youtube.com/@Sony",
]


In [44]:
def parse_iso8601_duration(duration: str) -> int:
    """
    Parse an ISO 8601 duration string (e.g. 'PT1H2M3S', 'PT4M5S', 'PT30S') to total seconds.
    """
    time_str = duration.replace('PT', '')  # remove the 'PT' prefix
    hours = minutes = seconds = 0
    if 'H' in time_str:
        hours_part, time_str = time_str.split('H')
        hours = int(hours_part) if hours_part else 0
    if 'M' in time_str:
        minutes_part, time_str = time_str.split('M')
        minutes = int(minutes_part) if minutes_part else 0
    if 'S' in time_str:
        seconds_part = time_str.split('S')[0]
        seconds = int(seconds_part) if seconds_part else 0
    return hours * 3600 + minutes * 60 + seconds

def get_channel_data(api_key: str, channel_url: str):
    """
    Given a YouTube channel URL (handle, channel ID, legacy username, or custom URL),
    retrieve the channel's unique ID, snippet, statistics, and uploads playlist ID.
    Returns a dictionary with channel metadata and the uploads playlist ID.
    """
    url = channel_url.strip()
    # Prepend scheme if the URL is provided in shorthand (e.g. "/@handle")
    if url.startswith("/"):
        url = "https://www.youtube.com" + url
    parsed = urlparse(url)
    path = parsed.path  # e.g. "/@ChannelHandle", "/channel/UC123...", "/user/Name", "/c/Custom"
    
    base_channels_url = "https://www.googleapis.com/youtube/v3/channels"
    params = {"part": "snippet,contentDetails,statistics", "key": api_key}
    
    # Determine query param based on URL pattern
    if path.startswith("/channel/"):  
        # URL form: /channel/UCxxxxxxxx
        channel_id = path.split("/channel/")[1]
        params["id"] = channel_id
    elif path.startswith("/user/"):    
        # URL form: /user/Username (legacy username)
        username = path.split("/user/")[1]
        params["forUsername"] = username
    elif path.startswith("/@"):       
        # URL form: /@Handle (new handle format)
        handle = path.split("/@")[1]
        # Remove trailing slash if any
        handle = handle.rstrip("/")
        # The API forHandle parameter accepts handle with or without 】
        params["forHandle"] = handle
    elif path.startswith("/c/"):      
        # URL form: /c/CustomName (legacy custom URL not directly supported by API)
        custom = path.split("/c/")[1]
        custom = custom.rstrip("/")
        custom = unquote(custom)  # decode URL encoding if any
        # Use search API to resolve custom channel name to channelId
        search_url = "https://www.googleapis.com/youtube/v3/search"
        search_params = {
            "part": "snippet",
            "q": custom,
            "type": "channel",
            "maxResults": 1,
            "key": api_key
        }
        time.sleep(5)
        resp = requests.get(search_url, params=search_params)
        data = resp.json()
        if data.get("items"):
            channel_id = data["items"][0]["id"]["channelId"]
            params["id"] = channel_id
        else:
            raise ValueError(f"Channel not found for custom URL: {channel_url}")
    else:
        # Perhaps a direct channel ID (starting with UC) or just a handle without prefix
        identifier = path.strip("/")
        if identifier.startswith("UC"):
            params["id"] = identifier
        elif identifier:  # possibly a handle without the @ or some custom string
            params["forHandle"] = identifier
        else:
            raise ValueError(f"Unrecognized channel URL format: {channel_url}")
    
    # Call the Channels API to get channel info
    time.sleep(5)
    resp = requests.get(base_channels_url, params=params)
    data = resp.json()
    if not data.get("items"):
        raise ValueError(f"No channel data found for URL: {channel_url}")
    channel_info = data["items"][0]
    
    # Extract channel metadata
    channel_id = channel_info.get("id")
    snippet = channel_info.get("snippet", {})
    stats = channel_info.get("statistics", {})
    content = channel_info.get("contentDetails", {})
    
    channel_name = snippet.get("title", "")
    channel_description = snippet.get("description", "")
    channel_join_date = snippet.get("publishedAt", "")  # Channel creation date
    subscriber_count = stats.get("subscriberCount")
    if subscriber_count is not None:
        subscriber_count = int(subscriber_count)
    uploads_playlist_id = None
    if content.get("relatedPlaylists"):
        uploads_playlist_id = content["relatedPlaylists"].get("uploads")
    
    return {
        "channel_id": channel_id,
        "channel_name": channel_name,
        "channel_description": channel_description,
        "channel_join_date": channel_join_date,
        "subscriber_count": subscriber_count,
        "uploads_playlist_id": uploads_playlist_id,
        "channel_url": channel_url
    }

# Quick test of duration parser:
test_durations = ["PT1M37S", "PT2M", "PT45S", "PT1H2M3S", "PT98S"]
print("Duration parsing test:", {d: parse_iso8601_duration(d) for d in test_durations})


Duration parsing test: {'PT1M37S': 97, 'PT2M': 120, 'PT45S': 45, 'PT1H2M3S': 3723, 'PT98S': 98}


In [45]:
video_records = []  # to collect video data dictionaries for DataFrame

for url in channel_urls:
    try:
        channel_data = get_channel_data(API_KEY, url)
    except Exception as e:
        print(f"Error retrieving channel data for {url}: {e}")
        continue
    
    channel_name = channel_data["channel_name"]
    print(f"\nProcessing channel: {channel_name}")
    uploads_playlist_id = channel_data.get("uploads_playlist_id")
    if not uploads_playlist_id:
        print(f"No uploads playlist found for channel {channel_name}. Skipping.")
        continue
    
    # Retrieve all videos from the uploads playlist
    playlist_items_url = "https://www.googleapis.com/youtube/v3/playlistItems"
    playlist_params = {
        "part": "snippet,contentDetails",
        "playlistId": uploads_playlist_id,
        "maxResults": 50,
        "key": API_KEY
    }
    videos_in_playlist = []
    while True:
        time.sleep(1)
        resp = requests.get(playlist_items_url, params=playlist_params)
        data = resp.json()
        if "error" in data:
            print(f"API error when fetching playlist items: {data['error'].get('message')}")
            break
        items = data.get("items", [])
        if not items:
            break
        for item in items:
            # snippet contains title, publishedAt; contentDetails contains videoId
            vid_id = item["contentDetails"]["videoId"]
            vid_title = item["snippet"].get("title", "")
            vid_published = item["snippet"].get("publishedAt", "")
            videos_in_playlist.append({
                "video_id": vid_id,
                "title": vid_title,
                "published_at": vid_published
            })
        # Pagination: check if there's another page
        if "nextPageToken" in data:
            playlist_params["pageToken"] = data["nextPageToken"]
        else:
            break
    
    print(f"Found {len(videos_in_playlist)} videos in channel {channel_name}.")
    
    if not videos_in_playlist:
        continue
    
    # Now retrieve detailed video info in batches of 50
    video_ids = [v["video_id"] for v in videos_in_playlist]
    for i in range(0, len(video_ids), 50):
        batch_ids = video_ids[i:i+50]
        videos_url = "https://www.googleapis.com/youtube/v3/videos"
        videos_params = {
            "part": "snippet,contentDetails,statistics",
            "id": ",".join(batch_ids),
            "key": API_KEY
        }
        time.sleep(1)
        resp = requests.get(videos_url, params=videos_params)
        data = resp.json()
        if "items" not in data:
            continue
        for item in data["items"]:
            print(f"Processing video {i+1} of {len(videos_in_playlist)} in channel {channel_name}...")
            vid_id = item["id"]
            snippet = item.get("snippet", {})
            content_details = item.get("contentDetails", {})
            stats = item.get("statistics", {})
            
            # Video metadata fields
            title = snippet.get("title", "")
            description = snippet.get("description", "")
            published_at = snippet.get("publishedAt", "")
            tags = snippet.get("tags", [])
            duration_iso = content_details.get("duration", "")
            duration_seconds = parse_iso8601_duration(duration_iso) if duration_iso else None
            view_count = int(stats.get("viewCount", 0))
            like_count = int(stats.get("likeCount", 0)) if "likeCount" in stats else None
            # Dislike count may not be available (private as of Dec 2021】
            dislike_count = int(stats.get("dislikeCount", 0)) if "dislikeCount" in stats else None
            comment_count = int(stats.get("commentCount", 0)) if "commentCount" in stats else 0
            
            # # Filter by video length < 98 seconds
            # if duration_seconds is None or duration_seconds >= 98:
            #     continue  # skip videos 98s or longer
            
            video_url = f"https://www.youtube.com/watch?v={vid_id}"
            
            # Prepare a record dict for this video
            record = {
                "channel_name": channel_name,
                "channel_id": channel_data["channel_id"],
                "channel_url": channel_data["channel_url"],
                "channel_description": channel_data["channel_description"],
                "channel_join_date": channel_data["channel_join_date"],
                "subscriber_count": channel_data["subscriber_count"],
                "video_id": vid_id,
                "video_url": video_url,
                "title": title,
                "description": description,
                "published_at": published_at,
                "tags": ",".join(tags) if tags else "",
                "view_count": view_count,
                "like_count": like_count,
                "dislike_count": dislike_count,
                "comment_count": comment_count,
                "duration_seconds": duration_seconds
            }
            video_records.append(record)
            print(f"Processed video {i+1} of {len(videos_in_playlist)} in channel {channel_name}...")



Processing channel: Nike
Found 566 videos in channel Nike.
Processing video 1 of 566 in channel Nike...
Processed video 1 of 566 in channel Nike...
Processing video 1 of 566 in channel Nike...
Processed video 1 of 566 in channel Nike...
Processing video 1 of 566 in channel Nike...
Processed video 1 of 566 in channel Nike...
Processing video 1 of 566 in channel Nike...
Processed video 1 of 566 in channel Nike...
Processing video 1 of 566 in channel Nike...
Processed video 1 of 566 in channel Nike...
Processing video 1 of 566 in channel Nike...
Processed video 1 of 566 in channel Nike...
Processing video 1 of 566 in channel Nike...
Processed video 1 of 566 in channel Nike...
Processing video 1 of 566 in channel Nike...
Processed video 1 of 566 in channel Nike...
Processing video 1 of 566 in channel Nike...
Processed video 1 of 566 in channel Nike...
Processing video 1 of 566 in channel Nike...
Processed video 1 of 566 in channel Nike...
Processing video 1 of 566 in channel Nike...
Proce

In [49]:
# print(len(video_records))
# video_records_df = pd.DataFrame(video_records)
try:
    video_records_df = pd.read_csv('youtube_videos_data.csv')
    print(f"Successfully loaded {len(video_records_df)} records")
    display(video_records_df.head())
except FileNotFoundError:
    print("Error: File not found. Save your DataFrame first using:")
    print("video_records_df.to_csv('youtube_videos_data.csv', index=False)")
    
# Duplicate detection and removal
print(f"Original records: {len(video_records_df)}")
dupes = video_records_df.duplicated().sum()
print(f"Duplicate rows found: {dupes}")

if dupes > 0:
    print("\nDuplicate examples:")
    display(video_records_df[video_records_df.duplicated(keep=False)].sort_values(by='video_id').head(2))
    
    # Remove duplicates while keeping first occurrence
    video_records_df = video_records_df.drop_duplicates()
    print(f"\nRemoved {dupes} duplicates. New total: {len(video_records_df)}")
else:
    print("No duplicates found in the dataset")

# display(video_records_df.tail()) 
# Apply both filter conditions using logical AND (&)
filtered_df = video_records_df[
    (video_records_df['duration_seconds'] <= 98) &
    (video_records_df['view_count'] >= 0.5 * video_records_df['subscriber_count'])
]

# Display filtered results
print(len(filtered_df))
display(filtered_df)

# Save filtered DataFrame to CSV
filtered_df.to_csv("filtered_youtube_videos_data.csv", index=False)

print(f"\nSaved filtered data for {len(filtered_df)} videos to filtered_youtube_videos_data.csv")


Successfully loaded 36519 records


Unnamed: 0,channel_name,channel_id,channel_url,channel_description,channel_join_date,subscriber_count,video_id,video_url,title,description,published_at,tags,view_count,like_count,dislike_count,comment_count,duration_seconds
0,Nike,UCUFgkRb0ZHc4Rpq15VRCICA,https://www.youtube.com/@Nike,You can't win. So Win.\n\nFor more information...,2006-03-08T03:18:02Z,2110000,u9Nt6a72HwE,https://www.youtube.com/watch?v=u9Nt6a72HwE,The A’ONE Experience. Engineered to the exact ...,,2025-04-04T20:54:24Z,"nike commercial,nike",2848,113.0,,4,19
1,Nike,UCUFgkRb0ZHc4Rpq15VRCICA,https://www.youtube.com/@Nike,You can't win. So Win.\n\nFor more information...,2006-03-08T03:18:02Z,2110000,neO9ABCWm9k,https://www.youtube.com/watch?v=neO9ABCWm9k,Compete. Win. Repeat. The Mal Swanson formula....,,2025-03-23T16:00:15Z,,1861,119.0,,6,137
2,Nike,UCUFgkRb0ZHc4Rpq15VRCICA,https://www.youtube.com/@Nike,You can't win. So Win.\n\nFor more information...,2006-03-08T03:18:02Z,2110000,6f6NOsAs_BU,https://www.youtube.com/watch?v=6f6NOsAs_BU,"The harder the hustle, the longer the receipt....",,2025-03-22T22:48:12Z,,2027,117.0,,1,19
3,Nike,UCUFgkRb0ZHc4Rpq15VRCICA,https://www.youtube.com/@Nike,You can't win. So Win.\n\nFor more information...,2006-03-08T03:18:02Z,2110000,M8GrA3___KI,https://www.youtube.com/watch?v=M8GrA3___KI,Losing isn’t Juju Watkins' aesthetic.,,2025-03-22T18:00:40Z,,4341,337.0,,12,77
4,Nike,UCUFgkRb0ZHc4Rpq15VRCICA,https://www.youtube.com/@Nike,You can't win. So Win.\n\nFor more information...,2006-03-08T03:18:02Z,2110000,OoVA_KGuaGw,https://www.youtube.com/watch?v=OoVA_KGuaGw,Talk is cheap but the work is expensive. #Nike,,2025-03-22T16:00:48Z,,2382,163.0,,2,19


Original records: 36519
Duplicate rows found: 0
No duplicates found in the dataset
1170


Unnamed: 0,channel_name,channel_id,channel_url,channel_description,channel_join_date,subscriber_count,video_id,video_url,title,description,published_at,tags,view_count,like_count,dislike_count,comment_count,duration_seconds
28,Nike,UCUFgkRb0ZHc4Rpq15VRCICA,https://www.youtube.com/@Nike,You can't win. So Win.\n\nFor more information...,2006-03-08T03:18:02Z,2110000,b0Ezn5pZE7o,https://www.youtube.com/watch?v=b0Ezn5pZE7o,So Win. | Nike,There’s one guarantee in sport. You’ll be told...,2025-02-10T00:58:09Z,"nike commercial,nike,Super Bowl,You Can't Win ...",3259071,19836.0,,997,61
35,Nike,UCUFgkRb0ZHc4Rpq15VRCICA,https://www.youtube.com/@Nike,You can't win. So Win.\n\nFor more information...,2006-03-08T03:18:02Z,2110000,xMrUkDYvbDc,https://www.youtube.com/watch?v=xMrUkDYvbDc,I Told You So | Nike,The best athletes are open books.\nThey’ll tel...,2024-12-20T18:00:57Z,"nike commercial,nike",2506795,4247.0,,86,31
49,Nike,UCUFgkRb0ZHc4Rpq15VRCICA,https://www.youtube.com/@Nike,You can't win. So Win.\n\nFor more information...,2006-03-08T03:18:02Z,2110000,JfK0mHEy0po,https://www.youtube.com/watch?v=JfK0mHEy0po,Joy | Nike,Feeling great doesn’t always feel good.\n\nSub...,2024-09-10T23:39:51Z,"nike commercial,Nike,Nike Running,Joyful Run,V...",1865917,4622.0,,71,31
68,Nike,UCUFgkRb0ZHc4Rpq15VRCICA,https://www.youtube.com/@Nike,You can't win. So Win.\n\nFor more information...,2006-03-08T03:18:02Z,2110000,pwLergHG81c,https://www.youtube.com/watch?v=pwLergHG81c,WINNING ISN’T FOR EVERYONE | AM I A BAD PERSON...,"You can’t win them all, but you should sure as...",2024-07-19T14:04:09Z,"nike commercial,nike",3419040,50727.0,,3425,91
76,Nike,UCUFgkRb0ZHc4Rpq15VRCICA,https://www.youtube.com/@Nike,You can't win. So Win.\n\nFor more information...,2006-03-08T03:18:02Z,2110000,C_BZQkU5Cds,https://www.youtube.com/watch?v=C_BZQkU5Cds,WHAT IF YOU CAN? | Nike,Imagine being a team player who has never met ...,2024-05-10T16:00:12Z,"nike,Kids Sport,Middle East,Football,Soccer,Gy...",37659378,194282.0,,715,96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35704,Sony,UCVjS9AuBloqJJjhsy3vIfug,https://www.youtube.com/@Sony,The Official Global YouTube channel for Sony.\...,2005-09-30T18:45:07Z,667000,9UWbASEi7_o,https://www.youtube.com/watch?v=9UWbASEi7_o,Sony develops the world’s first ghost catching...,Find out more here: http://www.proton-pack.com/,2016-03-31T15:00:10Z,"yt:cc=on,Ghostbusters,Proton Pack,Ect 1,Paul F...",349115,1819.0,,203,49
35726,Sony,UCVjS9AuBloqJJjhsy3vIfug,https://www.youtube.com/@Sony,The Official Global YouTube channel for Sony.\...,2005-09-30T18:45:07Z,667000,BbEf2i6RwLI,https://www.youtube.com/watch?v=BbEf2i6RwLI,"""Create Better Space"" - Concept movie for Glas...",Sony's Glass Sound Speaker fills every corner ...,2016-01-20T07:09:15Z,"yt:cc=on,Glass Sound Speaker,Glass Sound,Speak...",492545,398.0,,17,93
35819,Sony,UCVjS9AuBloqJJjhsy3vIfug,https://www.youtube.com/@Sony,The Official Global YouTube channel for Sony.\...,2005-09-30T18:45:07Z,667000,x5iczT4f17Q,https://www.youtube.com/watch?v=x5iczT4f17Q,BRAVIA X9000C/X9100C series Floating Style - O...,Excite all your senses: the essential new shap...,2015-01-06T01:15:11Z,"yt:cc=on,4K,BRAVIA,TV,thin,slim,X1,TRILUMINOS,...",653695,1748.0,,130,77
36287,Sony,UCVjS9AuBloqJJjhsy3vIfug,https://www.youtube.com/@Sony,The Official Global YouTube channel for Sony.\...,2005-09-30T18:45:07Z,667000,zmmvdNChjoo,https://www.youtube.com/watch?v=zmmvdNChjoo,映画『桜蘭高校ホスト部』予告編,累計発行部数1300万部突破の人気少女コミック待望の映画化！ \r\n超セレブ高校のとんでも...,2012-03-02T06:18:21Z,"桜蘭高校ホスト部,韓哲,川口春奈,山本裕典,竜星涼,中村昌也,千葉雄大,高木心平,高木万平,...",884851,2633.0,,127,61



Saved filtered data for 1170 videos to filtered_youtube_videos_data.csv


In [None]:
# # Add a field for comments text in each record
# for record in video_records:
#     video_id = record["video_id"]
#     comment_texts = []
#     # Use commentThreads.list to get top-level comments
#     comments_url = "https://www.googleapis.com/youtube/v3/commentThreads"
#     comments_params = {
#         "part": "snippet",
#         "videoId": video_id,
#         "textFormat": "plainText",
#         "maxResults": 50,
#         "key": API_KEY
#     }
#     fetched_count = 0
#     max_to_fetch = 100  # limit to 100 comments per video for this example (adjust as needed, or remove limit to get all)
#     while True:
#         time.sleep(5)
#         resp = requests.get(comments_url, params=comments_params)
#         data = resp.json()
#         if "error" in data:
#             # If comments are disabled or any error occurs, we break
#             # (YouTube disables comments on some videos, like those for kids)
#             # We store an indication if comments are disabled or unavailable.
#             error_msg = data["error"].get("message", "Unknown error")
#             print(f"Could not fetch comments for video {video_id}: {error_msg}")
#             break
#         items = data.get("items", [])
#         for item in items:
#             top_comment = item["snippet"]["topLevelComment"]
#             comment_snippet = top_comment["snippet"]
#             comment_text = comment_snippet.get("textDisplay", "")
#             comment_texts.append(comment_text.strip())
#             fetched_count += 1
#             if fetched_count >= max_to_fetch:
#                 break
#         if fetched_count >= max_to_fetch:
#             # Reached our limit of comments to fetch
#             break
#         if "nextPageToken" in data:
#             comments_params["pageToken"] = data["nextPageToken"]
#         else:
#             break
    
#     # Join all comments into one string (separated by newlines)
#     if comment_texts:
#         record["comments"] = "\n\n".join(comment_texts)
#     else:
#         # No comments or none fetched
#         record["comments"] = ""


In [7]:
filtered_video_records_df = pd.read_csv('final_youtube_videos_data.csv')
# filtered_video_records_df = filtered_video_records_df[filtered_video_records_df["downloaded"] == False]
len(filtered_video_records_df)

739

In [None]:
from pytubefix import YouTube
import os
import time

# Create a downloads directory if it doesn't exist
download_dir = "downloaded_videos"
os.makedirs(download_dir, exist_ok=True)

start = time.time()
i = 1
video_records = []
for index, record in filtered_video_records_df.iterrows():
    if record["downloaded"]:
        video_records.append(record)
        continue
    video_id = record["video_id"]
    video_url = record["video_url"]
    print(f"Processing video {video_id}...")
    try:
        time.sleep(2)
        yt = YouTube(video_url)
        # Get the highest resolution stream that includes audio (progressive stream)
        stream = yt.streams.get_highest_resolution()
        if stream is None:
            # If for some reason no progressive stream is found, pick the highest itag progressive manually
            stream = yt.streams.filter(progressive=True).order_by('resolution').desc().first()
        if stream:
            # Download the video to the download directory with video_id as filename
            out_file = stream.download(output_path=download_dir, filename=f"{video_id}.mp4")
            record["downloaded_resolution"] = stream.resolution
            record["downloaded"] = True
            print(f"Downloaded {i}th video {video_url} to {out_file}")
            i += 1
        else:
            record["downloaded_resolution"] = None
            record["downloaded"] = False
            print(f"No downloadable stream for video {video_id}.")
    except Exception as e:
        record["downloaded_resolution"] = None
        record["downloaded"] = False
        print(f"Failed to download {video_url}: {e}")
    video_records.append(record)
end = time.time()
print(f"Download completed in {end - start} seconds")

# Save the video records to a new CSV file
video_records_df = pd.DataFrame(video_records)
video_records_df.to_csv('final_youtube_videos_data.csv', index=False)


Processing video WbjsKJ3vWD8...
Failed to download https://www.youtube.com/watch?v=WbjsKJ3vWD8: WbjsKJ3vWD8 is not available in your region
Processing video BXTDP9l77jc...
Downloaded 1th video https://www.youtube.com/watch?v=BXTDP9l77jc to /home/mhammed/Desktop/tech_projects/video_evaluator/downloaded_videos/BXTDP9l77jc.mp4
Processing video UYHjyNNy_4Y...
Downloaded 2th video https://www.youtube.com/watch?v=UYHjyNNy_4Y to /home/mhammed/Desktop/tech_projects/video_evaluator/downloaded_videos/UYHjyNNy_4Y.mp4
Processing video d-1xU0VfJ-g...
Downloaded 3th video https://www.youtube.com/watch?v=d-1xU0VfJ-g to /home/mhammed/Desktop/tech_projects/video_evaluator/downloaded_videos/d-1xU0VfJ-g.mp4
Processing video SILvPVVAhBo...
Downloaded 4th video https://www.youtube.com/watch?v=SILvPVVAhBo to /home/mhammed/Desktop/tech_projects/video_evaluator/downloaded_videos/SILvPVVAhBo.mp4
Processing video _oK1LAqIv3U...
Downloaded 5th video https://www.youtube.com/watch?v=_oK1LAqIv3U to /home/mhammed/De

In [46]:
# Create DataFrame from the records
df = pd.DataFrame(video_records)

# Save to CSV (without index)
df.to_csv("youtube_videos_data.csv", index=False)

print(f"\nSaved data for {len(df)} videos to youtube_videos_data.csv")
df.head(3)  # Display first 3 rows as a sample



Saved data for 36519 videos to youtube_videos_data.csv


Unnamed: 0,channel_name,channel_id,channel_url,channel_description,channel_join_date,subscriber_count,video_id,video_url,title,description,published_at,tags,view_count,like_count,dislike_count,comment_count,duration_seconds
0,Nike,UCUFgkRb0ZHc4Rpq15VRCICA,https://www.youtube.com/@Nike,You can't win. So Win.\n\nFor more information...,2006-03-08T03:18:02Z,2110000,u9Nt6a72HwE,https://www.youtube.com/watch?v=u9Nt6a72HwE,The A’ONE Experience. Engineered to the exact ...,,2025-04-04T20:54:24Z,"nike commercial,nike",2848,113.0,,4,19
1,Nike,UCUFgkRb0ZHc4Rpq15VRCICA,https://www.youtube.com/@Nike,You can't win. So Win.\n\nFor more information...,2006-03-08T03:18:02Z,2110000,neO9ABCWm9k,https://www.youtube.com/watch?v=neO9ABCWm9k,Compete. Win. Repeat. The Mal Swanson formula....,,2025-03-23T16:00:15Z,,1861,119.0,,6,137
2,Nike,UCUFgkRb0ZHc4Rpq15VRCICA,https://www.youtube.com/@Nike,You can't win. So Win.\n\nFor more information...,2006-03-08T03:18:02Z,2110000,6f6NOsAs_BU,https://www.youtube.com/watch?v=6f6NOsAs_BU,"The harder the hustle, the longer the receipt....",,2025-03-22T22:48:12Z,,2027,117.0,,1,19


In [None]:
import pandas as pd
import os

# Create DataFrame from new records
new_df = pd.DataFrame(video_records)

try:
    # Load existing data if available
    existing_df = pd.read_csv("youtube_videos_data.csv")
    
    # Combine datasets and remove duplicates
    combined_df = pd.concat([existing_df, new_df], ignore_index=True)
    initial_count = len(combined_df)
    combined_df = combined_df.drop_duplicates(keep='first')
    
    # Calculate stats
    new_videos_added = len(combined_df) - len(existing_df)
    duplicates_removed = initial_count - len(combined_df)
    
    # Save updated data
    combined_df.to_csv("youtube_videos_data.csv", index=False)
    
    print(f"""Update Report:
    - Existing videos: {len(existing_df):,}
    - New videos added: {new_videos_added:,}
    - Duplicates removed: {duplicates_removed:,}
    - Total unique videos: {len(combined_df):,}""")

except FileNotFoundError:
    # First-time save with duplicate check
    new_df = new_df.drop_duplicates()
    new_df.to_csv("youtube_videos_data.csv", index=False)
    print(f"Created new file with {len(new_df):,} unique videos")

# Display sample of latest data
combined_df.head(3) if 'combined_df' in locals() else new_df.head(3)
