In [None]:
from googleapiclient.discovery import build
from google.colab import drive
import pandas as pd

In [None]:
# API Key
api_key = '...'

youtube = build('youtube', 'v3', developerKey=api_key)

def get_channel_statistics(channel_id):
    channel_request = youtube.channels().list(
        part='statistics',
        id=channel_id
    )
    channel_response = channel_request.execute()
    if 'items' in channel_response and channel_response['items']:
        return channel_response['items'][0]['statistics']
    else:
        return None

# Empty frame
videos_info = []
pageToken = None

while True:
    request = youtube.search().list(
        q='income inequality',
        part='snippet',
        type='video',
        maxResults=50,
        pageToken=pageToken,
        regionCode='US'
    )
    response = request.execute()
    for item in response['items']:
        video_id = item['id']['videoId']
        channel_id = item['snippet']['channelId']
        stats = get_video_statistics(video_id)
        channel_stats = get_channel_statistics(channel_id)
        if stats and int(stats.get('commentCount', 0)) >= 10:
            videos_info.append({
                'title': item['snippet']['title'],
                'video_id': video_id,
                'description': item['snippet']['description'],
                'comment_count': stats['commentCount'],
                'view_count': stats.get('viewCount'),  # viewCount
                'published_at': item['snippet']['publishedAt'],  # time
                'channel_subscriber_count': channel_stats.get('subscriberCount') if channel_stats else 'Unknown'  # Add channel subcribers
            })
    pageToken = response.get('nextPageToken')
    if not pageToken:
        break

# DataFrame
df_videos = pd.DataFrame(videos_info)

# Result
print(f"Total collected videos: {len(df_videos)}")
print(df_videos.head())

Total collected videos: 315
                                               title     video_id  \
0                          Is inequality inevitable?  rEnf_CFoyv0   
1  How Wealth Inequality Spiraled Out of Control ...  wOI8RuhW7q0   
2  Wealth Gap: Last Week Tonight with John Oliver...  LfgSEwjAeno   
3  A Look At Income Inequality In The United Stat...  qc7g6Uhi1i4   
4  Thomas Sowell on the Myths of Economic Inequality  mS5WYp5xmvI   

                                         description comment_count view_count  \
0  Explore how economic inequality can be measure...          2246     995686   
1  Here it is: The full story of wealth inequalit...          5955     973519   
2  John Oliver discusses America's growing wealth...          8656   17937666   
3  Income inequality has always existed in the U....           209     130275   
4  Recorded on November 15, 2018 Thomas Sowell di...          2153    7832958   

           published_at channel_subscriber_count  
0  2022-10-11T15:01

In [None]:
df_videos.drop_duplicates(subset='video_id', inplace=True)
print(f"Total videos with >10 comments after removing duplicates: {len(df_videos)}")
df_videos.head()

Total videos with >10 comments after removing duplicates: 276


Unnamed: 0,title,video_id,description,comment_count,view_count,published_at,channel_subscriber_count
0,Is inequality inevitable?,rEnf_CFoyv0,Explore how economic inequality can be measure...,2246,995686,2022-10-11T15:01:01Z,19800000
1,How Wealth Inequality Spiraled Out of Control ...,wOI8RuhW7q0,Here it is: The full story of wealth inequalit...,5955,973519,2021-11-03T19:01:04Z,630000
2,Wealth Gap: Last Week Tonight with John Oliver...,LfgSEwjAeno,John Oliver discusses America's growing wealth...,8656,17937666,2014-07-14T06:30:01Z,9440000
3,A Look At Income Inequality In The United Stat...,qc7g6Uhi1i4,Income inequality has always existed in the U....,209,130275,2020-02-20T21:22:33Z,1330000
4,Thomas Sowell on the Myths of Economic Inequality,mS5WYp5xmvI,"Recorded on November 15, 2018 Thomas Sowell di...",2153,7832958,2018-12-03T15:57:22Z,902000


In [None]:
!pip install yt-dlp

Collecting yt-dlp
  Downloading yt_dlp-2024.4.9-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting brotli (from yt-dlp)
  Downloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
Collecting mutagen (from yt-dlp)
  Downloading mutagen-1.47.0-py3-none-any.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.4/194.4 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pycryptodomex (from yt-dlp)
  Downloading pycryptodomex-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
Collecting websockets>=12.0 (from yt-dlp

In [None]:
import yt_dlp
import os

In [None]:
# Define a function to get captions
def download_and_read_subtitles(video_id, lang='en'):
    # Configuring the yt-dlp option
    ydl_opts = {
        'skip_download': True,  # skip
        'writesubtitles': True,  # download captions
        'writeautomaticsub': True,  # download automatic captions
        'subtitleslangs': [lang],  # language
        'subtitlesformat': 'vtt',  # format
        'quiet': True,
        'outtmpl': '%(id)s.%(ext)s',  # output
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # download
        try:
            result = ydl.extract_info(f'https://www.youtube.com/watch?v={video_id}', download=True)
            # path to subtitle file
            subtitle_file = ydl.prepare_filename(result).replace('.webm', '.en.vtt').replace('.mp4', '.en.vtt')
            # reading
            if os.path.exists(subtitle_file):
                with open(subtitle_file, 'r', encoding='utf-8') as file:
                    subtitles_text = file.read()
                # remove docs
                os.remove(subtitle_file)
                return subtitles_text
        except Exception as e:
            print(f"Error downloading subtitles for video {video_id}: {e}")
    return None

In [None]:
df_videos['subtitles_text'] = df_videos['video_id'].apply(download_and_read_subtitles)









































































































































































































































































































































































































































































































































































































































































































































































































ERROR: [youtube] zvAFPHLFMa0: Requested format is not available. Use --list-formats for a list of available formats


Error downloading subtitles for video zvAFPHLFMa0: ERROR: [youtube] zvAFPHLFMa0: Requested format is not available. Use --list-formats for a list of available formats


























































































































































































































































In [None]:
failed_video_ids = ['-xOLaf4U8d8', 'RdIMXvGw0TA', 'Tl5Z4JG2PmU', 'zvAFPHLFMa0']
failed_videos_df = df_videos[df_videos['video_id'].isin(failed_video_ids)]
failed_videos_df['subtitles_text'] = failed_videos_df['video_id'].apply(lambda x: download_and_read_subtitles(x))
df_videos.update(failed_videos_df)













ERROR: [youtube] zvAFPHLFMa0: Requested format is not available. Use --list-formats for a list of available formats


Error downloading subtitles for video zvAFPHLFMa0: ERROR: [youtube] zvAFPHLFMa0: Requested format is not available. Use --list-formats for a list of available formats


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  failed_videos_df['subtitles_text'] = failed_videos_df['video_id'].apply(lambda x: download_and_read_subtitles(x))


In [None]:
# Check the error rows
video_ids_to_check = ['-xOLaf4U8d8', 'RdIMXvGw0TA', 'Tl5Z4JG2PmU', 'zvAFPHLFMa0']
checked_videos_df = df_videos[df_videos['video_id'].isin(video_ids_to_check)][['video_id', 'subtitles_text']]
print(checked_videos_df)

        video_id                                     subtitles_text
53   -xOLaf4U8d8  WEBVTT\nKind: captions\nLanguage: en\n\n00:00:...
100  RdIMXvGw0TA  WEBVTT\nKind: captions\nLanguage: en\n\n00:00:...
182  Tl5Z4JG2PmU  WEBVTT\nKind: captions\nLanguage: en\n\n00:00:...
242  zvAFPHLFMa0                                               None


In [None]:
none_string_rows = df_videos[df_videos['subtitles_text'] == "None"]
print(none_string_rows)

                                                 title     video_id  \
30   Explained | Racial Wealth Gap | FULL EPISODE |...  Mqrhn8khGLM   
34                     Understanding Wealth Inequality  6c21XMMH5Vg   
36                             EconMovies #17- Endgame  CHReFKy_Mi4   
62                  The economics of income inequality  hGHN9u5d5vQ   
68   Fairness for Children - Income and Education I...  VLDMPg5op28   
71               How to Actually Fix Income Inequality  3eMj1kskVTY   
80   Income inequality is causing poor and middle-c...  t-Q-EZ42FCg   
102  Can widening income inequality ever be bridged...  Xel4L2c1tbQ   
105  Obama&#39;s Address Fails to Look at Roots of ...  s3adl8FAYng   
113          America is nearing peak income inequality  Lc9Tz_6EVH4   
117             Bloomberg: Income Inequality is Great!  pG9iJ2_fqzE   
118  Income Inequality in America: Across Lake Prov...  TtwsPx7MST0   
130             Exploring Income and Wealth Inequality  kKKyDz4FPno   
153   

In [None]:
none_string_rows['subtitles_text'] = none_string_rows['video_id'].apply(lambda x: download_and_read_subtitles(x))

























































































ERROR: [youtube] zvAFPHLFMa0: Requested format is not available. Use --list-formats for a list of available formats


Error downloading subtitles for video zvAFPHLFMa0: ERROR: [youtube] zvAFPHLFMa0: Requested format is not available. Use --list-formats for a list of available formats






































A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  none_string_rows['subtitles_text'] = none_string_rows['video_id'].apply(lambda x: download_and_read_subtitles(x))


In [None]:
none_string_rows

Unnamed: 0,title,video_id,description,comment_count,view_count,published_at,channel_subscriber_count,subtitles_text,cleaned_subtitles
30,Explained | Racial Wealth Gap | FULL EPISODE |...,Mqrhn8khGLM,"In partnership with Vox Media Studios and Vox,...",2861,1469023.0,2020-04-17T13:00:03Z,28100000,,
34,Understanding Wealth Inequality,6c21XMMH5Vg,We've talked about public goods and externalit...,412,29424.0,2022-02-28T16:22:14Z,2960000,,
36,EconMovies #17- Endgame,CHReFKy_Mi4,"Hey internet, this is Jacob Clifford. Avengers...",96,71527.0,2019-08-26T17:52:48Z,874000,,
62,The economics of income inequality,hGHN9u5d5vQ,Corey Packer and Antony Davies discuss topics ...,22,2776.0,2019-08-11T01:10:11Z,5280,,
68,Fairness for Children - Income and Education I...,VLDMPg5op28,The latest Innocenti Report Card raises concer...,18,70781.0,2016-04-13T22:34:50Z,11100,,
71,How to Actually Fix Income Inequality,3eMj1kskVTY,Audience Question: How do we fix income inequa...,194,14002.0,2020-09-14T19:00:00Z,2120000,,
80,Income inequality is causing poor and middle-c...,t-Q-EZ42FCg,With some parents spending as much as $35000 a...,51,3849.0,2019-08-14T19:34:40Z,5740000,,
102,Can widening income inequality ever be bridged...,Xel4L2c1tbQ,The wealth of the world's billionaires has rea...,16,5090.0,2020-10-08T17:30:01Z,13200000,,
105,Obama&#39;s Address Fails to Look at Roots of ...,s3adl8FAYng,"David Cay Johnston: Put in historical context,...",16,3583.0,2014-01-29T18:47:54Z,970000,,
113,America is nearing peak income inequality,Lc9Tz_6EVH4,According to an Economic Policy Institute stud...,28,3404.0,2018-07-27T15:42:30Z,5740000,,


I have to remove video_id: zvAFPHLFMa0 due to the error can not be fixed.

In [None]:
# Define a function to clean the caption
import re
def clean_and_deduplicate_subtitles(subtitles_text):
    # remove time and html
    clean_text = re.sub(r'\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}.*?\n', '', subtitles_text)
    clean_text = re.sub(r'<[^>]+>', '', clean_text)

    # spliting sentences
    sentences = clean_text.split('\n')

    # remove duplicates
    seen = set()
    deduplicated_sentences = [x for x in sentences if not (x in seen or seen.add(x))]

    # reback
    final_text = '\n'.join(deduplicated_sentences)

    return final_text.strip()

In [None]:
# transform into character
df_videos['subtitles_text'] = df_videos['subtitles_text'].astype(str).fillna('')
df_videos['cleaned_subtitles'] = df_videos['subtitles_text'].apply(clean_and_deduplicate_subtitles)

In [None]:
def better_text(subtitle_text):
    lines = subtitle_text.split('\n')
    filtered_lines = [line for line in lines if line not in ['WEBVTT', 'Kind: captions', 'Language: en', '']]
    cleaned_text = ' '.join(filtered_lines)
    return cleaned_text

In [None]:
df_videos['cleaned_subtitles'] = df_videos['cleaned_subtitles'].apply(better_text)

In [None]:
none_subtitles_count = df_videos['subtitles_text'].isnull().sum()
print(f"Number of rows with 'None' in 'subtitles_text': {none_subtitles_count}")

Number of rows with 'None' in 'subtitles_text': 0


In [None]:
print(df_videos.iloc[30])

title                       Explained | Racial Wealth Gap | FULL EPISODE |...
video_id                                                          Mqrhn8khGLM
description                 In partnership with Vox Media Studios and Vox,...
comment_count                                                            2861
view_count                                                            1469023
published_at                                             2020-04-17T13:00:03Z
channel_subscriber_count                                             28100000
subtitles_text                                                           None
cleaned_subtitles                                                        None
Name: 30, dtype: object


In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
file_path = '/.../videos_index.csv'
df_videos.to_csv(file_path, index=False)