In [71]:
# Load libraries and constants
import os
from glob import glob
import re
from datetime import datetime
import urllib
from googleapiclient.discovery import build
import pandas as pd

DEFAULT_SECRETS_DIR = os.path.join('../configs', 'secrets.yaml')
PUBLISHED_AFTER = datetime.strptime("1/1/2017", '%d/%m/%Y').isoformat() + 'Z'
MAX_PAGE_SIZE = 50 # no benefit if this number is anything other than 50
VIDEO_DOWNLOAD_DIRECTORY="../data/youtube_dataset/videos"
VIDEO_DATA_PATH="../data/youtube_dataset/video_data.json"
TRANSCRIPTS_DATA_PATH = "../data/youtube_dataset/transcripts.json"
CACHE_PATH = "../data/cache"
GOOGLE_API_CLIENT_SECRETS_FILE = "../configs/client_secret_google_api.json"

def load_video_data(data_path=VIDEO_DATA_PATH):
    if os.path.exists(VIDEO_DATA_PATH):
        vid_df = pd.read_json(VIDEO_DATA_PATH, orient='index')
        vid_df.index.name = 'id'
        vid_df.publish_date = pd.to_datetime(vid_df.publish_date)
        return vid_df
    else:
        return None

def store_video_data(vid_df: pd.DataFrame, data_path=VIDEO_DATA_PATH):
    vid_df.publish_date = vid_df.publish_date.apply(lambda x : x.isoformat())
    vid_df.to_json(VIDEO_DATA_PATH, indent=4, orient="index")

def parse_video_file(video_path):
    match = re.match(r"\[(.+)\]_(.*)", os.path.basename(video_path))
    return {
        'id': match.group(1),
        'video_name': match.group(2)
    }

#def list_all_video_files(video_download_dir=VIDEO_DOWNLOAD_DIRECTORY):
    #downloaded_ids = glob(os.path.join(video_download_dir, "*.mp4"))
    #downloaded_ids = set(map(lambda x: x['id'], downloaded_ids))

In [74]:
# load video_data if it already exists
import pandas as pd
from IPython.display import display

vid_df = load_video_data(VIDEO_DATA_PATH)
if not vid_df is None:
    display(vid_df.head(5))

Unnamed: 0_level_0,publish_date,video_title,video_description,channel_title,video_tags,view_count,like_count,comments_count,channel_subs_count,channel_view_count,channel_vid_count,channel_id,query
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
S-nHYzK-BVg,2017-09-25 13:12:22+00:00,Beginner's Guide to Microsoft Word,"If you like this video, here's my entire playl...",Technology for Teachers and Students,"[microsoft word, word tutorial, using ms word,...",6973801,88277.0,1891.0,1480000,141834118,522,UCYUPLUCkMiUgiyVuluCc7tQ,windows tutorial
DHq3bqowzW0,2018-08-31 01:32:47+00:00,How to Clean C Drive In Windows 10 (Make Your...,"This video shows you, How to Clean C Drive (Lo...",Geeks Tutorial,"[how to clean local disk c windows 10, how to ...",9910893,202666.0,8536.0,558000,84677874,109,UCU1K6P1M8hq-TBnDlHQtO7A,windows tutorial
EJHKuwBhdB4,2024-02-07 14:45:00+00:00,How to Create a Windows 10 Installation USB wi...,"In this tutorial video, I'll show you how to c...",Memory's Tech Tips,"[how to create a windows 10 installation usb, ...",29995,354.0,38.0,13100,3192330,394,UCpFxsy-mzKIIX14aOH-veXg,windows tutorial
fFe3iESppag,2022-04-13 22:32:33+00:00,How to Clean C Drive In Windows 11 (Make Your ...,"This video shows you, How to Clean C Drive (Lo...",Geeks Tutorial,"[how to clean local disk c windows 11, how to ...",440892,7991.0,291.0,558000,84677874,109,UCU1K6P1M8hq-TBnDlHQtO7A,windows tutorial
ttiA0zRbzko,2018-05-05 15:19:14+00:00,How to Speed Up Your Windows 10 Performance! (...,"This video shows you, How to speed up any Wind...",Geeks Tutorial,"[how to speed up windows 10, speed up windows ...",4112471,85921.0,4556.0,558000,84677874,109,UCU1K6P1M8hq-TBnDlHQtO7A,windows tutorial


## Search youtube 

In [56]:
import yaml

queries = [
  "windows tutorial",
  "linux tutorial",
  "macos tutorial",
  "pandas python tutorial",
  "how to create an account on tutorial",
  "how to download tutorial computer",
  "PowerPoint presentation tutorial",
  "Git and GitHub tutorial",
  "PowerPoint presentation tutorial",
  "Building a website with WordPress tutorial",
  "Creating macros in Excel tutorial",
  "CAD design with AutoCAD tutorial",
  "Animating with Adobe Animate tutorial",
  "how to install C++ tutorial",
  "how to install java tutorial",
  "GIMP photo manipulation tutorial",
  "Creating music with FL Studio tutorial",
  "Creating animations in Maya tutorial",
  "Configuring a VPN tutorial",
  "Audacity tutorial",
  "Learning to code with Scratch tutorial",
  "zoom online meetings tutorial",
  "how to use zoom tutorial"
]

with open(DEFAULT_SECRETS_DIR, 'r') as secrets_file:
    secrets = yaml.safe_load(secrets_file)

yt_api_key = secrets['youtube_api_key']
videos = []

with build('YouTube', "v3", developerKey=yt_api_key) as yt:
    for query in queries:

        if query in list(vid_df['query'].unique()): # check if query already used
            continue

        search_result_list = yt.search().list(
            part='snippet',
            publishedAfter=PUBLISHED_AFTER,
            order='relevance',
            q=query,
            type='video',
            videoCategoryId='26',
            videoCaption="closedCaption",
            maxResults=MAX_PAGE_SIZE,
            relevanceLanguage='en',
        ).execute()

        video_ids = [search_result['id']['videoId'] for search_result in search_result_list['items']]
        video_results = yt.videos().list(part=['id', 'snippet', 'statistics'], id=video_ids).execute()
        video_results_dict = {video_data['id']:video_data for video_data in video_results['items']}

        channel_ids = [video['snippet']['channelId'] for video in video_results['items']]
        channel_results = yt.channels().list(part=['statistics'], id=channel_ids).execute()
        channel_results_dict = {channel_data['id']:channel_data for channel_data in channel_results['items']}

        for video_id, channel_id in zip(video_ids, channel_ids):
            video_data = video_results_dict[video_id]
            channel_data = channel_results_dict[channel_id]
            videos.append(
                {
                    "id": video_data['id'],
                    "publish_date": video_data['snippet'].get('publishedAt', None),
                    "video_title": video_data['snippet'].get('title', None),
                    "video_description": video_data['snippet'].get('description', None),
                    "channel_title": video_data['snippet'].get('channelTitle', None),
                    "video_tags": video_data['snippet'].get('tags', None),
                    "view_count" : video_data['statistics'].get('viewCount', None),
                    "like_count" : video_data['statistics'].get('likeCount', None),
                    "comments_count": video_data['statistics'].get('commentCount', None),
                    "channel_subs_count": channel_data['statistics'].get('subscriberCount', None),
                    "channel_view_count": channel_data['statistics'].get('viewCount', None),
                    "channel_vid_count": channel_data['statistics'].get('videoCount', None),
                    "channel_id": channel_id,
                    "query": query
                }
            )

In [75]:
import pandas as pd

# saving video data
vid_df_new = pd.DataFrame(videos).drop_duplicates(subset='id', keep='first').set_index('id')
vid_df_new.publish_date = pd.to_datetime(vid_df_new.publish_date)
if not vid_df is None:
    vid_df_new = vid_df_new.loc[~vid_df_new.index.isin(vid_df.index)]
    if(len(vid_df_new) > 0):
        vid_df = pd.concat([vid_df, vid_df_new])
    print(f"{len(vid_df_new)} new videos have been found on youtube.")
    print(f"{len(vid_df)} is the new number of videos in dataset.")

store_video_data(vid_df, VIDEO_DATA_PATH)

0 new videos have been found on youtube.
1053 is the new number of videos in dataset.


## Download the videos

In [76]:
# load video_data if starting notebook from here
vid_df = load_video_data(VIDEO_DATA_PATH)

In [77]:
# Monkey patch to make pytube work again
# pytube is occisionally broken and needs monkey patching
from pytube import YouTube
from pytube.innertube import InnerTube

def bypass_age_gate(self):
        """Attempt to update the vid_info by bypassing the age gate."""
        clients = [
            'ANDROID_EMBED', 'IOS', 'ANDROID', 'WEB_EMBED',
            'ANDROID_EMBED', 'IOS_EMBED', 'WEB_MUSIC',
            'IOS_MUSIC', 'WEB_CREATOR', 'ANDROID_CREATOR',
            'IOS_CREATOR', 'MWEB', 'TV_EMBED', 'WEB'
        ]
        print("Clients List:\n\n", clients,"\n")
        success_client = None
        try:
            for client in clients:
                innertube = InnerTube(
                    client=client,
                    use_oauth=self.use_oauth,
                    allow_cache=self.allow_oauth_cache
                )
                innertube_response = innertube.player(self.video_id)

                playability_status = innertube_response['playabilityStatus'].get('status', None)

                # Print the status of each client
                print(f"Client: {client}, Status: {playability_status}")

                # If the video is accessible, update _vid_info and exit the loop
                if playability_status != 'UNPLAYABLE':
                    self._vid_info = innertube_response
                    success_client = client
                    print(f"Chosen client: {client}")
                    break

        except Exception as e:
            print(f"Error: {e}")

        finally:
            if not success_client:
                print("No successful client found. Performing generic action...")
                # Perform generic action here

YouTube.bypass_age_gate = bypass_age_gate

In [None]:
import re
from glob import glob
from tqdm import tqdm
from pytube import YouTube

# retrieve all downloaded ids
downloaded_ids = glob(os.path.join(VIDEO_DOWNLOAD_DIRECTORY, "*.mp4"))
downloaded_ids = set(map(lambda x: parse_video_file(x)['id'], downloaded_ids))

for vid_id in tqdm(vid_df.index):
    # do not re-download them
    if vid_id in downloaded_ids:
        continue
    try:
        video_handle = YouTube(f"https://www.youtube.com/watch?v={vid_id}", use_oauth=True, allow_oauth_cache=True)

        video_stream = video_handle.streams.filter(
            progressive=True,
            file_extension='mp4',
            resolution='720p'
        ).first()

        if video_stream:
            video_stream.download(filename_prefix=f"[{vid_id}]_", output_path=VIDEO_DOWNLOAD_DIRECTORY, skip_existing=True)
        else:
            continue # TODO logging
    except Exception as e:
        print(e)

### Extend the video data with video details

In [114]:
import cv2
from tqdm import tqdm
# Extend data with video details
# I wanted to the properties of exactly the videos I have downloaded
# this is why i did not use the Youtube API for this

vid_df = load_video_data(VIDEO_DATA_PATH)

downloaded_paths = glob(os.path.join(VIDEO_DOWNLOAD_DIRECTORY, "*.mp4"))
downloaded_paths = dict(map(lambda x: (parse_video_file(x)['id'], x), downloaded_paths))

vid_details = []
for vid_id in tqdm(vid_df.index):
    vid_path = downloaded_paths.get(vid_id, None)
    if vid_path:
        vid_cap = cv2.VideoCapture(vid_path)
        vid_id = parse_video_file(vid_path)['id']
        if (vid_cap.isOpened()):
            fps = vid_cap.get(cv2.CAP_PROP_FPS)
            fr_cnt = int(vid_cap.get(cv2.CAP_PROP_FRAME_COUNT))
            width = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            vid_details.append({
                "id": vid_id,
                "video_available": True,
                "fps": fps,
                "frame_count": fr_cnt,
                "frame_width": width,
                "frame_height": height
            })
        else:
            vid_details.append({
                "id": vid_id,
                "video_available": False,
                "fps": None,
                "frame_count": None,
                "frame_width": None,
                "frame_height": None
            })
        vid_cap.release()
    else:
        vid_details.append({
            "id": vid_id,
            "video_available": False,
            "fps": None,
            "frame_count": None,
            "frame_width": None,
            "frame_height": None
        })

100%|██████████| 1053/1053 [00:07<00:00, 148.36it/s]


In [127]:
vid_details_df = pd.DataFrame(vid_details)
vid_details_df.set_index('id', inplace=True)
print(f"{sum(~vid_details_df['video_available'])} videos were unavailable")
vid_df = vid_df.drop(columns=vid_details_df.columns.intersection(vid_df.columns))
vid_df = pd.merge(vid_df, vid_details_df, left_index=True, right_index=True)
store_video_data(vid_df, VIDEO_DATA_PATH)

40 videos were unavailable


## Download transcripts (WIP)

In [11]:
vid_df = load_video_data(VIDEO_DATA_PATH)

In [12]:
# This is the old code for donwloading code currently doesnt work
# if the library introduces a fix then it can be used again
#from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, NoTranscriptAvailable, TranscriptsDisabled
#import traceback
#from tqdm import tqdm
#
#transcripts = dict()
#
#for vid_id in tqdm(vid_df.index[:3]):
#    try:
#        transcript_list = YouTubeTranscriptApi.list_transcripts(vid_id)
#        ts_handle = transcript_list.find_transcript(['en', 'en-GB', 'en-US', 'de', 'fr','es'])
#        transcript = ts_handle.fetch()
#        transcripts[vid_id] = {'is_generated': ts_handle.is_generated, 'availability': "available", 'message':"", 'transcript': transcript} 
#    except NoTranscriptFound as e:
#        transcripts[vid_id] = {'is_generated': None, 'availability': "not_found", 'message': traceback.format_exc(), 'transcript': None} 
#    except NoTranscriptAvailable as e:
#        transcripts[vid_id] = {'is_generated': None, 'availability': "not_available", 'message': traceback.format_exc(), 'transcript': None} 
#    except TranscriptsDisabled as e:
#        transcripts[vid_id] = {'is_generated': None, 'availability': "disabled", 'message': traceback.format_exc(), 'transcript': None} 

100%|██████████| 3/3 [00:02<00:00,  1.34it/s]


In [61]:
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials

SCOPES = ['https://www.googleapis.com/auth/youtube.force-ssl']
token_path = os.path.join(CACHE_PATH, 'token.json')

if os.path.exists('token.json'):
    creds = Credentials.from_authorized_user_file(token_path, SCOPES)

if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file(GOOGLE_API_CLIENT_SECRETS_FILE, SCOPES)
        creds = flow.run_local_server(port=0)
    # Save the credentials for the next run
    with open(token_path, 'w') as token_file:
        token_file.write(creds.to_json())

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=720733739355-mhedlch5gflcreiuov7i8tt9299e3he8.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A59887%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fyoutube.force-ssl&state=cF9ZRv0tk1tdv9eFPMpRoUfaIGYaDW&access_type=offline


AttributeError: 'NoneType' object has no attribute 'replace'

In [None]:
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build

import os

# The CLIENT_SECRETS_FILE variable specifies the name of a file that contains the OAuth 2.0 information for this application,
# including its client_id and client_secret.
CLIENT_SECRETS_FILE = "client_secret.json"

# This scope allows for full read/write access to the authenticated user's account and requires requests to use an SSL connection.
SCOPES = ['https://www.googleapis.com/auth/youtube.force-ssl']

def get_authenticated_service():
    creds = None
    # The file token.json stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRETS_FILE, SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.json', 'w') as token:
            token.write(creds.to_json())

    return build('youtube', 'v3', credentials=creds)

# Sample python code for channels.list
def channels_list_by_username(service, **kwargs):
    results = service.channels().list(**kwargs).execute()
    print(results)

if __name__ == '__main__':
    # Disable OAuthlib's HTTPS verification when running locally.
    # *DO NOT* leave this option enabled when running in production.
    os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'

    youtube = get_authenticated_service()
    channels_list_by_username(youtube, part='snippet,contentDetails,statistics', forUsername='GoogleDevelopers')

In [None]:
# Using pytube to download transcripts
transcripts = dict()

for vid_id in tqdm(vid_df.index[:3]):
    try:

In [28]:
from pytube import YouTube
video_handle = YouTube(f"https://www.youtube.com/watch?v=Ywsf2c1LQoI", use_oauth=True, allow_oauth_cache=True)

In [33]:
import yaml
with open(DEFAULT_SECRETS_DIR, 'r') as secrets_file:
    secrets = yaml.safe_load(secrets_file)

yt_api_key = secrets['youtube_api_key']

yt = build('YouTube', "v3", developerKey=yt_api_key)


In [53]:
caps = yt.captions().list(part=['snippet'], videoId='zkJkRCZ4_9o').execute()

In [45]:
id = caps['items'][0]['id']

In [57]:
x = yt.captions().download(id=caps['items'][0]['id']).execute()

HttpError: <HttpError 401 when requesting https://youtube.googleapis.com/youtube/v3/captions/AUieDabHDSXVq7oxN-mfbNIAaMMwerVkKGnGTUgzZjhKMtcgMYw?key=AIzaSyCkdTVBCg6mXEEZqDN3fNmxWAqCqgY7TUo returned "API keys are not supported by this API. Expected OAuth2 access token or other authentication credentials that assert a principal. See https://cloud.google.com/docs/authentication". Details: "[{'message': 'Login Required.', 'domain': 'global', 'reason': 'required', 'location': 'Authorization', 'locationType': 'header'}]">

In [56]:
x=x.execute()

HttpError: <HttpError 401 when requesting https://youtube.googleapis.com/youtube/v3/captions/AUieDabHDSXVq7oxN-mfbNIAaMMwerVkKGnGTUgzZjhKMtcgMYw?key=AIzaSyCkdTVBCg6mXEEZqDN3fNmxWAqCqgY7TUo returned "API keys are not supported by this API. Expected OAuth2 access token or other authentication credentials that assert a principal. See https://cloud.google.com/docs/authentication". Details: "[{'message': 'Login Required.', 'domain': 'global', 'reason': 'required', 'location': 'Authorization', 'locationType': 'header'}]">

## Extract images for data cleaning classifier

In [158]:
import cv2
from glob import glob
import os
from tqdm import tqdm
import numpy as np
# Get the duration of all videos

number_of_datapoints = 10000
vid_df = load_video_data(VIDEO_DATA_PATH)
vid_df = vid_df[vid_df['video_available']]
vid_frame_intervals = pd.DataFrame()
vid_frame_intervals['end'] = vid_df['frame_count'].astype(int).cumsum()
vid_frame_intervals['start'] = vid_frame_intervals['end'].shift(1, fill_value=0)
total_frames = sum(vid_df['frame_count'])
choices = pd.Series(np.random.randint(0, total_frames, number_of_datapoints))
#total_frames
vid_frame_intervals['end'] > choices.

ValueError: ('Lengths must match to compare', (1013,), (10000,))