## Data Scraping using the YT API

This notebook contains the different approaches to scraping video IDs, titles and thumbnail URLs.

In [2]:
from googleapiclient.discovery import build
import pandas as pd
import time



# Open the key file and read in the API key.
# The content is stripped because a trailing newline in the file would otherwise
# become part of the key and be sent (URL-encoded as %0A) with every request.

with open("key", "r") as myfile:
    api_key = myfile.read().strip()





### Searching for Videos by Keyword

In [80]:
# Collect YouTube video IDs and titles through the search endpoint of the YT API.
# Quota cost: 100 per search (every next-page request is billed as another search).

searchData = []

# Open the Google API service with the API key; the service is closed on leaving the block.
with build('youtube', 'v3', developerKey=api_key) as youtube:
    request = youtube.search().list(
        part='id,snippet',
        q='try not to laugh',
        fields='nextPageToken,items(id,snippet)',
        maxResults=50,
    )

    print('Start')
    page_no = 0
    # Keep paging until list_next() yields no further request or 40 pages were read.
    while request and page_no < 40:
        response = request.execute()
        print('page: ' + str(page_no))

        for item in response['items']:
            # Skip every search hit that is not a video.
            if item['id']['kind'] != 'youtube#video':
                continue

            video_id = item['id']['videoId']
            thumbnail_prefix = 'https://i.ytimg.com/vi/' + video_id
            searchData.append([
                video_id,
                item['snippet']['title'],
                thumbnail_prefix + '/default.jpg',
                thumbnail_prefix + '/mqdefault.jpg',
                thumbnail_prefix + '/hqdefault.jpg',
            ])

        request = youtube.search().list_next(request, response)
        page_no += 1

    print('done')

# Write everything that was collected to a CSV named after the current Unix time.
print('saving...')
search_frame = pd.DataFrame(
    searchData,
    columns=['ID', 'TITLE', 'THUMBNAIL1', 'THUMBNAIL2', 'THUMBNAIL3'],
)
csv_name = 'searchData' + str(round(time.time())) + '.csv'
search_frame.to_csv(csv_name, index=False)
print('saved...')

Start
page: 0
page: 1
page: 2
page: 3
page: 4
page: 5
page: 6
page: 7
page: 8
page: 9
page: 10
page: 11
page: 12
page: 13
page: 14
done
saving...
saved...


In [10]:
# Peek at the first ten tags in the snippet of the second item of the current `response`.
# NOTE(review): depends on a `response` left in the kernel by an earlier cell run —
# confirm which request produced it before relying on this cell after a restart.
response['items'][1]['snippet']['tags'][:10]
#response['items'][0]['id']

['Popular',
 'Songs',
 'Mix',
 '2021',
 'Camila Cabello',
 'Ed Sheeran',
 'Kygo',
 'Style',
 'Chill',
 'Chill Out']

In [83]:
# Request the first two entries of one playlist and print the raw response
# so its structure can be inspected.
with build('youtube', 'v3', developerKey=api_key) as youtube:
    request = youtube.playlistItems().list(
        part='id,snippet',
        playlistId='PLv3TTBr1W_9tppikBxAE_G6qjWdBljBHJ',
        maxResults=2,
    )
    response = request.execute()
    print(response)

{'kind': 'youtube#playlistItemListResponse', 'etag': 'dTfadgPsawtdlsWoj7taUJ9HzBU', 'nextPageToken': 'EAAaBlBUOkNBSQ', 'items': [{'kind': 'youtube#playlistItem', 'etag': '4PP9KMdQMmkviHBYOzydnPGQkx4', 'id': 'UEx2M1RUQnIxV185dHBwaWtCeEFFX0c2cWpXZEJsakJISi5DNUNCNkIzMEM5Mjg5NkRB', 'snippet': {'publishedAt': '2021-11-14T17:33:25Z', 'channelId': 'UCpzml8N7xz3G_efuqWxY_YQ', 'title': 'Mr. Bean as Palpatine', 'description': 'SUBSCRIBE for more memes! \n\nPalpatine got something else on his mind \n\nSOCIALS\nInstagram: https://www.instagram.com/jonkarip/\nTwitter:  https://twitter.com/JonkariP\nDiscord: https://discord.gg/WAn8DKT\n#sequelmeme #meme #shorts\n\nBlue Danube (by Strauss) by Strauss\nCreative Commons — Attribution 3.0 Unported— CC BY 3.0\nhttps://creativecommons.org/licenses/...\nMusic provided by FreeMusic109 https://youtube.com/FreeMusic109', 'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/FItHgP_l234/default.jpg', 'width': 120, 'height': 90}, 'medium': {'url': 'https://

In [91]:
# Print title and video ID for every item of the current playlist response.
for result in response['items']:
    snippet = result['snippet']
    print('Title: ' + snippet['title'])
    print('Id: ' + snippet['resourceId']['videoId'])

Title: Mr. Bean as Palpatine
Id: FItHgP_l234
Title: Super Weedio Bros. | PhillipLeeX
Id: hVp12W4NU4Q


### Scraping videos from playlist without search

In [95]:
# Scraping youtube video id and title using the YT Api playlistItems function.
# Quota cost: 1 per call (each next page request counts as a call)
# Downside: needs playlist id to work

searchData = []
with build('youtube', 'v3', developerKey=api_key) as youtube:
    request = youtube.playlistItems().list(part='id,snippet',
                                           playlistId='PLLGmt3bXA_93pvHgKm7dbEvW410pDFKKl',
                                           fields = 'nextPageToken,items(id,snippet)',
                                           maxResults = 50
                                           )

    i = 0
    print('Start')
    # list_next() returns None once the response carries no nextPageToken,
    # which ends the loop after the last page of the playlist.
    while request:
        response = request.execute()
        print('page: ' + str(i))

        for result in response['items']:
            # For playlist items the video ID is found under snippet.resourceId.
            vidId = result['snippet']['resourceId']['videoId']
            searchData.append([vidId, result['snippet']['title'],
                               'https://i.ytimg.com/vi/' + vidId + '/default.jpg',
                               'https://i.ytimg.com/vi/' + vidId + '/mqdefault.jpg',
                               'https://i.ytimg.com/vi/' + vidId + '/hqdefault.jpg']
                              )

        # Page with the helper of the same resource that created the request
        # (playlistItems), not the one belonging to search().
        request = youtube.playlistItems().list_next(
            request, response)
        i += 1

    print('done')

# The file name carries the current Unix time (rounded to seconds).
print('saving...')
pd.DataFrame(searchData, columns = ['ID', 'TITLE', 'THUMBNAIL1','THUMBNAIL2','THUMBNAIL3']).to_csv('searchData' + str(round(time.time())) + '.csv', index=False)
print('saved...')

Start
page: 0
page: 1
page: 2
page: 3
page: 4
page: 5
page: 6
page: 7
page: 8
page: 9
page: 10
page: 11
page: 12
page: 13
page: 14
page: 15
page: 16
page: 17
page: 18
page: 19
page: 20
page: 21
page: 22
page: 23
page: 24
page: 25
page: 26
page: 27
page: 28
page: 29
page: 30
page: 31
page: 32
page: 33
page: 34
page: 35
page: 36
page: 37
page: 38
page: 39
page: 40
page: 41
page: 42
page: 43
page: 44
page: 45
page: 46
page: 47
page: 48
page: 49
page: 50
page: 51
page: 52
page: 53
page: 54
page: 55
page: 56
page: 57
page: 58
page: 59
page: 60
page: 61
page: 62
page: 63
page: 64
page: 65
page: 66
page: 67
page: 68
page: 69
page: 70
page: 71
page: 72
page: 73
page: 74
page: 75
page: 76
page: 77
page: 78
page: 79
page: 80
page: 81
page: 82
page: 83
page: 84
page: 85
page: 86
page: 87
page: 88
done
saving...
saved...


### Searching for playlists and then extracting videos

In [3]:
# Improving the search efficiency by searching for playlists
# and then scraping all videos from the resulting playlists.
# Search quota: 100 for up to 50 playlists;
# in each playlist 1 quota point per page of up to 50 videos.

# already scraped: vlogs, cats, dogs, music, cars, fishing, restoration, tutorial,tech,
#                  travel, asmr, lifestyle, Beauty, DIY, learning

playlistQuery = 'learning'


# searching for playlists
with build('youtube', 'v3', developerKey=api_key) as youtube:

    # type = playlist, q = search word, order = videoCount to get large playlists
    request = youtube.search().list(part='id,snippet', q=playlistQuery, maxResults = 50,
                                    type = 'playlist', order = 'videoCount')

    response = request.execute()

# saving all playlist ids (together with their titles) in a list

playlists = []

for result in response['items']:
    playlists.append([result['id']['playlistId'], result['snippet']['title']])


# scraping all videos of each playlist

for playlist in playlists:

    playlistId = playlist[0]

    print(playlist[1], playlist[0])

    # collected rows of the current playlist only; reset for every playlist
    searchData = []

    with build('youtube', 'v3', developerKey=api_key) as youtube:

        # requesting 50 videos each
        request = youtube.playlistItems().list(part='id,snippet',
                                               playlistId=playlistId,
                                               fields = 'nextPageToken,items(id,snippet)',
                                               maxResults = 50
                                               )

        i = 0
        print('Start')

        # requesting new pages as long as there is a return on the request
        while request:
            try:
                response = request.execute()
                print('page: ' + str(i))

                for result in response['items']:

                    # saving the relevant details
                    vidId = result['snippet']['resourceId']['videoId']
                    searchData.append([vidId, result['snippet']['title'],
                                       'https://i.ytimg.com/vi/' + vidId + '/default.jpg',
                                       'https://i.ytimg.com/vi/' + vidId + '/mqdefault.jpg',
                                       'https://i.ytimg.com/vi/' + vidId + '/hqdefault.jpg']
                                      )

                # page with the helper of the resource that created the request
                request = youtube.playlistItems().list_next(
                    request, response)
                i += 1
            except Exception as error:
                # Without leaving the loop the unchanged request would be retried
                # forever (e.g. once the quota is used up). Catching Exception instead
                # of everything also keeps the cell interruptible from the keyboard.
                # Only the exception type is printed: the message of an HTTP error
                # contains the request URL including the API key.
                print('Some kind of Error.', type(error).__name__)
                break
        print('done')

    # writing to disk after each playlist to not lose everything in case of an error;
    # the playlist id in the file name keeps two playlists that finish within the
    # same second from overwriting each other
    print('saving...')
    pd.DataFrame(searchData, columns = ['ID', 'TITLE', 'THUMBNAIL1','THUMBNAIL2','THUMBNAIL3']).to_csv('ScrapedData/searchData' + str(round(time.time())) + '_' + playlistId + '.csv', index=False, sep=",")
    print('saved...')

        

HttpError: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/search?part=id%2Csnippet&q=learning&maxResults=50&type=playlist&order=videoCount&key=REDACTED%0A&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">