# YouTube Channel Scraping

Using this code you can scrape a YouTube channel such as Netflix, TEDx Talks, The White House, and more. The code saves the details of every scraped video to a CSV file.

In [1]:
from bs4 import BeautifulSoup
import re, requests, csv, time
from requests import get
    
# Name of the YouTube channel to scrape; set it here before running.
# The value is interpolated into the channel URLs requested by the functions
# below and is also used as the base name of the CSV file that gets written.
channel_name = "whitehouse"

def get_soup(url, delay=5, timeout=30):
    """Download ``url`` and return its parsed HTML.

    Args:
        url: Absolute URL of the page to fetch.
        delay: Seconds to pause after every request, so that pages are
            requested at roughly human speed (the original author's note
            cites YouTube's terms of use as the reason).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend, or
        ``None`` when the request fails or the response status is not 200.
    """
    try:
        result = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Connection errors and timeouts are reported the same way as a
        # non-200 status: "no page available".
        result = None

    # Throttle after every attempt, including failed ones, so error responses
    # are not retried at machine speed by the calling loops.
    time.sleep(delay)

    if result is None or result.status_code != 200:
        return None
    return BeautifulSoup(result.text, 'html.parser')

def channel_section_links():
    """Return the sections found on the channel's playlists page.

    The channel is taken from the module-level ``channel_name``.

    Returns:
        A list of ``{'title': str, 'link': str}`` dicts, one per section
        link. When the page cannot be fetched or contains no section links,
        a single placeholder entry titled ``'no sections'`` that points at
        the channel's default playlists URL is returned instead.
    """
    # Go through get_soup (rather than a bare request) so the status check
    # and request throttling applied to every other page apply here too.
    html_soup = get_soup(f"https://www.youtube.com/user/{channel_name}/playlists")

    elements = []
    if html_soup is not None:
        # Escape the channel name so regex metacharacters in it match literally.
        href_pattern = re.compile(re.escape(f"{channel_name}/playlists"))
        play_list_atags = html_soup.find_all('a', {'href': href_pattern})
        elements = [
            {'title': x.text.strip(), 'link': fix_url(x['href'])}
            for x in play_list_atags
            if x.span and ('shelf_id=0' not in x['href'])  # filter out play lists
        ]
    print("printing elements")

    if not elements:  # no sections, make up no sections section with default link
        elements = [{'title': 'no sections',
                     'link': f'https://youtube.com/{channel_name}/playlists'}]

    return elements


def fix_url(url):
    """Turn a site-relative YouTube href into an absolute URL.

    Args:
        url: An href value as found in the page markup.

    Returns:
        ``'https:' + url`` for protocol-relative hrefs (``//host/path``),
        ``'https://www.youtube.com' + url`` for hrefs starting with a single
        ``/``, and ``url`` unchanged otherwise (including the empty string).
    """
    if url.startswith('//'):
        # Protocol-relative link: it already names a host, only the scheme is missing.
        return 'https:' + url
    if url.startswith('/'):  # startswith() is safe on '', unlike url[0]
        return 'https://www.youtube.com' + url
    return url


def get_playlists(section, selection=slice(1, 2)):
    """Return the playlists linked from a channel section page.

    Args:
        section: Dict with at least ``'title'`` and ``'link'`` keys, as
            produced by ``channel_section_links``.
        selection: Slice applied to the list of playlists found, to limit
            how many are scraped. The default, ``slice(1, 2)``, keeps only
            the second playlist, which is what this function has always
            done. Pass ``slice(None)`` to keep every playlist.

    Returns:
        A list of ``{'title': str, 'link': str}`` dicts. When the section
        page cannot be fetched or lists no playlists, a single placeholder
        entry titled ``'No Playlists'`` pointing at the channel's videos
        page is returned. Note that ``selection`` can yield an empty list
        when fewer playlists exist than the slice expects.
    """
    print(f"  getting playlists for section: {section['title']}")

    # no playlist, create dummy playlist and default link
    fallback = [{'title': 'No Playlists',
                 'link': f'https://youtube.com/{channel_name}/videos'}]

    soup = get_soup(section['link'])
    if soup is None:
        return fallback

    playlists = []
    for a in soup('a', class_="yt-uix-tile-link"):  # find title and link
        # Strip surrounding whitespace so the comparison below is reliable and
        # titles are formatted like the section and video titles.
        title = a.text.strip()
        if title != "Liked videos":  # skip these
            playlists.append({'title': title, 'link': fix_url(a['href'])})

    if not playlists:
        return fallback

    return playlists[selection]

def _node_text(node, default='NA'):
    # Text of a parsed tag, or `default` when the tag was not found on the page.
    return default if node is None else node.text


def _button_count(vsoup, button_class):
    # Count shown on a like/dislike button; 0 when the button or its label is absent or empty.
    button = vsoup.find('button', class_=button_class)
    label = None if button is None else button.find('span', class_='yt-uix-button-content')
    return _node_text(label, '') or 0


def _video_details(vsoup, link):
    # Extract views, date, description, short link and like/dislike counts from a parsed video page.
    details = {}

    views = _node_text(vsoup.find("div", class_='watch-view-count'), '')
    details['views'] = ''.join(c for c in views if c in "0123456789")

    published = _node_text(vsoup.find('strong', class_="watch-time-text"))
    prefix = 'Published on '
    if published.startswith(prefix):
        # Drop the whole prefix, including its trailing space.
        published = published[len(prefix):]
    details['publication_date'] = published.strip()

    details['description'] = _node_text(vsoup.find('div', id='watch-description-text'))

    meta = vsoup.find('meta', itemprop="videoId")
    video_id = None if meta is None else meta.get('content')
    # Fall back to the full link when the page does not expose a video id.
    details['short_link'] = f'https://youtu.be/{video_id}' if video_id else link

    details['likes'] = _button_count(vsoup, "like-button-renderer-like-button")
    details['dislikes'] = _button_count(vsoup, 'like-button-renderer-dislike-button')
    return details


def add_videos(playlist):
    """Scrape every video of a playlist and store the results on the playlist.

    Args:
        playlist: Dict with at least ``'title'`` and ``'link'`` keys. It is
            modified in place: ``playlist['videos']`` is set to a list of
            dicts with the keys ``title``, ``link``, ``time``, ``views``,
            ``publication_date``, ``description``, ``short_link``, ``likes``
            and ``dislikes``.

    The ``'videos'`` key is always set, to an empty list when the playlist
    page cannot be fetched or yields no usable videos, so code that later
    walks ``playlist['videos']`` never hits a missing key. Private and
    deleted videos, links without an href, and videos whose page cannot be
    fetched are skipped.
    """
    soup = get_soup(playlist['link'])
    print(f"    getting videos for playlist: {playlist['title']}")

    videos = []
    # items are list of video a links from list
    items = soup('a', class_="yt-uix-tile-link") if soup is not None else []
    for i in items:
        title = i.text.strip()
        if title in ('[Private video]', '[Deleted video]'):  # nothing to scrape for these
            continue
        href = i.get('href')
        if not href:
            continue

        link = fix_url(href)
        d = {'title': title, 'link': link}  # collect video info in dict
        t = i.find_next('span', {'aria-label': True})
        d['time'] = _node_text(t)
        print(f"      open video '{d['title']}' for details", end=" ")  # printing the title of video

        vsoup = get_soup(link)  # now get video page and pull information from it
        if vsoup is None:
            print("* could not read video page, skipped")
            continue
        print("* read, now processing", end="")

        d.update(_video_details(vsoup, link))
        videos.append(d)
        print("* finished video")

    # Assigned once, after the loop, so the key exists even when no video was scraped.
    playlist['videos'] = videos

def tag(t, c):
    """Wrap content ``c`` in an HTML element named ``t`` and return the markup."""
    return '<{0}>{1}</{0}>'.format(t, c)
def link(text, url):
    """Return an HTML ``a`` tag that shows ``text`` and points at ``url``.

    The previous body returned only ``text`` and ignored ``url``, which
    contradicted its own description ("return a tag with content and link").
    """
    return f'<a href="{url}">{text}</a>'


def csv_out(channel, sections):
    """Write one CSV row per scraped video to ``<channel>.csv``.

    Args:
        channel: Channel name; used as the first column of every row and as
            the base name of the output file.
        sections: List of section dicts. Each has a ``'title'`` and a
            ``'playlists'`` list; each playlist has a ``'title'`` and,
            optionally, a ``'videos'`` list of video dicts with the keys
            ``title``, ``short_link``, ``time``, ``views``,
            ``publication_date``, ``likes``, ``dislikes`` and
            ``description``. Playlists without ``'videos'`` add no rows.

    The file is meant for import into a spreadsheet or database.
    """
    headers = 'channel,section,playlist,video,' \
              'link,time,views,publication date,likes,dislikes,description'.split(',')

    # newline='' lets the csv module control line endings itself; without it
    # rows can be separated by blank lines on platforms that translate '\n'.
    with open(f'{channel}.csv', "w", encoding="utf-8", newline='') as csv_file:
        csvf = csv.writer(csv_file, delimiter=',')
        csvf.writerow(headers)
        for section in sections:
            for playlist in section['playlists']:
                for v in playlist.get('videos', []):
                    csvf.writerow([
                        channel, section['title'], playlist['title'], v['title'],
                        v['short_link'], v['time'], v['views'], v['publication_date'],
                        v['likes'], v['dislikes'], v['description'],
                    ])

if __name__ == '__main__':
    # Walk the channel top-down: sections -> playlists -> videos. Each level
    # is attached to its parent dict so csv_out can flatten the tree afterwards.
    print("finding sections")
    sections = channel_section_links()
    for section in sections:
        section['playlists'] = get_playlists(section)
        for playlist in section['playlists']:
            add_videos(playlist)

    csv_out(channel_name, sections)  # create a csv file of video info for import into spreadsheet

    # Only the CSV file is produced by this script, so the message no longer
    # claims that a '.htm' file was written as well.
    print(f"Program Complete,\n  '{channel_name}.csv' has been written to current directory")

finding sections
printing elements
  getting playlists for section: no sections
    getting videos for playlist: Vice President Pence
      open video 'Vice President Pence Delivers Remarks at the 37th Annual National Peace Officers' Memorial Service' for details * read, now processing* finished video
      open video 'Vice President Pence Delivers Remarks During a Protocolary Meeting at the OAS' for details * read, now processing* finished video
      open video 'Vice President Pence Delivers Remarks at the National Rifle Association Leadership Forum' for details * read, now processing* finished video
      open video 'Vice President Pence Participates in the Swearing-in Ceremony of the U.S. Ambassador to Germany' for details * read, now processing* finished video
      open video 'Swearing-In Ceremony for the U.S. Permanent Representative to the Organization of American States' for details * read, now processing* finished video
      open video 'Vice President Pence Holds a Lunch in 