# Scraping YouTube

## Initial Setup

In [15]:
from bs4 import BeautifulSoup
import requests

## Connect to webpage

In [18]:
# URL of the site to scrape
base_site = "https://www.youtube.com/"

# Make the GET request.
# requests applies no timeout by default, so an unresponsive server would
# block the kernel forever; the explicit timeout (in seconds) prevents that.
response = requests.get(base_site, timeout=10)

# Display the response object so the HTTP status code is visible
response

<Response [200]>

In [19]:
# Keep the raw body of the response (a bytes object) for parsing
html = response.content

In [20]:
# Convert the HTML to a BeautifulSoup object.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed and emits a warning, so results could
# differ between environments. "html.parser" ships with Python.
soup = BeautifulSoup(html, "html.parser")

## 1) Scrape the text from each span tag
## 2) How many images are on YouTube's homepage?
## 3) Can you find the URL of the link with title = "Movies"?  Music? Sports?
## 4) Now, try connecting to and scraping https://www.youtube.com/results?search_query=stairway+to+heaven
## a) Can you get the names of the first few videos in the search results?
## b) Next, connect to one of the search result videos - https://www.youtube.com/watch?v=qHFxncb1gRY
## c) Can you find the "related" videos?  What are their titles?  Durations?  URLs? Number of views?
## d) Try finding (and scraping) the Twitter description of the video.

# Scrape the text from each span tag

In [21]:
# Sanity check: look at the first 100 bytes of the reply to confirm
# that it really is an HTML document
html[:100]

b'<!doctype html><html  style="font-size: 10px;font-family: Roboto, Arial, sans-serif;" lang="de-DE" d'

In [22]:
# Convert the HTML to a BeautifulSoup object, which makes it much easier
# to pull content out of the document.
# "html.parser" is used because it is bundled with Python itself.
soup = BeautifulSoup(html, "html.parser")

In [23]:

# Saving the parsed document to disk is handy: the file can be opened to
# look up where a piece of information lives, or to see how the page was parsed.
# NOTE(review): the 'Wiki_' prefix looks like a leftover from another tutorial;
# the page fetched in this notebook is YouTube. Name kept unchanged here.

# Write the prettified document, encoded as UTF-8, in binary mode
with open('Wiki_response.html', 'wb') as html_dump:
    html_dump.write(soup.prettify('utf-8'))

In [24]:

# The soup variable (the BeautifulSoup object created earlier) represents the
# whole parsed document; evaluating it displays the entire page markup
soup

<!DOCTYPE html>
<html dir="ltr" gl="DE" lang="de-DE" style="font-size: 10px;font-family: Roboto, Arial, sans-serif;"><head><meta content="IE=edge" http-equiv="X-UA-Compatible"/><meta content="AhbmRDASY7NuOZD9cFMgQihZ+mQpCwa8WTGdTx82vSar9ddBQbziBfZXZg+ScofvEZDdHQNCEwz4yM7HjBS9RgkAAABneyJvcmlnaW4iOiJodHRwczovL3lvdXR1YmUuY29tOjQ0MyIsImZlYXR1cmUiOiJXZWJDb21wb25lbnRzVjAiLCJleHBpcnkiOjE2MDM0ODY4NTYsImlzU3ViZG9tYWluIjp0cnVlfQ==" data-expires="2020-10-23" data-feature="Web Components V0" http-equiv="origin-trial"/><meta content="Av2+1qfUp3MwEfAFcCccykS1qFmvLiCrMZ//pHQKnRZWG9dldVo8HYuJmGj2wZ7nDg+xE4RQMQ+Ku1zKM3PvYAIAAABmeyJvcmlnaW4iOiJodHRwczovL2dvb2dsZS5jb206NDQzIiwiZmVhdHVyZSI6IldlYkNvbXBvbmVudHNWMCIsImV4cGlyeSI6MTYwMzgzNjc3MiwiaXNTdWJkb21haW4iOnRydWV9" data-expires="2020-10-27" data-feature="Web Components V0" http-equiv="origin-trial"/><meta content="AixUK+8UEShlt6+JX1wy9eg+XL+eV5PYSEDPH3C90JNVbIkE1Rg1FyVUfu2bZ/y6Pm1xbPLzuwHYHjv4uKPNnA4AAABqeyJvcmlnaW4iOiJodHRwczovL2dvb2dsZXByb2QuY29tOjQ0My

In [25]:
# find() returns None when no matching tag exists
# Note: IPython shows nothing for a None result unless print() or repr() is used
soup.find('video')

In [26]:
# Print the result explicitly so that the None value becomes visible
print(soup.find('video'))

None


In [27]:

# Verify the type of the result of an unsuccessful find()
type(soup.find('video'))

NoneType

In [28]:
# find() returns only the first matching tag, here the first <a> element
soup.find('a')

<a href="https://www.youtube.com/about/" slot="guide-links-primary" style="display: none;">Über YouTube</a>

In [29]:

# To get every match instead of just the first one, use find_all()
links = soup.find_all('a')
links

[<a href="https://www.youtube.com/about/" slot="guide-links-primary" style="display: none;">Über YouTube</a>,
 <a href="https://www.youtube.com/about/press/" slot="guide-links-primary" style="display: none;">Presse</a>,
 <a href="https://www.youtube.com/about/copyright/" slot="guide-links-primary" style="display: none;">Urheberrecht</a>,
 <a href="/t/contact_us" slot="guide-links-primary" style="display: none;">Kontakt</a>,
 <a href="https://www.youtube.com/creators/" slot="guide-links-primary" style="display: none;">Creator</a>,
 <a href="https://www.youtube.com/ads/" slot="guide-links-primary" style="display: none;">Werbung</a>,
 <a href="https://developers.google.com/youtube" slot="guide-links-primary" style="display: none;">Entwickler</a>,
 <a dir="ltr" href="https://www.youtube.com/t/impressum?hl=de&amp;gl=DE" slot="guide-links-primary" style="display: none">Impressum</a>,
 <a dir="ltr" href="https://transparencyreport.google.com/netzdg/youtube" slot="guide-links-primary" style="d

In [31]:

# The result of find_all() passes an isinstance check against list,
# so it can be handled like any other list (len(), indexing, iteration)
isinstance(links, list)

True

In [32]:

# Be careful with find_all(): when nothing matches it returns an empty
# list rather than None, so test for emptiness instead of comparing to None
soup.find_all('video')


[]

In [33]:

# Count the <a> tags collected in links
len(links)

15

In [34]:
# Usually the result of a search is stored in a variable for later use.
# Here we look for the body of a table (<tbody>).
# Note: on this page the lookup finds nothing, so table ends up being None.
table = soup.find('tbody')

In [35]:
# Inspect the value of the variable (nothing is displayed when it is None)
table

In [36]:
# Inspect the type of the variable to confirm what find() returned
type(table)

NoneType

In [39]:
# Filter by tag name and by the value of the id attribute.
# NOTE(review): 'siteSub' does not appear in the YouTube markup shown in this
# notebook and looks like a leftover from another site - confirm.
soup.find('div', id = 'siteSub')

In [41]:
# We can filter against multiple attributes at once.
# class_ (with a trailing underscore) is used because class is a reserved
# word in Python.
# NOTE(review): these selector values look like leftovers from another site - confirm.
soup.find('a', class_ = 'mw-jump-link', href = '#p-search')

In [42]:

# The same filter can be expressed by writing the attributes in a dictionary
# and passing it through the attrs argument
soup.find('a', attrs={ 'class':'mw-jump-link', 'href':'#p-search' })

In [43]:
# The attribute dictionary can also be given as the second positional argument
soup.find('div', {'id' : 'footer'})

# Downloading the search results page

In [57]:
# URL of the YouTube search results page for the query "stairway to heaven"
url = "https://www.youtube.com/results?search_query=stairway+to+heaven"

In [62]:
# Make the GET request for the search results page.
# requests applies no timeout by default, so an unresponsive server would
# block the kernel forever; the explicit timeout (in seconds) prevents that.
response = requests.get(url, timeout=10)

# Display the response object so the HTTP status code is visible
response

<Response [200]>

In [63]:
# Keep the raw body of the search response (a bytes object) for parsing
html = response.content

In [64]:
# Convert the HTML to a BeautifulSoup object.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed and emits a warning, so results could
# differ between environments. "html.parser" ships with Python.
soup = BeautifulSoup(html, "html.parser")

In [65]:
# Sanity check: look at the first 100 bytes of the reply to confirm
# that it really is an HTML document
html[:100]

b'<!doctype html><html  style="font-size: 10px;font-family: Roboto, Arial, sans-serif;" lang="de-DE" d'

In [66]:

# Convert the HTML to a BeautifulSoup object, which makes it much easier
# to pull content out of the document.
# "html.parser" is used because it is bundled with Python itself.
soup = BeautifulSoup(html, "html.parser")

In [67]:

# Collect every <a> tag on the search results page and display them
links = soup.find_all('a')
links

[<a href="https://www.youtube.com/about/" slot="guide-links-primary" style="display: none;">Über YouTube</a>,
 <a href="https://www.youtube.com/about/press/" slot="guide-links-primary" style="display: none;">Presse</a>,
 <a href="https://www.youtube.com/about/copyright/" slot="guide-links-primary" style="display: none;">Urheberrecht</a>,
 <a href="/t/contact_us" slot="guide-links-primary" style="display: none;">Kontakt</a>,
 <a href="https://www.youtube.com/creators/" slot="guide-links-primary" style="display: none;">Creator</a>,
 <a href="https://www.youtube.com/ads/" slot="guide-links-primary" style="display: none;">Werbung</a>,
 <a href="https://developers.google.com/youtube" slot="guide-links-primary" style="display: none;">Entwickler</a>,
 <a dir="ltr" href="https://www.youtube.com/t/impressum?hl=de&amp;gl=DE" slot="guide-links-primary" style="display: none">Impressum</a>,
 <a dir="ltr" href="https://transparencyreport.google.com/netzdg/youtube" slot="guide-links-primary" style="d

In [73]:
# Some <a> tags may carry no href attribute; .get('href') yields None for those.

# Keep only the links that actually have an href attribute.
# `is not None` is the idiomatic identity test for None (PEP 8), and the
# loop variable is spelled out because a bare `l` is easily misread as 1 or I.
clean_links = [link for link in links if link.get('href') is not None]
clean_links

[<a href="https://www.youtube.com/about/" slot="guide-links-primary" style="display: none;">Über YouTube</a>,
 <a href="https://www.youtube.com/about/press/" slot="guide-links-primary" style="display: none;">Presse</a>,
 <a href="https://www.youtube.com/about/copyright/" slot="guide-links-primary" style="display: none;">Urheberrecht</a>,
 <a href="/t/contact_us" slot="guide-links-primary" style="display: none;">Kontakt</a>,
 <a href="https://www.youtube.com/creators/" slot="guide-links-primary" style="display: none;">Creator</a>,
 <a href="https://www.youtube.com/ads/" slot="guide-links-primary" style="display: none;">Werbung</a>,
 <a href="https://developers.google.com/youtube" slot="guide-links-primary" style="display: none;">Entwickler</a>,
 <a dir="ltr" href="https://www.youtube.com/t/impressum?hl=de&amp;gl=DE" slot="guide-links-primary" style="display: none">Impressum</a>,
 <a dir="ltr" href="https://transparencyreport.google.com/netzdg/youtube" slot="guide-links-primary" style="d