
# YouTube - Scraper



In [24]:
import asyncio
import os
import time

from fake_useragent import UserAgent
from pyppeteer import launch

# for extended documentation visit --> https://miyakogi.github.io/pyppeteer/
# !!! function could only be called with await !!!
async def scrape(url_: str, selector_: str, page_function_ = "(element) => element.outerHTML", log_ = True,
                 ready_selector_ = "h1.title", scroll_wait_ = 5, settle_wait_ = 3):
    """Open ``url_`` in a headless browser and evaluate ``page_function_`` on ``selector_``.

    Must be awaited (e.g. ``result = await scrape(...)`` in a notebook cell).

    Args:
        url_: Address passed to ``page.goto``.
        selector_: CSS selector that is waited for, clicked, and handed to
            ``page.querySelectorEval``.
        page_function_: JavaScript function source evaluated against the
            element matched by ``selector_``.
        log_: When truthy, progress messages are printed.
        ready_selector_: Selector that is waited for and clicked right after
            navigation, before the ``End`` key is pressed.
        scroll_wait_: Seconds to pause before pressing ``End``.
        settle_wait_: Seconds to pause after ``selector_`` has appeared and
            before it is clicked.

    Returns:
        Whatever ``page.querySelectorEval(selector_, page_function_)`` yields.

    Raises:
        Any exception raised by the browser calls is propagated; the browser
        is closed first.
    """
    def _log(*parts):
        if log_:
            print(*parts)

    _log("-------------------------Scrape Log Begin--------------------------", "\n")
    # create random user agent so YouTube's algorithm gets bypassed
    agent = UserAgent().random

    browser = await launch()
    try:
        # create incognito context and page
        context = await browser.createIncognitoBrowserContext()
        page = await context.newPage()
        _log("Browser, Incognito Context and Page created")

        # set user agent
        await page.setUserAgent(agent)
        _log("User Agent:", agent)

        # open url
        await page.goto(url_)
        _log("Url opened:", url_)

        # Click the ready element, then press End -- presumably to focus the
        # page and scroll down so lazily loaded content is requested
        # (NOTE(review): confirm intent).
        await page.waitForSelector(ready_selector_)
        await page.click(ready_selector_)

        # asyncio.sleep yields to the event loop; time.sleep would freeze it.
        await asyncio.sleep(scroll_wait_)

        await page.keyboard.press("End")

        # wait until the requested element is present
        await page.waitForSelector(selector_)
        _log("Selector loaded:", selector_)

        await asyncio.sleep(settle_wait_)

        await page.click(selector_)

        # get element from query selector and relating function
        result = await page.querySelectorEval(selector_, page_function_)
        _log("Result loaded:", result)
    finally:
        # Always close the browser so a failed step does not leak the process.
        await browser.close()
        _log("Browser closed", "\n")
        _log("-------------------------Scrape Log End----------------------------", "\n")

    return result


# get YouTube Video Title

url = "https://www.youtube.com/watch?v=dyN_WtjdfpA&list=PLhTjy8cBISEoOtB5_nwykvB9wfEDscuEo"
query_selector = "h1.title"
function = "(element) => element.firstChild.innerHTML"

title = await scrape(url, query_selector, function)
                      
print(title)

-------------------------Scrape Log Begin-------------------------- 

Browser, Incognito Context and Page created
User Agent: Opera/9.80 (X11; Linux x86_64; U; pl) Presto/2.7.62 Version/11.00
Url opened: https://www.youtube.com/watch?v=dyN_WtjdfpA&list=PLhTjy8cBISEoOtB5_nwykvB9wfEDscuEo
Selector loaded: h1.title
Result loaded: Sentiment Analysis Python - 1 -  Introduction to Emotion Analysis  (NLP)
Browser closed 

-------------------------Scrape Log End---------------------------- 

Sentiment Analysis Python - 1 -  Introduction to Emotion Analysis  (NLP)


In [28]:
url = "https://www.youtube.com/watch?v=dyN_WtjdfpA&list=PLhTjy8cBISEoOtB5_nwykvB9wfEDscuEo"
query_selector = "ytd-comment-renderer"
function = "(element) => element.outerHTML"

result = await scrape(url, query_selector, function)
                      
print(result)

-------------------------Scrape Log Begin-------------------------- 

Browser, Incognito Context and Page created
User Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36
Url opened: https://www.youtube.com/watch?v=dyN_WtjdfpA&list=PLhTjy8cBISEoOtB5_nwykvB9wfEDscuEo
Selector loaded: ytd-comment-renderer
Result loaded: <ytd-comment-renderer id="comment" class="style-scope ytd-comment-thread-renderer" comment-style="unknown"><div id="body" class="style-scope ytd-comment-renderer">
  <div id="author-thumbnail" class="style-scope ytd-comment-renderer">
    <a class="yt-simple-endpoint style-scope ytd-comment-renderer" href="/channel/UCRUPE-NvJOOCaRbcw52VY0A">
      <yt-img-shadow fit="" height="40" width="40" class="style-scope ytd-comment-renderer no-transition" style="background-color: transparent;" loaded=""><img id="img" class="style-scope yt-img-shadow" alt="morthim" height="40" width="40" src="https://yt3.ggpht.com/ytc/AAUvwnhBM

In [7]:

# Scraping transaction : 

def _scrape_and_store(tx, httpUrl): # "tx" is a neo4j transaction...
    #... TODO ...
    result = tx.run("")
    return result.single()[0] # TODO


## Using Neo4j for data storage : ##

In [None]:
from neo4j import GraphDatabase

# Neo4j connection settings. Values are taken from the environment so real
# credentials do not have to be stored in the notebook; the literals are only
# local-development fallbacks.
uri = os.environ.get("NEO4J_URI", 'bolt://localhost:7687')
user = os.environ.get("NEO4J_USER", 'neo4j')
password = os.environ.get("NEO4J_PASSWORD", 'neo4j_')


In [None]:
# URLs to process; each one is handed to _scrape_and_store in its own
# write transaction.
httpUrls = [
    #...
]


driver = GraphDatabase.driver(uri, auth=(user, password))

try:
    with driver.session() as session:
        for url in httpUrls :
            result = session.write_transaction(_scrape_and_store, url)
            print(result)
finally:
    # Close the driver even when a transaction raises, so connections
    # are not left open.
    driver.close()