In [None]:
from selenium import webdriver
import time
import pandas as pd
from scrapy.selector import Selector
from selenium.webdriver.chrome.options import Options

In [None]:
def get_data(path='Youtube_links.csv'):
    '''
    Function to obtain the data and store it in a Pandas DataFrame format
    Inputs:
    path: location of the csv file holding the video urls. Defaults to
          'Youtube_links.csv' in the current working directory.
    Return Values:
    The function returns a pandas DataFrame with only one column by the name of 'video link'.
    Raises:
    ValueError (from pandas) if the csv does not contain exactly one column
    once the saved index column has been removed.
    '''
    ## Reading a csv with links of random youtube videos
    df = pd.read_csv(path)

    ## 'Unnamed: 0' is the index column pandas writes when a frame is saved with
    ## its index. It is dropped when present; errors='ignore' keeps csv files
    ## that were saved without an index working as well.
    df = df.drop(columns='Unnamed: 0', errors='ignore')

    ## Whatever the remaining column was called, expose it under a fixed name
    df.columns = ['video link']
    return df

In [None]:
def _scrape_video(link):
    '''
    Helper that opens one Youtube URL in a fresh Chrome session and scrapes its statistics.
    Inputs:
    link: the url of the video page.
    Return Values:
    A tuple (date, comment_num, like, dislike, view) with the raw values read from the page.
    Raises:
    Any exception raised by Selenium while loading the page or locating the
    view counter is propagated to the caller. The browser is always shut down first.
    '''
    ## Imported here so this block brings its own Selenium names into scope
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    ## Options for the Chrome webdriver
    options = Options()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-gpu")
    options.page_load_strategy = 'eager'

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(link)

        ## A delay is added to give time for the webpage to load
        time.sleep(2)

        ## Snapshot of the page taken before scrolling; used for the date and the likes/dislikes
        sel = Selector(text=driver.page_source)

        ## Scrolling down so that the comment section gets loaded, then waiting for it
        driver.execute_script('window.scrollTo(1, 500);')
        time.sleep(5)

        ## If the comment counter is not present, the comments have been turned off.
        ## The last 9 characters of the counter text are dropped, as in the original code.
        try:
            comment_num = driver.find_element(By.XPATH, "//h2[@id='count']").text[:-9]
        except NoSuchElementException:
            comment_num = "Comments are turned off"

        ## Date of upload (None when the element is missing from the snapshot)
        date = sel.xpath('//*[@id="date"]/yt-formatted-string/text()').get()

        ## A missing aria-label means there are no likes/dislikes for the video
        like_label = sel.xpath('//*[@id="top-level-buttons"]/ytd-toggle-button-renderer[1]/a/yt-formatted-string/@aria-label').get()
        like = like_label[:-6] if like_label is not None else 0
        dislike_label = sel.xpath('//*[@id="top-level-buttons"]/ytd-toggle-button-renderer[2]/a/yt-formatted-string/@aria-label').get()
        dislike = dislike_label[:-9] if dislike_label is not None else 0

        ## NOTE(review): the [60:-12] offsets are kept exactly as in the original code;
        ## presumably they strip fixed text around the number - confirm against a live page.
        view = driver.find_element(By.XPATH, '//*[@id="count"]/yt-view-count-renderer/span[1]').text[60:-12]

        return date, comment_num, like, dislike, view
    finally:
        ## quit() ends the whole browser session, even when scraping failed part-way
        driver.quit()


def data_crawler(youtube_data, max_attempts=3):
    '''
    This function is used to scrape the required content from each of the Youtube URLs and return
    the data in an appropriate format.
    Inputs:
    youtube_data: takes a dataframe with one column by the name of 'video link' which contains the video urls.
    max_attempts: how many times a single url is tried before it is skipped. Defaults to 3.
    Return Values:
    The function returns a dictionary with all the extracted information, which can then be exported to a csv format using Pandas.
    Urls that still fail after max_attempts tries are left out of the result.
    '''

    ## Lists used to store the scraped data, one entry per successfully scraped url
    dates = []
    comments = []
    likes = []
    dislikes = []
    views = []
    links = []

    for link in youtube_data['video link']:
        ## Retrying happens per url, so a failure (for example a faulty internet
        ## connection) never discards rows that have already been collected and
        ## a permanently broken url cannot stall the whole run.
        for _ in range(max_attempts):
            try:
                date, comment_num, like, dislike, view = _scrape_video(link)
            except Exception as e:
                print("The number of files that have been downloaded are: ", len(links))
                print("The process was interrupted. The error that occured was :", e)
            else:
                dates.append(date)
                comments.append(comment_num)
                likes.append(like)
                dislikes.append(dislike)
                views.append(view)
                links.append(link)
                break
        else:
            print("Skipping url after", max_attempts, "failed attempts:", link)

    return {'video link': links,
            'video views': views,
            'uploaded date': dates,
            'comments': comments,
            'likes': likes,
            'dislikes': dislikes}

In [None]:
## Runs the full crawl as soon as this cell is executed: the csv is loaded with
## get_data() and every url is scraped by data_crawler().
## Despite its name, `df` holds the dictionary returned by data_crawler, not a DataFrame.
## NOTE(review): the `__main__` cell further down performs the same crawl again.
df=data_crawler(get_data())

In [None]:
if __name__ == "__main__":

    ## Running the get_data function to load the csv.
    ## The function has to be called here: passing the function object itself
    ## would make data_crawler fail when it indexes 'video link'.
    youtube_data = get_data()

    ## Calling the data_crawler function to extract the required information from the urls
    data = data_crawler(youtube_data)

    ## Since the data that has been obtained is in the form of a dictionary it is converted to a data frame
    data_df = pd.DataFrame(data)

    ## Now the data is exported in the form of a csv
    data_df.to_csv('Youtube_Data.csv')