In [1]:
import requests , nltk , time , os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from pandas import DataFrame
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from selenium.webdriver.chrome.options import Options

In [2]:
def set_driver(driver_name):
    """Create a headless Selenium WebDriver for the requested browser.

    Parameters
    ----------
    driver_name : str
        Name of the browser to drive. Only 'GoogleChrome' is implemented;
        'FireFox' is recognised but has no implementation yet.

    Returns
    -------
    selenium.webdriver.Chrome
        A Chrome driver started with the ``--headless`` argument. The
        chromedriver executable is located by Selenium itself; no explicit
        path is passed.

    Raises
    ------
    NotImplementedError
        If ``driver_name`` is 'FireFox'.
    ValueError
        If ``driver_name`` is any other unsupported value.
    """
    # Validate first so an unsupported name fails with a clear message
    # instead of an UnboundLocalError on the return statement.
    if driver_name == 'FireFox':
        raise NotImplementedError("The 'FireFox' driver is not implemented yet.")
    if driver_name != 'GoogleChrome':
        raise ValueError(
            "Unsupported driver_name {!r}; expected 'GoogleChrome'.".format(driver_name)
        )

    options = Options()
    # Passing the flag explicitly works across Selenium releases, whereas the
    # `options.headless` setter is deprecated/removed in newer 4.x versions.
    options.add_argument('--headless')

    return webdriver.Chrome(options=options)

In [3]:
def google_news_scraper(driver , keyword , number_of_pages):
    """Scrape Google News result pages for ``keyword``.

    Each result page is downloaded with ``requests`` and parsed with
    BeautifulSoup. The Selenium ``driver`` is only used to follow the
    "Next" link in order to obtain the URL of the following page.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser used to find and click the "Next" link. It is quit by this
        function when no "Next" link can be found.
    keyword : str
        Plain (not yet URL-encoded) search term; it is URL-encoded here.
    number_of_pages : int
        Maximum number of result pages to scrape. Values <= 0 scrape nothing.

    Returns
    -------
    tuple
        ``(title_list, source_list, date_list, description_list,
        description_list)``. The description list appears twice; this is
        kept so existing callers that unpack five values keep working.
        The lists are filled independently and may differ in length.
    """
    from urllib.parse import quote_plus

    # Encode the keyword so spaces, '&', '#', ... cannot corrupt the query string.
    url = ('https://www.google.com/search?q=' + quote_plus(str(keyword)) +
           '&hl=en&source=lnms&tbm=nws&sa=X&ved=0ahUKEwi39q3H_ZjdAhXnMewKHbszDXIQ_AUICygC&biw=1536&bih=728&dpr=1.25')
    source_list = []
    date_list = []
    title_list = []
    description_list = []
    pages_scraped = 0

    # Counting scraped pages with `<` scrapes exactly `number_of_pages` pages
    # and terminates immediately for zero or negative values.
    while pages_scraped < number_of_pages:
        # Some delay every ten pages in order to not get blocked.
        if pages_scraped and pages_scraped % 10 == 0:
            time.sleep(1)

        # Parse the webpage
        response = requests.get(url)
        soup = BeautifulSoup(response.text , 'html5lib')

        # Grab the Headline
        for title in soup.find_all('h3'):
            title_list.append(title.text)

        # Grab the Source and Date ("<source> - <date>")
        for sd in soup.find_all('div',{'class':'slp'}):
            info = sd.text.split(' - ')
            source_list.append(info[0])
            date_list.append(info[-1])

        # Grab the Description
        for desc in soup.find_all('div',{'class':'st'}):
            description_list.append(desc.text)

        pages_scraped += 1
        if pages_scraped >= number_of_pages:
            # Last requested page: no need to look up the next URL.
            break

        # Get the next page url
        try:
            driver.get(url)
            # Generic locator form works on Selenium 3 and 4; the
            # find_element_by_* helpers were removed in Selenium 4.3.
            driver.find_element('link text', 'Next').click()
            url = driver.current_url
        except NoSuchElementException:
            print("No more pages left!")
            driver.quit()
            break

    return title_list , source_list , date_list , description_list , description_list

In [4]:
def get_sentiment(text):
    """Label ``text`` as positive ('1') or negative ('0') from VADER scores.

    The label is derived from the 'pos', 'neg' and 'neu' values returned by
    ``SentimentIntensityAnalyzer.polarity_scores``. When neither rule below
    matches, the text is re-scored with the word 'Good ' prepended.

    Parameters
    ----------
    text : str
        Sentence to classify.

    Returns
    -------
    str
        '1' for positive, '0' for negative.
    """
    # Build the analyzer once and reuse it; constructing it on every call
    # (and again on every recursive call) repeats the same setup work.
    analyzer = getattr(get_sentiment, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        get_sentiment._analyzer = analyzer

    ps = analyzer.polarity_scores(text)
    pos, neg, neu = ps['pos'], ps['neg'], ps['neu']

    # Positive sentiments: some positive signal and negativity not dominant.
    if neg <= 0.5 and pos > 0:
        if pos - neg >= 0 or neu >= 0.5:
            return '1'
        return '0'

    # Negative sentiments: some negative signal and positivity not dominant.
    if pos <= 0.5 and neg > 0:
        if neg - pos >= 0 or neu <= 0.5:
            return '0'
        return '1'

    # Neither rule matched: bias the text with a positive word and re-score.
    return get_sentiment('Good ' + text)

In [5]:
def news_analysis(title_list , description_list):
    """Label every headline and summarise the share of each label.

    Each headline is classified with ``get_sentiment``. A '1' or '0' result
    is used as-is; a '?' result falls back to classifying the description at
    the same position (when one exists); anything else is recorded as '?'.

    Parameters
    ----------
    title_list : list of str
        Headlines to classify.
    description_list : list of str
        Descriptions, used only as a fallback for headlines labelled '?'.

    Returns
    -------
    tuple
        ``(labels_list, positive_news_percentage, negative_news_percentage,
        unknown_news_percentage)``. Percentages are relative to the number
        of labelled headlines and are all 0.0 when there are no headlines.
    """
    labels_list = []

    for index, sentence in enumerate(title_list):
        sentiment = get_sentiment(sentence)
        if sentiment in ('1', '0'):
            labels_list.append(sentiment)
        elif sentiment == '?' and index < len(description_list):
            # Headline was inconclusive: fall back to the matching description.
            labels_list.append(get_sentiment(description_list[index]))
        else:
            labels_list.append('?')

    # Labels are produced per headline, so they are the correct denominator;
    # dividing by len(description_list) skews the result when the lists
    # differ in length and crashes when no descriptions were scraped.
    total = len(labels_list)
    if total == 0:
        return labels_list , 0.0 , 0.0 , 0.0

    positive_news_percentage = labels_list.count('1')/total*100
    negative_news_percentage = labels_list.count('0')/total*100
    unknown_news_percentage = labels_list.count('?')/total*100

    return labels_list , positive_news_percentage , negative_news_percentage , unknown_news_percentage

In [6]:
def save_to_csv(file_name , title_list , source_list , date_list , description_list , labels_list):
    """Write the scraped columns to ``<file_name>.xlsx``.

    Despite the function name, the output is an Excel workbook written with
    ``DataFrame.to_excel``: a single sheet called 'sheet1' and no index
    column. The columns are Headline, Source, Date/Time, Description and
    Labels, in that order.
    """
    columns = {
        'Headline': title_list,
        'Source': source_list,
        'Date/Time': date_list,
        'Description': description_list,
        'Labels': labels_list,
    }
    output_path = ''.join((str(file_name), '.xlsx'))
    DataFrame(columns).to_excel(output_path , sheet_name = 'sheet1' , index = False)

In [7]:
# Run the whole pipeline: scrape Google News for 'canada', label the
# headlines and export the result to 'Canada News.xlsx'.
driver = set_driver('GoogleChrome')
try:
    # The scraper returns the description list twice, hence the repeated name.
    title_list , source_list , date_list , description_list , description_list = google_news_scraper(driver , 'canada' , 20)
finally:
    # Always release the browser, even if scraping raised. The scraper quits
    # the driver itself when it runs out of pages, so a second quit() may
    # fail; that failure is deliberately ignored.
    try:
        driver.quit()
    except Exception:
        pass
labels_list , positive_news_percentage , negative_news_percentage , unknown_news_percentage = news_analysis(title_list , description_list)
save_to_csv('Canada News' , title_list , source_list , date_list , description_list , labels_list)