In [13]:
from selenium.webdriver import Firefox
import datetime
import time
import pandas as pd
from nordvpn_switcher import initialize_VPN,rotate_VPN,terminate_VPN

In [14]:
def fix_query(query):
    '''Function to add the "%20" tag to indicate space between multiworded phrases
    
    Parameters:
    query (str): Word to be corrected
    
    Returns:
    _ (str): Keyword corrected'''
    return "%20".join(query.split(" "))

In [18]:
key_word_list = ["Stock", "Price", "Growth", "Production",
                 "Demand", "Supply", "Outlook", "Market", "Freight"]

def get_data(big_query_list, last_two_weeks=True, bs4_call=False, word_list=key_word_list):
    '''Function to get the news articles from Google news
    
    Parameters:
    big_query_list (list[str]): List of string search words
    last_year (bool): Bool that indicates that the function should only focus on news articles posted in the last year
    bs4_call (bool): Bool to indicate that Beautiful Soup 4 shold be used to scrape the news articles (default is PhantomJS)
    words_list (list[str]): List of string additional words that provide additional search queries, added with each of the big_query_words

    Returns:
    df (pd.DataFrame): Pandas dataframe of the articles the function has scraped. Columns are, article_headlines, subject (big query word) & the link of the article
    articles_so_far (int): Number of articles that the function has scraped from google news 
    '''

    #Initialize the number of articles
    articles_so_far = 0

    #Google Search Base
    article_link_pre = "https://news.google.com/"
    
    article_headlines = []
    article_link = []
    subject_list = []

   
    
    
    #For word in key_word_list that enhances the big query words
    for key_word in key_word_list:

        #For the root word that is in the big_query_list
        for root_query in big_query_list:

            #Fixing the query so it can be searched 
            query = root_query + " " + key_word
            query_fixed = fix_query(query)

            #Query manipulation to decide on exact search term, based on duration of search 
            if last_two_weeks:
                search_term = "https://news.google.com/search?q=" + \
                    query_fixed + "%20when%3A14d&hl=en-CA&gl=CA&ceid=CA%3Aen"
            else:
                search_term = "https://news.google.com/search?q=" + query_fixed + "&hl=en-CA&gl=CA&ceid=CA%3Aen"

            #Scraping headlines elements based on chosen method 
            driver=Firefox("D:/geckodriver-v0.32.0-win-aarch64")     
            print(search_term)
            driver.get(search_term)
            #lastCount = driver.find_elements_by_xpath("//div[@class='m5k28']")
            lastCount = driver.find_elements("xpath","//div[contains(@class,'m5k28']")
            while True:
                #driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                #time.sleep(0.5)
                newCount = driver.find_elements("xpath","//div[contains(@class,'m5k28']")
                if lastCount == newCount:
                    break
                lastCount = newCount
            headline_list = driver.find_elements("xpath","//div[contains(@class,'m5k28']")

            #Identifying the headline text, link and subject 
            for item in headline_list:
                article_headline = item.text
                if article_headline in article_headlines:
                    continue
                if root_query.lower() not in article_headline.lower():
                    continue
                article_headlines.append(article_headline)
                if bs4_call:
                    article_link.append(article_link_pre + item.find("a")["href"][2:])
                else:
                    article_link.append(item.find_element_by_tag_name("a").get_attribute("href"))
                subject_list.append(root_query)
                articles_so_far += 1

            time.sleep(1)
    print("Total news headlines analyzed: {}".format(articles_so_far))
    df = pd.DataFrame(list(zip(article_headlines, subject_list, article_link)),
                      columns=["article_headline", "subject", "article_link"])
    now=datetime.datetime.now()
    code='Data/'+str(now.year)+'_'+str(now.month)+'_'+str(now.day)+'_'+str(now.hour)+'.csv'
    df.drop_duplicates(inplace=True)
    df.to_csv(code)
    return df, articles_so_far

In [19]:
big_query_list = ["Acid",
                    "Alum and Water Treatment"
                    "Aluminum",
                    "Chemtrade"
                    "Commodity Industry",
                    "Commodities",
                    "Copper",
                    "Ethanol",
                    "Fertilizer",
                    "Glencore",
                    "High Value Metals",
                    "Industrial Chemical",
                    "Kennecott",
                    "Lead Acid Batteries",
                    "Lithium",
                    "Mining"
                    "Nickel",
                    "Paper",
                    "Petroleum",
                    "Potable Water",
                    "Pulp"
                    "Rail Cars",
                    "Rio Tinto",
                    "Rail",
                    "Ship",
                    "Shrieve Chemical",
                    "Steel",
                    "Sulfur",
                    "Tampa Sulfur",
                    "Vale",
                    "Vanadium",
                    "Vessel Charges",
                    "Vessel",
                    "Water Treatment",
                    "Zinc"
                  ]

In [20]:
get_data(big_query_list)

  driver=Firefox("D:/geckodriver-v0.32.0-win-aarch64")


https://news.google.com/search?q=Acid%20Stock%20when%3A14d&hl=en-CA&gl=CA&ceid=CA%3Aen


InvalidSelectorException: Message: Given xpath expression "//div[contains(@class,'m5k28']" is invalid: SyntaxError: Document.evaluate: The expression is not a legal expression
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:189:5
InvalidSelectorError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:428:5
find_@chrome://remote/content/shared/DOM.sys.mjs:166:11
dom.find/</findElements<@chrome://remote/content/shared/DOM.sys.mjs:114:24
evalFn@chrome://remote/content/marionette/sync.sys.mjs:117:7
PollPromise/<@chrome://remote/content/marionette/sync.sys.mjs:137:5
PollPromise@chrome://remote/content/marionette/sync.sys.mjs:108:10
dom.find/<@chrome://remote/content/shared/DOM.sys.mjs:112:24
dom.find@chrome://remote/content/shared/DOM.sys.mjs:111:10
findElements@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:264:21
receiveMessage@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:95:31
