In [None]:
from os import path
from time import sleep

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from trafilatura import extract


def query(string, frame=None):
    '''
    Search the database column by column for a case-insensitive substring
    and return the matching cells of the first column that has any match.

    Columns are visited in the order given by ``frame.columns``.

    Parameters
    ----------
    string : str
        Text to look for; it is converted with ``str`` and lower-cased
        before comparison.
    frame : pandas.DataFrame, optional
        Table to search. Defaults to the notebook-level ``db``.

    Returns
    -------
    list
        Non-null cells of the first column containing ``string``;
        an empty list (after printing 'No match') when nothing matches.
    '''
    if frame is None:
        frame = db

    # Normalise the search term once instead of once per cell.
    needle = str(string).lower()

    for column in frame.columns:
        # str(cell) keeps the search from crashing on non-text cells.
        contents = [cell for cell in frame[column].dropna()
                    if needle in str(cell).lower()]
        if contents:
            return contents

    print('No match')
    # Always hand back a list so callers can safely use len() on the result.
    return []

In [None]:
# Reuse the articles stored by a previous run when the HDF5 file is present;
# otherwise begin with an empty table that already carries every column.
db = (
    pd.read_hdf('db.h5')
    if path.isfile('db.h5')
    else pd.DataFrame({}, columns=['Article Text', 'Author', 'Headline',
                                   'Article URL', 'Source Name'])
)

In [None]:
# Start a Chrome WebDriver session (requires chromedriver) and load the
# Bloomberg Technology landing page; the session stays open for the next cell.
driver = webdriver.Chrome()
driver.get('https://www.bloomberg.com/technology')

In [None]:
# Scrape Bloomberg Technology: register every article link found on the
# landing page, then visit each article still lacking text to collect its
# author and body. The browser is closed even if something fails midway.
try:
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
    # removed in Selenium 4.3.
    elements = driver.find_elements(By.XPATH, '//a[contains(@href, "%s")]' % '/articles/')

    # on every link with visible text, store headline, url and source
    for article in elements:
        headline = article.text.split('\n')[0]
        if not headline:
            continue
        entry = len(db)
        db.loc[entry, 'Headline'] = headline
        # drop the query string so the stored URL is the bare article address
        db.loc[entry, 'Article URL'] = article.get_attribute('href').split('?')[0]
        db.loc[entry, 'Source Name'] = 'Bloomberg Technology'

    # index entries of this source that still have no article text
    bloomberglinks = db[(db['Source Name'] == 'Bloomberg Technology')
                        & pd.isna(db['Article Text'])].index

    # for each entry, visit the url and collect the contents
    for link in bloomberglinks:
        url = db.loc[link, 'Article URL']
        try:
            driver.get(url)
            # A page without an <address> parent raises here and is skipped,
            # leaving its text empty so a later run picks it up again.
            keys = driver.find_element(By.XPATH, '//*[address]')

            for line in keys.text.split('\n'):
                # require "By" followed by whitespace so words such as
                # "Bypass" are not mistaken for a byline
                if line[:2] == 'By' and line[2:3].isspace():
                    db.loc[link, 'Author'] = line[3:]

            db.loc[link, 'Article Text'] = extract(driver.page_source)
        except Exception as error:
            # best effort: report the failure and carry on with the next url
            print('Skipping %s: %r' % (url, error))
        # pause between page loads, whether or not the last one succeeded
        sleep(1)
finally:
    # close chromedriver
    driver.quit()

In [None]:
# The landing page can list the same article more than once: keep a single
# copy of each fully identical row and renumber the index from zero.
db = db.drop_duplicates().reset_index(drop=True)

In [None]:
# Start a new Chrome WebDriver session and load the El Comercio economy
# section; the session stays open for the next cell.
driver = webdriver.Chrome()
driver.get('https://elcomercio.pe/economia/?ref=ecr')

In [None]:
# Scrape El Comercio: register headline, author and link for every article
# on the section page, then visit each article still lacking text to collect
# its body. The browser is closed even if something fails midway.
try:
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
    # removed in Selenium 4.3.
    elements = driver.find_elements(By.XPATH, '//h2[a]')
    authors = driver.find_elements(By.XPATH, '//a[contains(@href, "%s")]' % '/autor')
    links = driver.find_elements(By.XPATH, '//h2//*[@href]')

    # The three lists are paired by position.
    # NOTE(review): this assumes the page yields them in the same order - confirm.
    for i, headline in enumerate(elements):
        if i >= len(links):
            # no link left to pair with this or any later headline
            break
        entry = len(db)
        db.loc[entry, 'Headline'] = headline.text
        # fewer author links than headlines used to raise IndexError;
        # such rows now simply keep an empty Author
        if i < len(authors):
            db.loc[entry, 'Author'] = authors[i].text
        db.loc[entry, 'Article URL'] = links[i].get_attribute('href')
        db.loc[entry, 'Source Name'] = 'El Comercio'

    # index entries of this source that still have no article text
    elcomerciolinks = db[(db['Source Name'] == 'El Comercio')
                         & pd.isna(db['Article Text'])].index

    for link in elcomerciolinks:
        url = db.loc[link, 'Article URL']
        try:
            driver.get(url)
            db.loc[link, 'Article Text'] = extract(driver.page_source)
        except Exception as error:
            # best effort: report the failure and carry on with the next url
            print('Skipping %s: %r' % (url, error))
        # pause between page loads, whether or not the last one succeeded
        sleep(1)
finally:
    driver.quit()

In [None]:
# Number of cells matching 'Alphabet'; `or []` covers a search that finds
# nothing and hands back None, which len() cannot take.
len(query('Alphabet') or [])

In [None]:
# First cell matching 'Alphabet'; evaluates to None (no output) when the
# search finds nothing instead of failing on the index lookup.
(query('Alphabet') or [None])[0]

In [None]:
# Persist the table for the next run. mode='w' recreates the file instead of
# replacing the key inside the existing one: HDF5 does not reclaim the space
# of replaced nodes, so the default append mode grows the file on every save.
db.to_hdf('db.h5', key='articles', mode='w')