In [1]:
from os import path
from time import sleep

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from trafilatura import extract


def query(string, frame=None):
    '''
    Search a table column by column for a case-insensitive substring.

    Columns are scanned in the order they appear in the table and the
    search stops at the first column that has at least one matching cell.

    Parameters
    ----------
    string : str
        Text to look for; it is passed through ``str()`` and lower-cased
        before comparing.
    frame : pandas.DataFrame, optional
        Table to search. When omitted, the notebook-level ``db`` is used.

    Returns
    -------
    list
        the matching non-null cells of the first column where string
        matched; an empty list (after printing 'No match') when no
        column contains it
    '''
    table = db if frame is None else frame

    # normalise the search term once instead of once per cell
    needle = str(string).lower()

    for column in table.columns:
        # str(cell) keeps the search from failing on non-text cells
        contents = [cell for cell in table[column].dropna()
                    if needle in str(cell).lower()]
        if contents:
            return contents

    print('No match')
    # always hand back a list so callers can use len()/indexing safely
    return []

In [2]:
# when no saved database is present, start from an empty table that
# already has all the columns
if not path.isfile('db.h5'):
    db = pd.DataFrame({}, columns=['Article Text',
                                   'Author',
                                   'Headline',
                                   'Article URL',
                                   'Source Name'])

# otherwise resume from the copy stored on disk
else:
    db = pd.read_hdf('db.h5')

In [3]:
# chromedriver has to be downloaded beforehand from
# https://sites.google.com/chromium.org/driver/ and its directory
# specified; this assumes it is in the same folder as the notebook

# open a Chrome session and load the page that will be scraped
BLOOMBERG_URL = 'https://www.bloomberg.com/technology'
driver = webdriver.Chrome()
driver.get(BLOOMBERG_URL)

In [4]:
# Scrape the Bloomberg page opened in the previous cell: register every
# article link in db, then visit each one to collect author and text.
# Lookups use find_element(s)(By.XPATH, ...) because the
# find_element(s)_by_xpath helpers were removed in Selenium 4.3.
try:
    # look for all the article links in the webpage
    elements = driver.find_elements(By.XPATH, '//a[contains(@href, "/articles/")]')

    # on every link, identify headline, url and attribute them to the designated source
    for article in elements:
        headline = article.text.split('\n')[0]
        if headline:
            entry = len(db)
            db.loc[entry, 'Headline'] = headline
            # drop the query string so the same article always maps to one url
            db.loc[entry, 'Article URL'] = article.get_attribute('href').split('?')[0]
            db.loc[entry, 'Source Name'] = 'Bloomberg Technology'

    # grab the index entries of this source that still have no text
    bloomberglinks = db[(db['Source Name'] == 'Bloomberg Technology')
                        & pd.isna(db['Article Text'])].index

    # for each entry, visit the url and collect the contents
    for link in bloomberglinks:
        url = db.loc[link, 'Article URL']
        try:
            driver.get(url)

            # the byline is optional: find_elements returns an empty list
            # instead of raising, so a missing byline no longer prevents
            # the article text from being stored
            bylines = driver.find_elements(By.XPATH, '//*[address]')
            if bylines:
                for line in bylines[0].text.split('\n'):
                    # require the space so words such as "ByteDance"
                    # are not mistaken for a byline
                    if line.startswith('By '):
                        db.loc[link, 'Author'] = line[3:]

            db.loc[link, 'Article Text'] = extract(driver.page_source)
        except Exception as error:
            # best effort: report the failure and move on; the row keeps
            # an empty 'Article Text' so a later run picks it up again
            print('Skipped %s: %s' % (url, error))

        # pause between requests
        sleep(1)

finally:
    # close chromedriver even if something above failed
    driver.quit()

In [5]:
# the webpage sometimes lists the same article more than once: keep a
# single copy of each row and renumber the index from zero
db = db.drop_duplicates().reset_index(drop=True)

In [6]:
# open a new Chrome session and load the second page to scrape
ELCOMERCIO_URL = 'https://elcomercio.pe/economia/?ref=ecr'
driver = webdriver.Chrome()
driver.get(ELCOMERCIO_URL)

In [7]:
# Scrape the El Comercio page opened in the previous cell: register
# every headline in db, then visit each article to collect its text.
# Lookups use find_elements(By.XPATH, ...) because the
# find_elements_by_xpath helper was removed in Selenium 4.3.
try:
    # look for all the headlines, authors and links in the webpage
    elements = driver.find_elements(By.XPATH, '//h2[a]')
    authors = driver.find_elements(By.XPATH, '//a[contains(@href, "/autor")]')
    links = driver.find_elements(By.XPATH, '//h2//*[@href]')

    # save each in the database and attribute them to the designated source;
    # the three lists are matched by position, so guard against the page
    # exposing fewer links or authors than headlines
    for i, headline in enumerate(elements):
        if i >= len(links):
            print('No link found for %d headline(s); they were not stored'
                  % (len(elements) - i))
            break

        entry = len(db)
        db.loc[entry, 'Headline'] = headline.text
        if i < len(authors):
            db.loc[entry, 'Author'] = authors[i].text
        db.loc[entry, 'Article URL'] = links[i].get_attribute('href')
        db.loc[entry, 'Source Name'] = 'El Comercio'

    # grab the index entries of this source that still have no text
    elcomerciolinks = db[(db['Source Name'] == 'El Comercio')
                         & pd.isna(db['Article Text'])].index

    # for each entry, visit the url and collect the contents
    for link in elcomerciolinks:
        url = db.loc[link, 'Article URL']
        try:
            driver.get(url)
            db.loc[link, 'Article Text'] = extract(driver.page_source)
        except Exception as error:
            # best effort: report the failure and move on; the row keeps
            # an empty 'Article Text' so a later run picks it up again
            print('Skipped %s: %s' % (url, error))

        # pause between requests
        sleep(1)

finally:
    # close chromedriver even if something above failed
    driver.quit()

In [8]:
# display the database as it stands after both sources were scraped
db

Unnamed: 0,Article Text,Author,Headline,Article URL,Source Name
0,Alibaba-Backed Trendyol Seeks Over $1 Billion ...,Myriam Balezou and Ercan Ersoy,Alibaba-Backed Trendyol Seeks Over $1 Billion ...,https://www.bloomberg.com/news/articles/2021-0...,Bloomberg Technology
1,Roku Says YouTube TV May Go Dark on Its Platfo...,Gerry Smith,Roku Says YouTube TV May Go Dark on Its Platfo...,https://www.bloomberg.com/news/articles/2021-0...,Bloomberg Technology
2,Australian Minister’s Phone Hacked as Report R...,Jason Scott and Jamie Tarabay,Australian Minister’s Phone Hacked as Report R...,https://www.bloomberg.com/news/articles/2021-0...,Bloomberg Technology
3,HBO Max Mobile Downloads Dip as ‘Mortal Kombat...,John J Edwards III,HBO Max Mobile Downloads Dip as ‘Mortal Kombat...,https://www.bloomberg.com/news/articles/2021-0...,Bloomberg Technology
4,Apple Ups U.S. Investments Over Five Years to ...,Molly Schuetz,Apple Ups U.S. Investments Over Five Years to ...,https://www.bloomberg.com/news/articles/2021-0...,Bloomberg Technology
...,...,...,...,...,...
85,|CHARTBEAT||https://static.chartbeat.com/opt-o...,CLAUDIA INGA MARTÍNEZ,Emblemático hotel Maury será parte de cadena ‘...,https://elcomercio.pe/economia/dia-1/hoteles-l...,El Comercio
86,,REDACCIÓN EC,Bolsa de Valores de Lima cerró operaciones con...,https://elcomercio.pe/economia/mercados/bolsa-...,El Comercio
87,El tipo de cambio operaba a la baja el viernes...,REDACCIÓN EC,México: ¿cuál es el precio del dólar hoy viern...,https://elcomercio.pe/economia/mercados/mexico...,El Comercio
88,e informes especiales\ndesde la App\nen la App...,REDACCIÓN EC,Congreso aprueba por insistencia la ley de neg...,https://elcomercio.pe/economia/peru/congreso-a...,El Comercio


In [9]:
# count the cells returned by a case-insensitive search for 'Apple'
len(query('Apple'))

5

In [10]:
# take the first matching cell and split it into its individual lines
query('Apple')[0].split('\n')

['Apple Ups U.S. Investments Over Five Years to $430 Billion',
 '- Spending outpaced its original five-year goal of $350 billion',
 '- Aims to create 20,000 new jobs across the country, new campus',
 'Apple Inc. is increasing its U.S. investments by 20% over the next five years, allocating $430 billion to develop next-generation silicon and spur 5G wireless innovation across nine U.S. states, after outstripping its growth expectations during the pandemic.',
 'The iPhone maker will create 20,000 new jobs in the U.S. and fund a new campus in North Carolina, the company said in a statement Monday.',
 'Over the past three years, Apple’s investments have outpaced its original five-year goal of $350 billion set in 2018, the company said. But Apple has also been growing. Its shares have gained about 200% since its last investment plan was announced in January 2018, and the company is now worth more than $2 trillion. Cupertino, California-based Apple said it’s also the U.S.’s biggest taxpayer,

In [11]:
# store the database in db.h5 under the key 'articles'; the cell that
# builds db near the top of the notebook reads this file back when it exists
db.to_hdf('db.h5', key='articles')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['Article Text', 'Author', 'Headline', 'Article URL', 'Source Name'], dtype='object')]

  pytables.to_hdf(
