# News API  
https://newsapi.org/

Install modules

In [1]:
import sys
# !{sys.executable} -m pip install 

Load modules

In [2]:
import os
import requests
import json
import pandas as pd
from cassandra.cluster import Cluster
from tqdm import tqdm
from datetime import datetime, date, timedelta

Set paths

In [3]:
base_path = os.path.join(os.path.expanduser('~'), 'bakery')
creds_path = os.path.join(base_path, 'news_api/creds.json')

Load credential file

In [4]:
with open(creds_path, 'rb') as f:
    data = f.read()
    creds = json.loads(data)

Connect to Cassandra cluster

In [5]:
cass_cluster = Cluster()
cass_session = cass_cluster.connect()

##### Get News Table from Cassandra

Send request

In [87]:
endpoint = 'https://newsapi.org/v2/top-headlines/sources'
headers = {'Authorization' : creds['api_key']}
params = {'language': 'en'}

r = requests.get(endpoint, headers=headers, params=params)

In [230]:
r.json()

{'status': 'ok',
 'totalResults': 267,
 'articles': [{'source': {'id': 'associated-press',
    'name': 'Associated Press'},
   'author': 'By FRANK BAJAK - Associated Press',
   'title': 'Twitter breach exposed anonymous account owners',
   'description': 'Twitter says vulnerability in its software that exposed an undetermined number of owners of anonymous accounts to potential identity compromise last year was apparently exploited by a malicious actor. It did not confirm a report that data on 5.4 million…',
   'url': 'https://apnews.com/27382aa1537c97e6266f8a7d9ad977d6',
   'urlToImage': 'https://bloximages.newyork1.vip.townnews.com/roanoke.com/content/tncms/custom/image/00f1ee32-f2d0-11e6-9926-f7249dbfe53a.jpg?resize=600%2C315',
   'publishedAt': '2022-08-05T21:56:49Z',
   'content': "A vulnerability in Twitter's software that exposed an undetermined number of owners of anonymous accounts to potential identity compromise last year was apparently exploited by a malicious actor, the… [+

In [89]:
len(r.json()['sources'])

81

Drop table if exists

In [90]:
cass_session.execute(
    '''
    drop table if exists bakery.sources;
    '''
)

<cassandra.cluster.ResultSet at 0x7fe429703b50>

Create table if not exists

In [91]:
cass_session.execute(
    '''
    create table if not exists bakery.sources (
        id text,
        name text,
        description text,
        url text,
        category text,
        language text,
        country text,
        primary key (id)
    )
    '''
)

<cassandra.cluster.ResultSet at 0x7fe4296b91f0>

Write data to DB table

In [92]:
for s in tqdm(r.json()['sources'], total=len(r.json()['sources'])):
    values = (
        s['id'],
        s['name'],
        s['description'].replace("'", ""),
        s['url'],
        s['category'],
        s['language'],
        s['country']
    )
    cass_session.execute(
        f'''
        insert into bakery.sources (
            id, name, description, url, category, language, country
        ) values {values}
        '''
    )

100%|█████████████████████████████████████████| 81/81 [00:00<00:00, 1638.73it/s]


##### Get top headlines from yesterday

In [234]:
# Get list of sources from Cassandra table
# def list_factory(colnames, rows):
#     return [[row[0], row[1], row[2]] for row in rows]
# cass_session.row_factory = list_factory

# query = "select id, category, language from bakery.sources;"
# sources = cass_session.execute(query, timeout=None)
# sources = [source[0] for source in sources if source[1] in ['business', 'general'] and source[2] == 'en']

In [235]:
# sources

In [50]:
sources = [
    'time',
    'associated-press',
    'the-washington-post',
    'fox-news',
    'abc-news',
    'cnn',
    'nbc-news',
    'msnbc',
    'newsweek',
    'usa-today',
    'cbs-news',
    'reuters',
    'bbc-news',
    'the-wall-street-journal',
    'business-insider',
    'fortune',
    'bloomberg'
]

In [51]:
current_dttm = datetime.now()
from_dttm = (current_dttm - timedelta(days=1)).strftime('%Y-%m-%dT%H:%M:%S')
to_dttm = current_dttm.strftime('%Y-%m-%dT%H:%M:%S')
print(from_dttm, to_dttm)

2022-08-07T11:56:12 2022-08-08T11:56:12


In [52]:
yesterday_dttm = str(date.today() - timedelta(days=1))
max_results = 500
endpoint = 'https://newsapi.org/v2/everything'
headers = {'Authorization' : creds['api_key']}
params = {
    'from': from_dttm,
    'to': to_dttm,
    'language': 'en',
    'sortBy': 'popularity'
}
articles = []

def chunker(ls, size):
    return (ls[pos:pos+size] for pos in range(0, len(ls), size))

for chunk in chunker(sources, 3):
    print(f'Sources: {", ".join(chunk)}')
    page = 1
    params['sources'] = ','.join(chunk)
    params['page'] = 1
    first_r = requests.get(endpoint, headers=headers, params=params)
    articles.extend(first_r.json()['articles'])
    total_results = first_r.json()['totalResults']
    print(f'\tTotal results: {total_results}')
    num_results = len(first_r.json()['articles'])
    if total_results > max_results:
        remaining_results = max_results - num_results
    else:
        remaining_results = total_results - num_results
    print('\tPage: {}, Status: {}, Remaining results: {}'.format(
                params['page'], 
                first_r.json()['status'],
                remaining_results
            )
         )
    while remaining_results > 0:
        params['page'] += 1
        r = requests.get(endpoint, headers=headers, params=params)
        articles.extend(r.json()['articles'])
        remaining_results -= len(r.json()['articles'])
        print('\tPage: {}, Status: {}, Remaining results: {}'.format(
                params['page'], 
                r.json()['status'],
                remaining_results
            )
         )

Sources: time, associated-press, the-washington-post


KeyError: 'articles'

In [53]:
first_r.json()

{'status': 'error',
 'code': 'rateLimited',
 'message': 'You have made too many requests recently. Developer accounts are limited to 100 requests over a 24 hour period (50 requests available every 12 hours). Please upgrade to a paid plan if you need more requests.'}

In [17]:
articles_df = pd.DataFrame(articles)

In [49]:
articles_df[articles_df['title'].str.startswith('Hope fades as ')]

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
549,"{'id': 'usa-today', 'name': 'USA Today'}","USA TODAY, George Petras and Janet Loehrke, US...",Hope fades as starving Beluga whale stuck in F...,"50 miles from Paris, Beluga continues to weake...",https://www.usatoday.com/restricted/?return=ht...,,2022-08-07T19:37:35Z,Skip to main content\r\nThis content is only a...


In [15]:
articles_df[]

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,"{'id': 'bloomberg', 'name': 'Bloomberg'}",Amina Niasse,'Bullet Train' Overcomes Mixed Reviews to Top ...,"“Bullet Train,” a Sony Corp. thriller starring...",https://www.bloomberg.com/news/articles/2022-0...,https://assets.bwbx.io/images/users/iqjWHBFdfx...,2022-08-07T15:49:49Z,"Bullet Train, a Sony Corp. thriller starring B..."
1,"{'id': 'bloomberg', 'name': 'Bloomberg'}",Catarina Saraiva,Daly Says Fed Is 'Far From Done Yet' on Bringi...,San Francisco Fed President Mary Daly suggeste...,https://www.bloomberg.com/news/articles/2022-0...,https://assets.bwbx.io/images/users/iqjWHBFdfx...,2022-08-07T15:26:37Z,San Francisco Fed President Mary Daly suggeste...
2,"{'id': 'bloomberg', 'name': 'Bloomberg'}",Rich Miller,US Inflation Peak in Sight But Debate Rages Ov...,Sky-high US inflation may finally be approachi...,https://www.bloomberg.com/news/articles/2022-0...,https://assets.bwbx.io/images/users/iqjWHBFdfx...,2022-08-07T13:00:01Z,Sky-high US inflation may finally be approachi...
3,"{'id': 'bloomberg', 'name': 'Bloomberg'}","Erik Wasson, Steven T. Dennis, Laura Davison","Senate Passes Democrats' Landmark Tax, Climate...","The Senate passed a landmark tax, climate and ...",https://www.bloomberg.com/news/articles/2022-0...,https://assets.bwbx.io/images/users/iqjWHBFdfx...,2022-08-07T19:18:33Z,"The Senate passed a landmark tax, climate and ..."
4,"{'id': 'bloomberg', 'name': 'Bloomberg'}",Albertina Torsoli,Renault Is On Right Track With New Electric Ve...,Renault SA Chief Executive Officer Luca de Meo...,https://www.bloomberg.com/news/articles/2022-0...,https://assets.bwbx.io/images/users/iqjWHBFdfx...,2022-08-07T11:06:54Z,Renault SA Chief Executive Officer Luca de Meo...
...,...,...,...,...,...,...,...,...
98,"{'id': 'bloomberg', 'name': 'Bloomberg'}",Min Jeong Lee,SoftBank Reports Record $23.4 Billion Loss as ...,SoftBank Group Corp. reported a record 3.16 tr...,https://www.bloomberg.com/news/articles/2022-0...,https://assets.bwbx.io/images/users/iqjWHBFdfx...,2022-08-08T06:58:55Z,SoftBank Group Corp. reported a record 3.16 tr...
99,"{'id': 'bloomberg', 'name': 'Bloomberg'}",,Dennis Gartman on the Markets (Radio),Live market coverage co-anchored from Hong Kon...,https://www.bloomberg.com/news/audio/2022-08-0...,https://assets.bwbx.io/images/users/iqjWHBFdfx...,2022-08-07T22:39:40Z,Live market coverage co-anchored from Hong Kon...
100,"{'id': 'bloomberg', 'name': 'Bloomberg'}",,George Boubouras on the Markets (Radio),Live market coverage co-anchored from Hong Kon...,https://www.bloomberg.com/news/audio/2022-08-0...,https://assets.bwbx.io/images/users/iqjWHBFdfx...,2022-08-08T01:00:36Z,Live market coverage co-anchored from Hong Kon...
101,"{'id': 'bloomberg', 'name': 'Bloomberg'}",,Selina Sia on China Markets (Radio),Live market coverage co-anchored from Hong Kon...,https://www.bloomberg.com/news/audio/2022-08-0...,https://assets.bwbx.io/images/users/iqjWHBFdfx...,2022-08-08T01:40:54Z,Live market coverage co-anchored from Hong Kon...


In [37]:
r.json()['articles'][0]['description'].replace("'", '"')

'Live market coverage co-anchored from Hong Kong and New York. Overnight on Wall Street is daytime in Asia. Markets never sleep, and neither does Bloomberg.'

In [42]:
str(r.json()['articles'][0]['author'])

'None'