# NewsAPI

NewsAPI is a useful tool for getting recent news headlines and articles (up to 1 month old, 10000 requests/day, for [free](https://newsapi.org/pricing)) from various news sources and blogs. I used NewsAPI primarily to get URLs from a plethora of sources. [Using NewsAPI in Python](https://newsapi.org/docs/client-libraries/python) is straightforward, but one needs to learn [Lucene syntax](http://www.lucenetutorial.com/lucene-query-syntax.html) to get a comprehensive list of all the related articles.

The code here consists of:
1. Querying news metadata (most importantly, URLs and sources) from all possible sources
2. ... specifically from quality sources: NYT, BBC, Bloomberg, WSJ, Washington Post, Economist, AP, Reuters, Politico, National Geographic, New Scientist, Next Big Future
3. ... specifically from conservative sources: Breitbart, Fox News, American Conservative, Washington Times

The metadata was stored as pickled lists of dictionaries before being loaded into dataframes. In retrospect, this was definitely not the most efficient method, but I used it because I wanted to become familiar with pickling objects at the time.

In [None]:
from newsapi import NewsApiClient
import cnfg
from IPython import display
from pprint import pprint
import pickle
# Read the NewsAPI credentials from a per-user config file outside the repo
# (presumably cnfg.load returns a dict-like object -- it is indexed by 'api_key' below).
key = cnfg.load('/Users/lkchemposer/.newsapi_config')
# Client object used by the query cells in this notebook.
newsapi = NewsApiClient(api_key=key['api_key']) # registered from https://newsapi.org/

import pandas as pd
%pylab inline
import seaborn as sns
sns.set()

In [None]:
# Lucene syntax queries (required by NewsAPI)
#
# Shape of the query:
#   "environment" AND ( <long OR-list of environmental topics> )
#   OR <a handful of "environmental ..." phrases that may match on their own>
#
# NOTE(review): two groups mix OR and AND without inner parentheses
# ("planet earth" OR "mother earth" AND ..., and
# "environmental education" OR ... AND (...)); how they are grouped depends on
# the search engine's operator precedence -- confirm they match the intent.
# NOTE(review): NewsAPI documents a maximum length for `q`; confirm a query of
# this size is accepted in full.
queries = '''"environment" AND
            ("energy" OR "green energy" OR "go green" OR
             "pollution" OR "air pollution" OR "water pollution" OR "ocean pollution" OR "land pollution" OR "noise pollution" OR
             "waste" OR "waste management" OR
             "water quality" OR "air quality" OR
             "global warming" OR
            ("global warming" AND ("polar bears" OR "ice cap melting")) OR
             "solar energy" OR "solar power" OR "solar panels" OR
             "climate change" OR "climate march" OR
             "recycling" OR
             "endangered species" OR
            ("electric cars" AND "pollution") OR
             "wind energy" OR "geothermal energy" OR
             "deforestation" OR
            ("al gore" AND "pollution") OR
            ("planet earth" OR "mother earth" AND "nature" AND "pollution") OR
             "epa" OR
             "greenhouse effect" OR "greenhouse gases" OR
            ("fossil fuels" AND "pollution") OR
            ("natural resources" AND "pollution") OR
            ("sustainability" AND "green") OR
             "alternative energy" OR "renewable energy" OR
             "earth day" OR
            ("carbon dioxide" AND "pollution") OR "carbon footprint" OR
             "water conservation" OR "energy conservation" OR "conservation" OR
             "electronic waste" OR "landfill" OR "composting" OR
             "department of energy" OR
             "earth science") OR 
             "environmental health" OR
             "environmental engineer" OR
             "environmental justice" OR "environmental ethics" OR "environmental racism" OR "environmental sociology" OR
             "environmental geography" OR
            ("environmental education" OR "environmental studies" OR "environmental science" AND ("pollution" OR "nature"))'''

In [None]:
# metadata (with exception handling for null results)
def author(article):
    """Return the article's ``'author'`` field, or ``None`` if it cannot be read.

    Args:
        article: Article record, expected to be a mapping with an ``'author'``
            key. ``None`` or any other non-subscriptable value is tolerated.

    Returns:
        The value stored under ``'author'``, or ``None`` when the key is
        missing or ``article`` is not subscriptable.
    """
    try:
        return article['author']
    except (KeyError, TypeError):
        # Missing key / null record means "no value". Other exceptions
        # (e.g. KeyboardInterrupt) are deliberately left to propagate.
        return None

def publishedAt(article):
    """Return the article's ``'publishedAt'`` field, or ``None`` if unavailable.

    Args:
        article: Article record, expected to be a mapping with a
            ``'publishedAt'`` key. ``None`` or any other non-subscriptable
            value is tolerated.

    Returns:
        The value stored under ``'publishedAt'``, or ``None`` when the key is
        missing or ``article`` is not subscriptable.
    """
    try:
        return article['publishedAt']
    except (KeyError, TypeError):
        # Missing key / null record means "no value". Other exceptions
        # (e.g. KeyboardInterrupt) are deliberately left to propagate.
        return None

def title(article):
    """Return the article's ``'title'`` field, or ``None`` if it cannot be read.

    Args:
        article: Article record, expected to be a mapping with a ``'title'``
            key. ``None`` or any other non-subscriptable value is tolerated.

    Returns:
        The value stored under ``'title'``, or ``None`` when the key is
        missing or ``article`` is not subscriptable.
    """
    try:
        return article['title']
    except (KeyError, TypeError):
        # Missing key / null record means "no value". Other exceptions
        # (e.g. KeyboardInterrupt) are deliberately left to propagate.
        return None

def url(article):
    """Return the article's ``'url'`` field, or ``None`` if it cannot be read.

    Args:
        article: Article record, expected to be a mapping with a ``'url'``
            key. ``None`` or any other non-subscriptable value is tolerated.

    Returns:
        The value stored under ``'url'``, or ``None`` when the key is missing
        or ``article`` is not subscriptable.
    """
    try:
        return article['url']
    except (KeyError, TypeError):
        # Missing key / null record means "no value". Other exceptions
        # (e.g. KeyboardInterrupt) are deliberately left to propagate.
        return None

def source(article):
    """Return the name of the article's source, or ``None`` if unavailable.

    Reads ``article['source']['name']``.

    Args:
        article: Article record, expected to be a mapping whose ``'source'``
            entry is itself a mapping with a ``'name'`` key. A missing or
            null value at either level is tolerated.

    Returns:
        The value stored under ``article['source']['name']``, or ``None``
        when either key is missing or either level is not subscriptable.
    """
    try:
        return article['source']['name']
    except (KeyError, TypeError):
        # KeyError: 'source' or 'name' is absent. TypeError: the record or its
        # 'source' entry is None / not subscriptable. Other exceptions
        # (e.g. KeyboardInterrupt) are deliberately left to propagate.
        return None

## ... from Any Sources

In [None]:
# Walk result pages 1..99 (100 articles per page), collecting one metadata
# dict per article. The batch is flushed to a pickle every 10 pages and once
# more after the final page.
newsl = []
for page in range(1, 100):
    response = newsapi.get_everything(q=queries, language='en', sort_by='relevancy',
                                      page_size=100, page=page)
    newsl.extend(
        {'url': url(item),
         'title': title(item),
         'author': author(item),
         'publishedAt': publishedAt(item),
         'source': source(item)}
        for item in response['articles']
    )
    if page % 10 == 0 or page == 99:
        # The file name carries page * 100, i.e. 1000, 2000, ..., 9000, 9900.
        checkpoint = 'newsapi{}.pkl'.format(page * 100)
        with open(checkpoint, 'wb') as p:
            pickle.dump(newsl, p)
        newsl = []  # start a fresh batch

In [None]:
# Rebuild the "any sources" dataframe from the pickled checkpoints.
#
# The fetch loop writes a checkpoint after pages 10, 20, ..., 90 and a last one
# after page 99, naming each file by page * 100. The files on disk are therefore
# newsapi1000.pkl ... newsapi9000.pkl and newsapi9900.pkl; there is no
# newsapi10000.pkl.
checkpoint_pages = list(range(10, 100, 10)) + [99]

rows = []
for page in checkpoint_pages:
    # pickle.load is acceptable here only because these files were written
    # locally by this notebook; never unpickle files from an untrusted source.
    with open('newsapi{}.pkl'.format(page * 100), 'rb') as p:
        articles = pickle.load(p)
    rows.extend({'pub_date': article['publishedAt'],
                 'source': article['source'],
                 'title': article['title'],
                 'url': article['url']} for article in articles)

# Build the frame in one go: appending row by row copies the whole frame each
# time, and DataFrame.append no longer exists in pandas 2.0+.
news = pd.DataFrame(rows, columns=['pub_date', 'source', 'title', 'url'])

news.drop_duplicates(inplace=True) # cleaning

# `infer_datetime_format` is deprecated in pandas 2.x, so it is not passed.
news['pub_date'] = pd.to_datetime(news['pub_date'])

## ... from Quality Sources

In [None]:
# Same paging/checkpoint scheme as the unrestricted query, limited to the
# comma-separated source ids below.
sces = 'the-new-york-times,bbc-news,bloomberg,the-wall-street-journal,the-washington-post,the-economist,associated-press,reuters,politico,national-geographic,new-scientist,next-big-future'

newsl = []
for page in range(1, 100):
    response = newsapi.get_everything(q=queries, language='en', sort_by='relevancy',
                                      page_size=100, page=page, sources=sces)
    newsl.extend(
        {'url': url(item),
         'title': title(item),
         'author': author(item),
         'publishedAt': publishedAt(item),
         'source': source(item)}
        for item in response['articles']
    )
    if page % 10 == 0 or page == 99:
        # The file name carries page * 100, i.e. 1000, 2000, ..., 9000, 9900.
        checkpoint = 'newsapi_legit{}.pkl'.format(page * 100)
        with open(checkpoint, 'wb') as p:
            pickle.dump(newsl, p)
        newsl = []  # start a fresh batch

In [None]:
# Rebuild the "quality sources" dataframe from the pickled checkpoints.
#
# The fetch loop writes a checkpoint after pages 10, 20, ..., 90 and a last one
# after page 99, naming each file by page * 100. The files on disk are therefore
# newsapi_legit1000.pkl ... newsapi_legit9000.pkl and newsapi_legit9900.pkl;
# there is no newsapi_legit10000.pkl.
checkpoint_pages = list(range(10, 100, 10)) + [99]

rows = []
for page in checkpoint_pages:
    # pickle.load is acceptable here only because these files were written
    # locally by this notebook; never unpickle files from an untrusted source.
    with open('newsapi_legit{}.pkl'.format(page * 100), 'rb') as p:
        articles = pickle.load(p)
    rows.extend({'pub_date': article['publishedAt'],
                 'source': article['source'],
                 'title': article['title'],
                 'url': article['url']} for article in articles)

# Build the frame in one go: appending row by row copies the whole frame each
# time, and DataFrame.append no longer exists in pandas 2.0+.
legit = pd.DataFrame(rows, columns=['pub_date', 'source', 'title', 'url'])

legit.drop_duplicates(inplace=True) # cleaning

# `infer_datetime_format` is deprecated in pandas 2.x, so it is not passed.
legit['pub_date'] = pd.to_datetime(legit['pub_date'])

## ... from Conservative Sources

In [None]:
# Same paging/checkpoint scheme as the unrestricted query, limited to the
# comma-separated source ids below.
sces = 'breitbart-news,fox-news,the-american-conservative,the-washington-times'

newsl = []
for page in range(1, 100):
    response = newsapi.get_everything(q=queries, language='en', sort_by='relevancy',
                                      page_size=100, page=page, sources=sces)
    newsl.extend(
        {'url': url(item),
         'title': title(item),
         'author': author(item),
         'publishedAt': publishedAt(item),
         'source': source(item)}
        for item in response['articles']
    )
    if page % 10 == 0 or page == 99:
        # The file name carries page * 100, i.e. 1000, 2000, ..., 9000, 9900.
        checkpoint = 'newsapi_con{}.pkl'.format(page * 100)
        with open(checkpoint, 'wb') as p:
            pickle.dump(newsl, p)
        newsl = []  # start a fresh batch

In [None]:
# Rebuild the "conservative sources" dataframe from the pickled checkpoints.
#
# The fetch loop writes a checkpoint after pages 10, 20, ..., 90 and a last one
# after page 99, naming each file by page * 100. The files on disk are therefore
# newsapi_con1000.pkl ... newsapi_con9000.pkl and newsapi_con9900.pkl;
# there is no newsapi_con10000.pkl.
checkpoint_pages = list(range(10, 100, 10)) + [99]

rows = []
for page in checkpoint_pages:
    # pickle.load is acceptable here only because these files were written
    # locally by this notebook; never unpickle files from an untrusted source.
    with open('newsapi_con{}.pkl'.format(page * 100), 'rb') as p:
        articles = pickle.load(p)
    rows.extend({'pub_date': article['publishedAt'],
                 'source': article['source'],
                 'title': article['title'],
                 'url': article['url']} for article in articles)

# Build the frame in one go: appending row by row copies the whole frame each
# time, and DataFrame.append no longer exists in pandas 2.0+.
con = pd.DataFrame(rows, columns=['pub_date', 'source', 'title', 'url'])

con.drop_duplicates(inplace=True) # cleaning

# `infer_datetime_format` is deprecated in pandas 2.x, so it is not passed.
con['pub_date'] = pd.to_datetime(con['pub_date'])

In [None]:
# combine all dataframes into one

# pd.concat stacks the three frames row-wise and keeps each frame's own row
# labels, as the chained DataFrame.append did; DataFrame.append itself no
# longer exists in pandas 2.0+.
env = pd.concat([news, legit, con])
# The same article can come back from more than one query; keep one copy.
env.drop_duplicates(inplace=True)

In [None]:
# top 20 news sources by volume
# `kind` is passed by keyword: recent pandas versions reject positional
# arguments to Series.plot().
env['source'].value_counts().head(20).plot(kind='barh')