In [1]:
import pandas as pd
import json
import requests
from datetime import date, time, datetime, timedelta
import numpy as np
import sklearn
import plotly.express as p

### Associated Press Media API

Learned that this doesn't work! We'd need to pay for access to an API key and there's simply some things that aren't worth it.

In [7]:
url = "https://api.ap.org/media/v/content/search?q=Schindler's+List"

api_key = "api_key"  # this would be the thing we'd need to pay for

headers = {
    "x-api-key": api_key
}

url_text = requests.get(url, headers=headers).text    
url_text

'{ "api_version": "v", "api_mode": "", "id": "cd365c55-7b4c-4f16-960a-397fb206fb90", "method": "/content/search.GET" , "error":{"status":401, "code":2000, "message":"Supply a valid apikey" , "timestamp":"2024-05-26T08:47:30.631Z" } }'

### World News API

In [2]:
pip install worldnewsapi

Note: you may need to restart the kernel to use updated packages.


In [None]:
import worldnewsapi as wn
from worldnewsapi.rest import ApiException

api_key = 'd4e50c1f711f4abbb8c8a5b5381ec0df'
config = wn.Configuration(api_key={'apiKey': api_key})

try:
    instance = wn.NewsApi(wn.ApiClient(config))
    response = instance.search_news(
        text='Russia', 
        earliest_publish_date='2000-01-01', 
        latest_publish_date='2024-05-26')
except ApiException as e:
    print("Exception when calling NewsApi:", e)
    
for article in response:
    print(article)

### World News API: Search Version
- Url: https://api.worldnewsapi.com/search-news
- API Key: d4e50c1f711f4abbb8c8a5b5381ec0df
- Params: text=`'search term'`, language=`'en'` or `'ru'`

In [329]:
def search_api(query, api_key, min_date, max_date):
    """
    Description: Grabs intial request from a scrape of the World News Search API.
    
    Args: 
        query(string): args for search term, typically in phrase form
        api_key(string): api key for query
        min_date(string): news must be published after this date 'YYYY-MM-DD'
        max_date(string: news must be published before this date 'YYYY-MM-DD'
        
    Returns:
        df(DataFrame): of 
    """
    df = pd.DataFrame()
    
    headers = {'x-api-key': api_key}
    
    url = f'https://api.worldnewsapi.com/search-news?&text={query}&number=10&earliest-publish-date={min_date}&latest-publish-date={max_date}'
    
    response = requests.get(url, headers=headers).json() # keys = 'offset', 'number', 'available', 'news'
    news = response['news'] # array of dicts
    
    # can also add more to this as we need, but this is the basic stuff as of now
    data = {
        'date': [], 
        'sentiment': [], 
        'text': [], 
        'source_country': [],
        'queried_country': [], 
        'url': [],
#         'summary': []
    }
    
    for item in response.get('news', []):
        if (item.get('sentiment', None) != None) & (item.get('source_country', None) != None):
            data['queried_country'].append(query)
            data['date'].append(item.get('publish_date', None))
            data['sentiment'].append(item.get('sentiment'))
            data['source_country'].append(item.get('source_country'))
            data['text'].append(item.get('text', None))   
            data['url'].append(item.get('url', None))
#             date['summary'].append(item.get('summary', None))
        
    return pd.DataFrame.from_dict(data)

Testing the above funciton and seeing what it outputs below using a day in 2022 with the target query of "Russia".

In [317]:
api_key = 'd4e50c1f711f4abbb8c8a5b5381ec0df'
query = 'russia'
min_date = '2022-11-11'
max_date = '2022-11-12'
response = search_api(query, api_key, min_date, max_date)

In [291]:
response.head()

Unnamed: 0,date,sentiment,text,source_country,queried_country,url
0,2022-11-11 12:00:28,-0.318,"JERUSALEM — Rania Elias, a Palestinian mother ...",us,russia,https://www.washingtonpost.com/world/2022/11/1...
1,2022-11-11 12:02:00,0.258,"PHNOM PENH, Cambodia—President Ferdinand Marco...",ph,russia,https://mb.com.ph/2022/11/11/marcos-urges-noko...
2,2022-11-11 12:02:05,-0.205,Russia's defence ministry said on Friday it ha...,in,russia,https://indianexpress.com/article/world/russia...
3,2022-11-11 12:03:06,-0.154,Journalist Candice Owens on the American Fox N...,in,russia,https://thenewsglory.com/fox-news-host-called-...
4,2022-11-11 12:04:02,-0.489,"MYKOLAIV, Ukraine -- The Russian Defense Minis...",us,russia,https://abcnews.go.com/International/wireStory...


Formatting and grabbing more of the data (getting a lot of data over a long period of time).

In [330]:
def df_data_over_time(query, api_key):
    """
    Description: Grabs information about different countries for one day each month for provided date range.
    
    Args:
        query(array): names of countries to be queried, ideally in full name
        api_key(str): api key
        
    Returns:
        df(DataFrame): extended dataframe containing information about each provided country over past 5 years
    """
    date_str = '%Y-%m-%d'
    end = datetime.today()
    start = end -  timedelta(days=5*365)
    mid = start + timedelta(days=30)
    
    df = pd.DataFrame()
        
    while mid < end:
        for country in query:
            results = search_api(country, api_key, datetime.strftime(start, date_str), datetime.strftime(mid, date_str))
            df = pd.concat([df, results])
            start += timedelta(days=30)
            mid += timedelta(days=30)
            
    return df

In [332]:
query = ['Russia', 'China', 'Belgium', 'United States']
results = df_data_over_time(query, '501f9b4a88b44f148089f24e11da5ae2')

Quick save of the phase 2 data scraping from the World News API.

In [394]:
results.to_csv('Worldwide News.csv')
results.head()

Unnamed: 0,date,sentiment,text,source_country,queried_country,url,safety_index,Safety Index,Country
0,2019-06-12 18:48:59,-0.308,It’s time we start talking about climate chang...,mx,Russia,https://www.amnesty.org/en/latest/news/2019/06...,0.535,0.535,United States
0,2019-07-12 17:14:00,-0.108,"Even now, as more frequent ""king tides"" bubble...",us,China,https://edition.cnn.com/2019/07/11/us/miami-li...,0.535,0.535,United States
0,2019-10-23 15:32:04,0.292,The second meeting of the Board of senior memb...,uz,Russia,http://www.uzreport.com/sco-interbank-associat...,0.535,0.535,United States
1,2019-10-23 15:34:13,0.398,The Shanghai Cooperation Organization establis...,uz,Russia,http://www.uzreport.com/entrepreneur-committee...,0.535,0.535,United States
2,2019-10-23 15:37:39,0.146,All participants of the exhibition “Tea and Co...,uz,Russia,http://www.uzreport.com/over-7000-people-visit...,0.535,0.535,United States


In [398]:
test = results.iloc[:,:6].reset_index().iloc[:,1:]

In [375]:
# df_original = data obtained by the api
# df_lookup = data with codes and country codes
lookup_data = {
'Country': ['United States', 'Russia', 'Belgium', 'China'],
'Safety Index': [0.535, 0.585, 0.7, 0.784286],
'country code': ['us', 'ru', 'be', 'cn']
}

df_lookup = pd.DataFrame(lookup_data)
df_lookup

Unnamed: 0,Country,Safety Index,country code
0,United States,0.535,us
1,Russia,0.585,ru
2,Belgium,0.7,be
3,China,0.784286,cn


In [396]:
for index_results, row_results in test.iterrows():
    print(index_results)
    print(row_results)
    country = row_results['queried_country']
    
    for index_look, row_look in df_lookup.iterrows():
        country_code_look = row_look['Country']
        
        if country == country_code_look:
            test.loc[index_results, 'Safety Index'] = row_look['Safety Index']


0
date                                             2019-06-12 18:48:59
sentiment                                                     -0.308
text               It’s time we start talking about climate chang...
source_country                                                    mx
queried_country                                               Russia
url                https://www.amnesty.org/en/latest/news/2019/06...
Name: 0, dtype: object
1
date                                             2019-07-12 17:14:00
sentiment                                                     -0.108
text               Even now, as more frequent "king tides" bubble...
source_country                                                    us
queried_country                                                China
url                https://edition.cnn.com/2019/07/11/us/miami-li...
Name: 1, dtype: object
2
date                                             2019-10-23 15:32:04
sentiment                                          

date                                             2023-11-07 19:29:12
sentiment                                                        0.0
text               Belgium’s drug gangs want their cocaine back. ...
source_country                                                    eu
queried_country                                              Belgium
url                https://www.politico.eu/article/belgium-race-b...
Name: 343, dtype: object
344
date                                             2023-11-11 07:30:00
sentiment                                                     -0.069
text               Inverness War Memorial to our fallen heroes st...
source_country                                                    gb
queried_country                                              Belgium
url                https://www.inverness-courier.co.uk/news/expla...
Name: 344, dtype: object
345
date                                             2023-11-14 17:08:39
sentiment                                    

The below DataFrame contains the information that we ended up continuing with. It is saved as a CSV locally.

In [397]:
test.head()

Unnamed: 0,date,sentiment,text,source_country,queried_country,url,Safety Index
0,2019-06-12 18:48:59,-0.308,It’s time we start talking about climate chang...,mx,Russia,https://www.amnesty.org/en/latest/news/2019/06...,0.585
1,2019-07-12 17:14:00,-0.108,"Even now, as more frequent ""king tides"" bubble...",us,China,https://edition.cnn.com/2019/07/11/us/miami-li...,0.784286
2,2019-10-23 15:32:04,0.292,The second meeting of the Board of senior memb...,uz,Russia,http://www.uzreport.com/sco-interbank-associat...,0.585
3,2019-10-23 15:34:13,0.398,The Shanghai Cooperation Organization establis...,uz,Russia,http://www.uzreport.com/entrepreneur-committee...,0.585
4,2019-10-23 15:37:39,0.146,All participants of the exhibition “Tea and Co...,uz,Russia,http://www.uzreport.com/over-7000-people-visit...,0.585
