In [2]:
!pip install newsapi

Collecting newsapi
  Downloading newsapi-0.1.1-py2.py3-none-any.whl (4.1 kB)
Installing collected packages: newsapi
Successfully installed newsapi-0.1.1


In [10]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-0.21.0-py3-none-any.whl (18 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.21.0


In [23]:
import numpy as np
import pandas as pd
import requests
#from newsapi import NewsApiClient
from datetime import datetime
from dotenv import load_dotenv, find_dotenv
import os

In [24]:
def get_newskey():
    """
    This function will fetch your NEWS_API from the .env file in the root folder.
    Your .env file should contain a like like: NEWS_API='asdfasdfasdfsadf'
    """
    env_path = find_dotenv()
    file = load_dotenv(env_path)
    return os.getenv('NEWS_API')

In [41]:
def get_news(keyword):
    """
    This function will fetch data from NEWS API based on the keyword entered. 
    The API Key required is taken from the function 'get_newskey'.
    
    Mandatory criteria to be included in the News API request:
    
    - personal api key
    - base url "https://newsapi.org/v2/everything?"
    
    Optional criteria that can be added to refine the request:
    
    - searchIn: what parts of the news shall the request refer to 
    to identify matching articles. Possible choices: "content", "title", "description"
    - sortBy: define the way in which the output of the API shall be sorted. 
    Possible choices: "relevancy, popularity, publishedAt"
    - sources: which sources shall the API refer to for the request. 
    List of strings. The Id's of the various sources available can be 
    found via the function get_list_of_API_news_providers() available below
    - from: date where the search shall start. Default: current date
    - q: Keywords to be used for the search to identify suitable results containing such keywords
    
    Further optional criteria that could be included:
    
    - domains: A comma-seperated string of domains (eg bbc.co.uk, techcrunch.com, engadget.com) 
    to restrict the search to.
    - excludeDomains: A comma-seperated string of domains (eg bbc.co.uk, techcrunch.com, engadget.com) 
    to remove from the results.
    - to: A date and optional time for the newest article allowed. This 
    should be in ISO 8601 format (e.g. 2022-11-19 or 2022-11-19T14:34:03)
    - language: The 2-letter ISO-639-1 code of the language you want to 
    get headlines for. Possible options: ar de en es fr he it nl no pt ru sv ud zh.
    - pageSize: int, The number of results to return per page., Default: 100. Maximum: 100.
    - page:int, Use this to page through the results., Default: 1.
    
    Response object
    
    - status: string, If the request was successful or not. 
    Options: ok, error. In the case of error a code and message 
    property will be populated.

    - totalResults: int, The total number of results available for your 
    request. Only a limited number are shown at a time though, so use the 
    page parameter in your requests to page through them.

    - articles: array[article], The results of the request.

    - source: object, The identifier id and a display name name 
    for the source this article came from.

    - author: string, The author of the article

    - title: string, The headline or title of the article.

    - description: string, A description or snippet from the article.

    - url: string, The direct URL to the article.

    - urlToImage: string, The URL to a relevant image for the article.

    - publishedAt: string, The date and time that the article was published, in UTC (+000)

    - content: string, The unformatted content of the article, where available. This is truncated to 200 chars.
    
    """
    
    
    
    
    
    #api_key taken from the get_newskey function
    api_key = get_newskey()
    
    #base url of the API forming the basis for the request
    base_url = "https://newsapi.org/v2/everything?"
    
    #parts of the news articles that the search shall refer to. It is possible to choose between "content", "title", "content"
    search_in = "content"
    
    #criteria for sorting the output of the API
    sort = "popularity" #relevancy, popularity, publishedAt
    
    #web sources to be used
    sources = "cnn" #domains where we would like to search
    
    #date where the search shall start, default via datetime.today: the current date when the API request is made
    date = datetime.today().strftime('%Y-%m-%d')
    
    # line of code to make the actual request based on the variables defined before
    source_url = f'{base_url}q={keyword}&from="{date}"&sortBy={sort}&sources={sources}&searchIn={search_in}&apiKey={api_key}'
    
    news = requests.get(source_url, allow_redirects=True).json()

    return(news)

In [42]:
def get_urls(keyword):
    """
    This function will filter the output of the API resulting in a list 
    of the URL's of the articles included in that output from NEWS API 
    based on the keyword entered. 
    """
    
    api_result = get_news(keyword)
    
    list_of_urls = []
    
    for i in range(len(api_result['articles'])):
        
        list_of_urls.append(api_result['articles'][i]['url'])
    
    return list_of_urls

In [43]:
get_urls("house")

['https://www.cnn.com/2022/11/15/politics/house-republican-vote-kevin-mccarthy/index.html',
 'https://www.cnn.com/2022/11/16/opinions/biden-roadblocks-house-under-gop-control-zelizer/index.html',
 'https://www.cnn.com/2022/11/16/politics/virginia-mclaurin-dies/index.html',
 'https://www.cnn.com/2022/10/29/politics/nancy-pelosi-paul-pelosi-attack-statement/index.html',
 'https://www.cnn.com/2022/10/27/politics/new-york-red-wave-biden-maloney/index.html',
 'https://www.cnn.com/2022/11/14/politics/takeaways-midterm-election-analysis/index.html',
 'https://www.cnn.com/2022/10/26/politics/paul-ryan-donald-trump-president-2024/index.html',
 'https://www.cnn.com/2022/10/26/politics/kevin-mccarthy-house-gop-ukraine-funding-blank-check/index.html',
 'https://www.cnn.com/2022/11/10/politics/what-to-know-vote-counting-arizona-nevada/index.html',
 'https://www.cnn.com/2022/11/05/politics/gallery/road-to-2022-midterms/index.html',
 'https://www.cnn.com/2022/11/05/media/paul-pelosi-attack-nbc-report

In [37]:
def get_API_sources():
    """
    This function will fetch data about the sources available on the News API
    """  
 
    #api_key taken from the get_newskey function
    api_key = get_newskey()

    # line of code to make the actual request based on the variables defined before
    API_sources_url = f"https://newsapi.org/v2/top-headlines/sources?apiKey={api_key}"
    
    API_sources = requests.get(API_sources_url, allow_redirects=True).json()

    return(API_sources)

In [38]:
get_API_sources()

{'status': 'ok',
 'sources': [{'id': 'abc-news',
   'name': 'ABC News',
   'description': 'Your trusted source for breaking news, analysis, exclusive interviews, headlines, and videos at ABCNews.com.',
   'url': 'https://abcnews.go.com',
   'category': 'general',
   'language': 'en',
   'country': 'us'},
  {'id': 'abc-news-au',
   'name': 'ABC News (AU)',
   'description': "Australia's most trusted source of local, national and world news. Comprehensive, independent, in-depth analysis, the latest business, sport, weather and more.",
   'url': 'http://www.abc.net.au/news',
   'category': 'general',
   'language': 'en',
   'country': 'au'},
  {'id': 'aftenposten',
   'name': 'Aftenposten',
   'description': 'Norges ledende nettavis med alltid oppdaterte nyheter innenfor innenriks, utenriks, sport og kultur.',
   'url': 'https://www.aftenposten.no',
   'category': 'general',
   'language': 'no',
   'country': 'no'},
  {'id': 'al-jazeera-english',
   'name': 'Al Jazeera English',
   'descr

In [39]:
def get_list_of_API_news_providers():
    """
    This function will create a list of the id's of the news providers available on the News API
    """
    
    api_sources = get_API_sources()
    
    list_of_API_news_providers = []
    
    for i in range(len(api_sources['sources'])):
        
        list_of_API_news_providers.append(api_sources['sources'][i]['id'])
    
    return list_of_API_news_providers

In [40]:
get_list_of_API_news_providers()

['abc-news',
 'abc-news-au',
 'aftenposten',
 'al-jazeera-english',
 'ansa',
 'argaam',
 'ars-technica',
 'ary-news',
 'associated-press',
 'australian-financial-review',
 'axios',
 'bbc-news',
 'bbc-sport',
 'bild',
 'blasting-news-br',
 'bleacher-report',
 'bloomberg',
 'breitbart-news',
 'business-insider',
 'business-insider-uk',
 'buzzfeed',
 'cbc-news',
 'cbs-news',
 'cnn',
 'cnn-es',
 'crypto-coins-news',
 'der-tagesspiegel',
 'die-zeit',
 'el-mundo',
 'engadget',
 'entertainment-weekly',
 'espn',
 'espn-cric-info',
 'financial-post',
 'focus',
 'football-italia',
 'fortune',
 'four-four-two',
 'fox-news',
 'fox-sports',
 'globo',
 'google-news',
 'google-news-ar',
 'google-news-au',
 'google-news-br',
 'google-news-ca',
 'google-news-fr',
 'google-news-in',
 'google-news-is',
 'google-news-it',
 'google-news-ru',
 'google-news-sa',
 'google-news-uk',
 'goteborgs-posten',
 'gruenderszene',
 'hacker-news',
 'handelsblatt',
 'ign',
 'il-sole-24-ore',
 'independent',
 'infobae',
 '

In [49]:
def get_headlines_for_categories(category, country):
    """
    This function will fetch live, top, breaking headlines from NEWS API based on the category entered. 
    The API Key required is taken from the function 'get_newskey'.
    
    Mandatory criteria to be included in the News API request:
    
    - personal api key
    - base url "https://newsapi.org/v2/top-headlines?"
    
    Possible categories: business, entertainment, general, health, science, sports, technology
    
    Possible countries: ae, ar, at, au, be, bg, br, ca, ch, cn, co, 
    cu, cz, de, eg, fr, gb, gr, hk, hu, id, ie, il, in, it, jp, kr, 
    lt, lv, ma, mx, my, ng, nl, no, nz, ph, pl, pt, ro, rs, ru, sa, 
    se, sg, si, sk, th, tr, tw, ua, us, ve, za, 
    
    IMPORTANT: It is not possible to restrict the search to certain 
    sources if we want to search for a certain category. Therefore, 
    the sources parameter has been left out from the source_url
    """

    
    #api_key taken from the get_newskey function
    api_key = get_newskey()
    
    #base url of the API forming the basis for the request
    base_url = "https://newsapi.org/v2/top-headlines?"
    
    # line of code to make the actual request based on the variables defined before
    source_url = f'{base_url}category={category}&country={country}&apiKey={api_key}'
    
    headlines_for_category = requests.get(source_url, allow_redirects=True).json()

    return(headlines_for_category)   
    

    
    
    

In [53]:
get_headlines_for_categories("sports", "us")

{'status': 'ok',
 'totalResults': 69,
 'articles': [{'source': {'id': None, 'name': 'ESPN'},
   'author': 'Pete Thamel',
   'title': 'Sources - Oregon QB Bo Nix a game-time decision vs. Utah - ESPN',
   'description': "Oregon quarterback Bo Nix remains a game-time decision for Saturday's matchup with No. 10 Utah, sources told ESPN.",
   'url': 'https://www.espn.com/college-football/story/_/id/35056883/oregon-ducks-qb-bo-nix-game-decision-vs-utah-utes',
   'urlToImage': 'https://a4.espncdn.com/combiner/i?img=%2Fphoto%2F2022%2F1119%2Fr1093116_1296x729_16%2D9.jpg',
   'publishedAt': '2022-11-19T14:36:01Z',
   'content': "Oregon quarterback Bo Nix remains a game-time decision for the No. 12 Ducks' matchup with No. 10 Utah on Saturday, sources told ESPN.\r\nNix injured his ankle in Oregon's loss to Washington last week, … [+1370 chars]"},
  {'source': {'id': None, 'name': 'Eleven Warriors'},
   'author': None,
   'title': 'Miyan Williams and Jaxon Smith-Njigba Out, TreVeyon Henderson To Be 

In [60]:
def get_urls_for_categories(category, country):
    """
    This function will filter the output of "get_headlines_for_categories" resulting in a list 
    of the URL's of the articles included in that output from NEWS API 
    based on the category and country entered. 
    """
    
    api_result_for_category = get_headlines_for_categories(category, country)
    
    list_of_urls_for_category = []
    
    for i in range(len(api_result_for_category['articles'])):
        
        list_of_urls_for_category.append(api_result_for_category['articles'][i]['url'])
    
    return list_of_urls_for_category

In [61]:
get_urls_for_categories("sports", "us")

['https://www.espn.com/college-football/story/_/id/35056883/oregon-ducks-qb-bo-nix-game-decision-vs-utah-utes',
 'https://www.elevenwarriors.com/ohio-state-football/2022/11/134957/miyan-williams-and-jaxon-smith-njigba-out-treveyon-henderson-to-be-available-against-maryland',
 'https://www.mlb.com/news/mets-trade-for-elieser-hernandez-jeff-brigham-from-marlins',
 'https://www.cbssports.com/college-football/news/college-football-schedule-games-2022-what-to-watch-in-week-12-tv-channels-saturday-kickoff-times/',
 'https://www.cnn.com/2022/11/19/football/qatar-world-cup-2022-lgbtq-rights-spt-intl/index.html',
 'https://www.foxsports.com/stories/soccer/brazil-argentina-most-popular-picks-in-world-cup-bracket-challenge-enter-now',
 'https://www.freep.com/story/sports/college/michigan-state/spartans/2022/11/19/michigan-state-basketball-jaden-akins-villanova-injury/69663008007/',
 'https://www.youtube.com/watch?v=gz9l1RQ8IEY',
 'https://www.silverscreenandroll.com/2022/11/18/23467518/lakers-vs-