In [2]:
!pip install newsapi
!pip install python-dotenv
!pip install beautifulsoup4



In [3]:
import numpy as np
import pandas as pd
import requests
#from newsapi import NewsApiClient
from datetime import datetime
from dotenv import load_dotenv, find_dotenv
import os

from bs4 import BeautifulSoup
import re
from itertools import chain
from collections import Counter

# Functions to interact with News API

In [4]:
def get_newskey():
    """
    Return the News API key stored in the NEWS_API environment variable.

    The .env file located by find_dotenv() is loaded first, so the key can be
    kept out of the notebook. That file should contain a line like:
    NEWS_API='asdfasdfasdfsadf'

    Returns:
        The value of NEWS_API, or None when the variable is not defined.
    """
    # The boolean returned by load_dotenv is not needed, so it is not stored.
    load_dotenv(find_dotenv())
    return os.getenv('NEWS_API')

In [5]:
def get_news(keyword):
    """
    Fetch articles matching `keyword` from the News API "everything" endpoint.

    The API key is obtained from get_newskey(). The keyword is sent as the `q`
    parameter and is URL-encoded by requests, so it may contain spaces or
    characters such as '&' and '#'.

    Mandatory criteria included in the request:

    - personal api key
    - base url "https://newsapi.org/v2/everything"

    Optional criteria currently hard-coded in this function:

    - searchIn: which parts of an article are searched. Set to "content".
      Possible choices: "content", "title", "description"
    - sortBy: how the API output is sorted. Set to "popularity".
      Possible choices: "relevancy", "popularity", "publishedAt"
    - sources: which sources the API searches. Set to "cnn". The ids of the
      available sources can be listed with get_list_of_API_news_providers()
    - from: date where the search shall start. Set to the current date
    - q: keywords used to identify matching articles

    Further optional criteria that could be included:

    - domains: a comma-separated string of domains (eg bbc.co.uk,
      techcrunch.com, engadget.com) to restrict the search to.
    - excludeDomains: a comma-separated string of domains to remove from
      the results.
    - to: a date and optional time for the newest article allowed, in
      ISO 8601 format (e.g. 2022-11-19 or 2022-11-19T14:34:03)
    - language: the 2-letter ISO-639-1 code of the language to get headlines
      for. Possible options: ar de en es fr he it nl no pt ru sv ud zh.
    - pageSize: int, the number of results to return per page.
      Default: 100. Maximum: 100.
    - page: int, use this to page through the results. Default: 1.

    Response object (returned as the decoded JSON dict):

    - status: string, "ok" or "error". In the case of error a code and
      message property will be populated.
    - totalResults: int, the total number of results available for the
      request. Only a limited number are shown at a time; use the page
      parameter to page through them.
    - articles: array[article], the results of the request. Each article has:
        - source: object, the identifier id and display name of the source.
        - author: string, the author of the article.
        - title: string, the headline or title of the article.
        - description: string, a description or snippet from the article.
        - url: string, the direct URL to the article.
        - urlToImage: string, the URL to a relevant image for the article.
        - publishedAt: string, publication date and time, in UTC (+000).
        - content: string, the unformatted content of the article, where
          available. This is truncated to 200 chars.
    """

    # api_key taken from the get_newskey function
    api_key = get_newskey()

    # base url of the API forming the basis for the request
    base_url = "https://newsapi.org/v2/everything"

    # part of the news articles the search refers to:
    # one of "content", "title", "description"
    search_in = "content"

    # criteria for sorting the output of the API
    sort = "popularity"  # relevancy, popularity, publishedAt

    # id of the news source to search
    sources = "cnn"

    # date where the search shall start: the day the request is made
    date = datetime.today().strftime('%Y-%m-%d')

    # Passing the parameters as a dict lets requests percent-encode every
    # value, so a keyword containing '&', '#', '=' or spaces can no longer
    # corrupt the query string.
    params = {
        'q': keyword,
        # NOTE(review): the date is sent wrapped in literal double quotes,
        # exactly as the previous hand-built URL did. Presumably the API
        # expects a bare ISO 8601 date; confirm whether this filter is
        # honoured before dropping the quotes, because doing so may change
        # which articles are returned.
        'from': f'"{date}"',
        'sortBy': sort,
        'sources': sources,
        'searchIn': search_in,
        'apiKey': api_key,
    }

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(base_url, params=params, allow_redirects=True, timeout=30)

    return response.json()

In [6]:
def get_urls(keyword, limit=10):
    """
    Return the URLs of the articles the News API finds for `keyword`.

    Args:
        keyword: search term forwarded to get_news().
        limit: maximum number of URLs to return (default 10).

    Returns:
        list of str: at most `limit` article URLs, in the order the API
        returned them.

    Raises:
        KeyError: if the API response has no 'articles' field (for example
            an error payload); the message includes the status and message
            reported by the API.
    """
    api_result = get_news(keyword)

    # An error payload carries 'status'/'message' instead of 'articles';
    # surface those details instead of a bare KeyError('articles').
    if 'articles' not in api_result:
        raise KeyError(
            "News API response has no 'articles' field "
            f"(status={api_result.get('status')!r}, message={api_result.get('message')!r})"
        )

    return [article['url'] for article in api_result['articles'][:limit]]

In [7]:
def get_API_sources():
    """
    Fetch the description of the sources available on the News API.

    The API key is obtained from get_newskey() and sent as the `apiKey`
    query parameter.

    Returns:
        dict: the decoded JSON response of the
        "https://newsapi.org/v2/top-headlines/sources" endpoint.
    """
    # api_key taken from the get_newskey function
    api_key = get_newskey()

    API_sources_url = "https://newsapi.org/v2/top-headlines/sources"

    # params= lets requests encode the key; the timeout prevents the call
    # from blocking indefinitely on a stalled connection.
    response = requests.get(
        API_sources_url,
        params={'apiKey': api_key},
        allow_redirects=True,
        timeout=30,
    )

    return response.json()

In [8]:
def get_list_of_API_news_providers():
    """
    Return the ids of the news providers available on the News API.

    Returns:
        list of str: the 'id' of every entry in the 'sources' field of the
        get_API_sources() response, in the order returned.

    Raises:
        KeyError: if the response has no 'sources' field (for example an
            error payload); the message includes the status and message
            reported by the API.
    """
    api_sources = get_API_sources()

    # An error payload carries 'status'/'message' instead of 'sources';
    # surface those details instead of a bare KeyError('sources').
    if 'sources' not in api_sources:
        raise KeyError(
            "News API response has no 'sources' field "
            f"(status={api_sources.get('status')!r}, message={api_sources.get('message')!r})"
        )

    return [source['id'] for source in api_sources['sources']]

In [9]:
def get_headlines_for_categories(category, country):
    """
    Fetch live, top, breaking headlines from the News API for a category and country.

    The API key is obtained from get_newskey().

    Mandatory criteria included in the request:

    - personal api key
    - base url "https://newsapi.org/v2/top-headlines"

    Possible categories: business, entertainment, general, health, science,
    sports, technology

    Possible countries: ae, ar, at, au, be, bg, br, ca, ch, cn, co,
    cu, cz, de, eg, fr, gb, gr, hk, hu, id, ie, il, in, it, jp, kr,
    lt, lv, ma, mx, my, ng, nl, no, nz, ph, pl, pt, ro, rs, ru, sa,
    se, sg, si, sk, th, tr, tw, ua, us, ve, za

    IMPORTANT: It is not possible to restrict the search to certain
    sources when searching for a category. Therefore, the sources
    parameter is not part of this request.

    Args:
        category: one of the categories listed above.
        country: one of the 2-letter country codes listed above.

    Returns:
        dict: the decoded JSON response of the top-headlines endpoint.
    """
    # api_key taken from the get_newskey function
    api_key = get_newskey()

    # base url of the API forming the basis for the request
    base_url = "https://newsapi.org/v2/top-headlines"

    # Parameters go through params= so requests URL-encodes each value
    # rather than pasting raw text into the query string.
    params = {
        'category': category,
        'country': country,
        'apiKey': api_key,
    }

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(base_url, params=params, allow_redirects=True, timeout=30)

    return response.json()

In [10]:
def get_urls_for_categories(category, country='us', limit=10):
    """
    Return the URLs of the top headlines for a category and country.

    Args:
        category: category forwarded to get_headlines_for_categories().
        country: 2-letter country code forwarded to
            get_headlines_for_categories() (default 'us').
        limit: maximum number of URLs to return (default 10).

    Returns:
        list of str: at most `limit` article URLs, in the order the API
        returned them.

    Raises:
        KeyError: if the API response has no 'articles' field (for example
            an error payload); the message includes the status and message
            reported by the API.
    """
    api_result_for_category = get_headlines_for_categories(category, country)

    # An error payload carries 'status'/'message' instead of 'articles';
    # surface those details instead of a bare KeyError('articles').
    if 'articles' not in api_result_for_category:
        raise KeyError(
            "News API response has no 'articles' field "
            f"(status={api_result_for_category.get('status')!r}, "
            f"message={api_result_for_category.get('message')!r})"
        )

    return [article['url'] for article in api_result_for_category['articles'][:limit]]

In [11]:
# get_urls("house")

In [12]:
# get_API_sources()

In [13]:
# get_list_of_API_news_providers()

In [14]:
# get_headlines_for_categories("sports", "us")

In [15]:
# get_urls_for_categories("sports", "us")

# Webscraper functions

In [16]:
def CNN_scraper(url):
    """
    Input: 'str'
    Output: 'dict'

    The function receives a url (must be from CNN), fetches the html and uses BS4 to extract the
    <p> tags with class='paragraph inline-placeholder', which contain the text. Each paragraph is
    stripped, the paragraphs are joined with a single space and non-breaking spaces are replaced
    by regular spaces. The title is the part of the page <title> before the first '|'
    (an empty string when the page has no usable <title>).
    It returns a dictionary with the title and the text of the news; 'id' and 'orig_id' are
    always 0.
    """
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    paragraphs = soup.find_all('p', class_='paragraph inline-placeholder')
    # Join with a space (as General_scraper does) so the last word of one
    # paragraph is not glued to the first word of the next.
    text = ' '.join(item.text.strip() for item in paragraphs).replace('\xa0', ' ')

    # Pages without a <title>, or with a non-string title, would otherwise
    # raise AttributeError here.
    title_tag = soup.title
    if title_tag is not None and title_tag.string:
        title = title_tag.string.split('|')[0]
    else:
        title = ''

    return {'title': title, 'article': text, 'id': 0, 'orig_id': 0}

In [17]:
def DailyMail_scraper(url):
    """
    Input: 'str'
    Output: 'dict'

    The function receives a url (must be from DailyMail), fetches the html and uses BS4 to extract
    the <p> tags with class='mol-para-with-font', which contain the text. Each paragraph is
    stripped, the paragraphs are joined with a single space and non-breaking spaces are replaced
    by regular spaces. The title is the part of the page <title> before the first '|'
    (an empty string when the page has no usable <title>).
    It returns a dictionary with the title and the text of the news; 'id' and 'orig_id' are
    always 0.
    """
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    paragraphs = soup.find_all('p', class_='mol-para-with-font')
    # Join with a space (as General_scraper does) so the last word of one
    # paragraph is not glued to the first word of the next.
    text = ' '.join(item.text.strip() for item in paragraphs).replace('\xa0', ' ')

    # Pages without a <title>, or with a non-string title, would otherwise
    # raise AttributeError here.
    title_tag = soup.title
    if title_tag is not None and title_tag.string:
        title = title_tag.string.split('|')[0]
    else:
        title = ''

    return {'title': title, 'article': text, 'id': 0, 'orig_id': 0}

In [18]:
def General_scraper(url):
    """
    Input: 'str'
    Output: 'dict'

    The function receives a url, fetches the html and uses BS4 to extract the <p> tags. It counts
    how often each class attribute occurs among those tags and keeps the paragraphs carrying the
    most frequent one (in a news page this is expected to be the article text). When several
    classes are equally frequent, the one seen first in the page wins. When no paragraph has a
    class at all, every <p> tag is used.

    The selected paragraphs are stripped, joined with a single space, and non-breaking spaces are
    replaced by regular spaces. The title is the part of the page <title> before the first '|'
    (an empty string when the page has no usable <title>).
    It returns a dictionary with the title and the text of the news; 'id' and 'orig_id' are
    always 0.
    """
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = soup.find_all('p')

    # Count class attributes only for tags that actually have one; a <p>
    # carrying other attributes (e.g. only an id) must not raise KeyError.
    class_counts = Counter(
        ' '.join(item.attrs['class'])
        for item in paragraphs
        if item.attrs.get('class')
    )

    if class_counts:
        # most_common(1) always yields exactly one class name, also on ties.
        text_class = class_counts.most_common(1)[0][0]
        paragraphs = soup.find_all('p', class_=text_class)
    # Otherwise no paragraph is classed: keep every <p> found above.

    text = ' '.join(item.text.strip() for item in paragraphs).replace('\xa0', ' ')

    # Pages without a <title>, or with a non-string title, would otherwise
    # raise AttributeError here.
    title_tag = soup.title
    if title_tag is not None and title_tag.string:
        title = title_tag.string.split('|')[0]
    else:
        title = ''

    return {'title': title, 'article': text, 'id': 0, 'orig_id': 0}

# Testing the combination of both sets of functions

In [19]:
# Fetch article URLs for the keyword "house" (get_urls returns at most 10 by default).
urls = get_urls("house")

In [20]:
# Scrape the first URL returned above; the cell displays the resulting dict.
General_scraper(urls[0])

{'title': 'Kevin McCarthy beats far-right challenger 188-31 to lead House GOP ',
 'article': 'The House Republican conference voted for House Minority Leader Kevin McCarthy to be its leader after an underwhelming midterm election performance launched a search among conservatives for a challenger. The vote puts McCarthy in line to be the next speaker of the House, presuming Republicans win at least three more seats of the 16 that have not yet been projected by CNN. McCarthy won 188-31 against Arizona Rep. Andy Biggs, according to multiple sources in the room. It was a secret ballot, and McCarthy only needed to earn a simple majority of the conference. While Biggs’ challenge was always viewed as a long shot, the number of Republicans who voted for him underscored the level of frustration with the Republican leadership following the 2022 elections. In January, McCarthy must win 218 votes, a majority of the entire House, to wield the speaker’s gavel. Three House Republicans — Kat Cammack o

In [21]:
def get_articles(keyword, limit=10):
    """
    Scrape the articles the News API finds for `keyword` into a DataFrame.

    The URLs come from get_urls() and each one is scraped with
    General_scraper().

    Args:
        keyword: search term forwarded to get_urls().
        limit: maximum number of articles to fetch (default 10, forwarded
            to get_urls()).

    Returns:
        pandas.DataFrame: one row per scraped URL, built from the dicts
        returned by General_scraper (keys 'title', 'article', 'id',
        'orig_id'). When no URL is found the frame is empty but still has
        those four columns.
    """
    urls = get_urls(keyword, limit=limit)
    articles = [General_scraper(url) for url in urls]

    if not articles:
        # pd.DataFrame([]) would have no columns at all; keep the schema
        # stable so downstream column access still works on empty results.
        return pd.DataFrame(columns=['title', 'article', 'id', 'orig_id'])

    return pd.DataFrame(articles)

In [22]:
# Build a DataFrame of scraped articles for one example keyword.
keyword='business'
df = get_articles(keyword)

In [23]:
# Display the scraped articles, one row per URL.
df

Unnamed: 0,title,article,id,orig_id
0,Retail sales surged by 1.3% in October as cons...,"American shoppers spent more in October, showi...",0,0
1,Jeff Bezos' top tips for managing the economic...,Amazon founder Jeff Bezos recently warned cons...,0,0
2,Opinion: I wouldn't bet against Bob Iger's ret...,Bob Iger’s return to The Walt Disney Company i...,0,0
3,Oatly plans job cuts as investors sour,Oat milk brand Oatly is planning to cut jobs a...,0,0
4,The US monthly budget deficit dropped to $88 b...,The US government recorded a monthly budget de...,0,0
5,Inflation data shows US prices were still unco...,A new batch of inflation data released Friday ...,0,0
6,Sam Bankman-Fried's $16 billion fortune evapor...,Sam Bankman-Fried woke up on Monday still a bi...,0,0
7,Consumer confidence fell in October as inflati...,US consumer confidence fell in October to the ...,0,0
8,Taylor Swift fans irate at Ticketmaster as hug...,There’s some bad blood brewing between Taylor ...,0,0
9,George W. Bush: Former US president to hold a ...,Former US President George W. Bush will hold a...,0,0
