# Exploring news data to plan feature pipeline

## Get API key

In [1]:
# Temporary key access method (TODO: move the key into a repository secret).
# strip() removes the trailing newline/whitespace that text editors usually
# leave at the end of the key file; without it that whitespace would become
# part of the API key.
with open("./NEWS_DATA_KEY.txt", "r") as file:
    api_key = file.read().strip()

## Get articles

In [2]:
from newsdataapi import NewsDataApiClient
from datetime import datetime

# Create a NewsData API client
client = NewsDataApiClient(apikey=api_key)

# Today's date as YYYY-MM-DD
# NOTE(review): `today` is not used in the cells visible here — confirm a later cell needs it.
today = datetime.now().strftime('%Y-%m-%d')

# Collect articles from many countries in a single flat list
all_articles = []
country_codes = ['us', 'ca', 'ie', 'gb', 'au', 'nz']

for country_code in country_codes:
    # Fetch the news for this country
    response = client.news_api(country=country_code)

    if response.get('status') == 'success':
        # `or []` guards against a success payload whose 'results' is missing or None
        articles = response.get('results') or []
        all_articles.extend(articles)
    else:
        # Name the country so a partial failure can be traced back to its request
        print(f"Failed to retrieve data for country '{country_code}':",
              response.get('message', 'Unknown Error'))

In [5]:
# Inspect one arbitrary article to see which fields the API returns
sample_article = all_articles[15]
print(sample_article)

{'article_id': 'db8df3ebaa48637892ea9fc72851ac79', 'title': 'Switzerland’s Greens fail in a long-shot bid to enter the national government', 'link': 'https://www.winnipegfreepress.com/world/2023/12/13/switzerlands-greens-fail-in-a-long-shot-bid-to-enter-the-national-government', 'keywords': ['World'], 'creator': ['The Associated Press'], 'video_url': None, 'description': 'BERLIN (AP) — Switzerland’s environmentalist Greens failed in a long-shot bid to enter the national government Wednesday as lawmakers elected a new center-left minister to the Alpine country’s executive Federal […]', 'content': "BERLIN (AP) — Switzerland's environmentalist Greens failed in a long-shot bid to enter the national government Wednesday as lawmakers elected a new center-left minister to the Alpine country's executive Federal Council. Read this article for free: Already have an account? To continue reading, please subscribe: * BERLIN (AP) — Switzerland’s environmentalist Greens failed in a long-shot bid to e

## Create dataframe

In [20]:
import pandas as pd
# One row per collected article; report how many rows were loaded
news_df = pd.DataFrame(data=all_articles)
print(news_df.shape[0])

60


## Remove null values (to avoid errors with Hopsworks)

In [21]:
# Count missing values per column
news_df.isnull().sum()

article_id          0
title               0
link                0
keywords           21
creator            24
video_url          60
description         5
content             0
pubDate             0
image_url          24
source_id           0
source_priority     0
country             0
category            0
language            0
dtype: int64

In [22]:
# Drop the columns with the most missing values (21-60 of the 60 rows in the count above).
# errors='ignore' keeps this cell re-runnable: news_df is reassigned here, so on a
# second run the columns are already gone and a plain drop would raise KeyError.
news_df = news_df.drop(
    columns=['keywords', 'creator', 'video_url', 'image_url'],
    errors='ignore',
)

In [23]:
# Re-check missing values per column after removing the sparse columns
news_df.isnull().sum()

article_id         0
title              0
link               0
description        5
content            0
pubDate            0
source_id          0
source_priority    0
country            0
category           0
language           0
dtype: int64

In [24]:
# Discard every row that still contains a missing value, then report the row count
news_df = news_df.dropna(axis=0, how='any')
print(news_df.shape[0])

55
