# Exploring news data to plan feature pipeline

## Get API key

In [1]:
import os
# Read the NewsData API key from the environment so it is never stored in the notebook.
# A missing NEWS_DATA_KEY variable raises KeyError here, before any request is made.
api_key = os.environ["NEWS_DATA_KEY"]


In [2]:
# temporary key access method (add an API secret to the repo)
#with open("./NEWS_DATA_KEY.txt", "r") as file:
#    api_key = file.read()

## Get articles

In [3]:
from newsdataapi import NewsDataApiClient
from datetime import datetime

# Create a NewsData API client
client = NewsDataApiClient(apikey=api_key)

# Today's date in the required format
# NOTE(review): `today` is not used by any visible cell — confirm before removing.
today = datetime.now().strftime('%Y-%m-%d')

# Collect articles from many countries in a single flat list
all_articles = []
country_codes = ['us', 'ca', 'ie', 'gb', 'au', 'nz']
language = 'en'

for country_code in country_codes:
    # One `news_api` call per country, restricted to the chosen language
    response = client.news_api(country=country_code, language=language)

    if response.get('status') == 'success':
        # `results` may be absent or None; treat that as "no articles" instead of failing
        all_articles.extend(response.get('results') or [])
    else:
        # Look for the error text at the top level first, then nested under `results`.
        # NOTE(review): NewsData.io error payloads appear to nest the message under
        # `results` — confirm against the API documentation.
        error_details = response.get('results')
        nested_message = (
            error_details.get('message') if isinstance(error_details, dict) else None
        )
        message = response.get('message') or nested_message or 'Unknown Error'
        # Name the country so a partial failure can be traced to a specific request
        print(f"Failed to retrieve data for country '{country_code}':", message)

In [4]:
# Print one raw article (index 15) to inspect the fields the API returns.
# Raises IndexError if fewer than 16 articles were collected.
print(all_articles[15])

{'article_id': '4ca7f6091b05942f37deb2a97f2eb7de', 'title': 'Ian Wright to step down as Match of the Day pundit at end of current season', 'link': 'https://www.theguardian.com/football/2023/dec/17/ian-wright-match-of-the-day-pundit-bbc', 'keywords': ['Football', 'Sport', 'Arsenal', 'BBC', 'Media', 'Television & radio'], 'creator': ['Guardian sport'], 'video_url': None, 'description': 'Former Arsenal striker announces he will leave role in MayBBC show ‘will always be my Graceland’, says 60-year-oldIan Wright will step down from his regular punditry role with the BBC’s Match of the Day programme at the end of the football season in May 2024. The former Arsenal and England striker announced the news via social media on Sunday morning.“After my debut show whilst still a player in 1997 and many more memorable years, I’ll be stepping back from BBC MOTD at the end of this season,” Wright posted on X, formerly Twitter. “I feel very privileged to have had such an incredible run on the most icon

## Create dataframe

In [5]:
import pandas as pd
# Build one row per collected article, then report how many rows there are.
news_df = pd.DataFrame(all_articles)
print(news_df.shape[0])

60


## Remove null values (to avoid errors with Hopsworks)

In [6]:
# Count the missing values in each column to decide which columns/rows to drop.
news_df.isna().sum()

article_id          0
title               0
link                0
keywords           21
creator            25
video_url          60
description        14
content             0
pubDate             0
image_url          12
source_id           0
source_priority     0
country             0
category            0
language            0
dtype: int64

In [7]:
# Drop the columns that the null count shows as frequently (or always) missing,
# instead of discarding every row that lacks them.
# `errors='ignore'` keeps this cell re-runnable: without it, a second run raises
# KeyError because the columns are already gone.
news_df = news_df.drop(
    columns=['keywords', 'creator', 'video_url', 'image_url'],
    errors='ignore',
)

In [8]:
# Re-check the per-column missing-value counts after dropping the sparse columns.
news_df.isna().sum()

article_id          0
title               0
link                0
description        14
content             0
pubDate             0
source_id           0
source_priority     0
country             0
category            0
language            0
dtype: int64

In [9]:
# Remove every row that still has a missing value in any column,
# then report how many articles remain.
news_df = news_df.dropna()
print(news_df.shape[0])

46


In [16]:
# Normalise `pubDate`: parse the timestamps, keep only the calendar date,
# and store the result with pandas' "string" dtype.
news_df['pubDate'] = (
    pd.to_datetime(news_df['pubDate'])
    .dt.date
    .astype("string")
)

In [19]:
# Sanity check: show the Python type of the first `pubDate` value after conversion.
type(news_df['pubDate'].iloc[0])

str