# Exploring news data to plan feature pipeline

## Get API key

In [1]:
import os

# Read the NewsData API key from the environment instead of hard-coding it.
# A missing or empty variable still raises KeyError (as the plain lookup did),
# but with a message that says what needs to be set.
api_key = os.environ.get("NEWS_DATA_KEY")
if not api_key:
    raise KeyError("NEWS_DATA_KEY is not set; export your NewsData API key before running this notebook")


In [2]:
# Temporary key access method (add an API secret to the repo)
#with open("./NEWS_DATA_KEY.txt", "r") as file:
#    api_key = file.read()

## Get articles

In [3]:
from newsdataapi import NewsDataApiClient
from datetime import datetime

# Create a NewsData API client
client = NewsDataApiClient(apikey=api_key)

# Today's date as YYYY-MM-DD
# NOTE(review): `today` is not used by any cell visible here — confirm whether it is still needed.
today = datetime.now().strftime('%Y-%m-%d')

# Collect articles from many countries in one flat list
all_articles = []
country_codes = ['us', 'ca', 'ie', 'gb', 'au', 'nz']
language = 'en'

for country_code in country_codes:
    # One request per country, restricted to the chosen language
    response = client.news_api(country=country_code, language=language)

    if response.get('status') == 'success':
        # Guard against a missing or null 'results' field so one odd payload
        # does not abort the whole collection loop
        articles = response.get('results') or []
        all_articles.extend(articles)
    else:
        # Name the country so a partial failure can be traced to its request
        print(f"Failed to retrieve data for country '{country_code}':", response.get('message', 'Unknown Error'))

In [4]:
# Peek at the first four collected articles to see what a raw record looks like.
print(all_articles[:4])

[{'article_id': '4d48ce03a75e7d160a33bfd3a06bf0ca', 'title': 'The 16 Best Burger Chains In The USA', 'link': 'https://www.thedailymeal.com/1479303/best-american-burger-chains/', 'keywords': ['restaurants'], 'creator': ['staff@thedailymeal.com (Trevor Carlson)'], 'video_url': None, 'description': "For as easy as it is to find a burger in America, the quality is across the board. To help you choose, we've compiled some of the best burger chains in the USA.", 'content': "It's hard to beat a good burger. Rich and tasty yet widely accessible and unpretentious, burgers pack a big punch of beefy flavor like no other, reigning supreme in the land of handheld fare. There's something undeniably satisfying about biting into a burger that's nice and juicy with a luscious touch of char. Fortunately, there are countless burger chains here in the USA to help us get our fix when hunger strikes. Yet, overall burger quality tends to vary quite dramatically between chains. Some chains have mastered the a

## Create dataframe

In [10]:
import pandas as pd

# Build one row per collected article.
news_df = pd.DataFrame(all_articles)

# Report the number of rows and the column labels.
print(news_df.shape[0])
print(news_df.columns.tolist())

60
['article_id', 'title', 'link', 'keywords', 'creator', 'video_url', 'description', 'content', 'pubDate', 'image_url', 'source_id', 'source_priority', 'country', 'category', 'language', 'ai_tag', 'sentiment', 'sentiment_stats']


In [11]:
# Fields to exclude from the working frame.
columns_to_drop = [
    'keywords', 'creator', 'video_url', 'image_url',
    'source_priority', 'ai_tag', 'sentiment', 'sentiment_stats',
]
# Everything else is kept, in its existing column order.
columns_to_keep = [col for col in news_df.columns if col not in columns_to_drop]
print(columns_to_keep)

['article_id', 'title', 'link', 'description', 'content', 'pubDate', 'source_id', 'country', 'category', 'language']


In [12]:
# Narrow the frame to the retained columns, then preview the first rows.
news_df = news_df.loc[:, columns_to_keep]
news_df.head()

Unnamed: 0,article_id,title,link,description,content,pubDate,source_id,country,category,language
0,c0285a29e85cb5cc1b2e7834e66c23bf,Bacon-Wrapped Corn On The Cob Is The Ultimate ...,https://www.thedailymeal.com/1481043/bacon-wra...,Everyone knows that bacon makes everything bet...,Bacon tastes so good that it makes almost any ...,2023-12-31 14:15:35,thedailymeal,[united states of america],[top],english
1,c52b59ce885adcc27cb0492fdaa27be0,Germany's Tiny But Powerful Tank: The Wiesel AWC,https://www.slashgear.com/1479748/germanys-tin...,While your immediate concept of a tank might b...,From the first tank offensive ever on Septembe...,2023-12-31 14:15:16,slashgear,"[united kingdom, united states of america, sin...",[top],english
2,392a094ef21a0ded775c5730946ca111,Good news you may have missed in 2023,https://www.cbsnews.com/news/good-news-you-may...,From technology and medicine to the environmen...,"Let's tune in to ""Sunny Side Up News,"" with yo...",2023-12-31 14:13:41,minnesotacbslocal,[united states of america],[top],english
3,1d786a2b49e98d4397406de7c9e75f24,Liverpool vs. Newcastle United live stream: Ho...,https://www.cbssports.com/soccer/news/liverpoo...,Liverpool are in the thick of the title race,After a strong start to the holiday season for...,2023-12-31 14:08:22,cbssports,[united states of america],[top],english
4,55e408f00465eaf2f36e4aa3e062410c,Tottenham vs. Bournemouth live stream: How to ...,https://www.cbssports.com/soccer/news/tottenha...,Ange Postecoglou's Spurs look to bounce back p...,Tottenham Hotspur welcome Bournemouth in the P...,2023-12-31 14:08:06,cbssports,[united states of america],[top],english


## Remove null values (to avoid errors with Hopsworks)

In [6]:
# Count the missing values in each column.
news_df.isna().sum()

article_id          0
title               0
link                0
keywords           21
creator            18
video_url          60
description        16
content             0
pubDate             0
image_url          12
source_id           0
source_priority     0
country             0
category            0
language            0
ai_tag              0
sentiment           0
sentiment_stats     0
dtype: int64

In [7]:
# List the column labels currently present in the frame.
print(news_df.columns)

Index(['article_id', 'title', 'link', 'keywords', 'creator', 'video_url',
       'description', 'content', 'pubDate', 'image_url', 'source_id',
       'source_priority', 'country', 'category', 'language', 'ai_tag',
       'sentiment', 'sentiment_stats'],
      dtype='object')


In [8]:
# These four columns are already gone once the column-selection cell above has
# run, so a top-to-bottom run would hit KeyError here. Ignore labels that are
# no longer present instead of failing.
news_df = news_df.drop(columns=['keywords', 'creator', 'video_url', 'image_url'], errors='ignore')

In [9]:
# Re-count missing values per column after the drop above.
news_df.isna().sum()

article_id          0
title               0
link                0
description        16
content             0
pubDate             0
source_id           0
source_priority     0
country             0
category            0
language            0
ai_tag              0
sentiment           0
sentiment_stats     0
dtype: int64

In [10]:
# Discard every row that has a missing value in any column, then report how many rows remain.
news_df = news_df.dropna(axis=0, how='any')
print(news_df.shape[0])

44


In [11]:
# Keep only the calendar date of each publication timestamp and store it
# with pandas "string" dtype.
news_df = news_df.assign(
    pubDate=pd.to_datetime(news_df['pubDate']).dt.date.astype("string")
)

In [12]:
# Check the Python type of the first pubDate element.
type(news_df['pubDate'].iloc[0])

str

In [13]:
# Display the category column.
print(news_df['category'])

0               [top]
1               [top]
3               [top]
5             [world]
6               [top]
7              [food]
8               [top]
9               [top]
10       [technology]
11    [entertainment]
12              [top]
13              [top]
14              [top]
15              [top]
16              [top]
17           [sports]
18           [sports]
20              [top]
23              [top]
29              [top]
32         [politics]
33              [top]
34              [top]
35    [entertainment]
36              [top]
37              [top]
38              [top]
43           [sports]
44              [top]
45           [sports]
46              [top]
47           [sports]
48              [top]
49    [entertainment]
50              [top]
51            [world]
52              [top]
53              [top]
54              [top]
55              [top]
56              [top]
57              [top]
58              [top]
59              [top]
Name: category, dtype: object
