# Exploring news data to plan feature pipeline

## Get API key

In [2]:
import os
# Read the NewsData API key from the environment so the secret never lives in the notebook.
# Raises KeyError if NEWS_DATA_KEY is not set.
api_key = os.environ["NEWS_DATA_KEY"]


[REDACTED: API key removed from saved cell output]


In [1]:
# temporary key access method (add an API secret to the repo)
#with open("./NEWS_DATA_KEY.txt", "r") as file:
#    api_key = file.read()

## Get articles

In [18]:
from newsdataapi import NewsDataApiClient
from datetime import datetime

# Build the NewsData client from the key loaded in the earlier cell.
client = NewsDataApiClient(apikey=api_key)

# Current date formatted as YYYY-MM-DD (not referenced by the requests in this cell).
today = datetime.now().strftime('%Y-%m-%d')

# Query parameters: one request per country code, all restricted to English.
language = 'en'
country_codes = ['us', 'ca', 'ie', 'gb', 'au', 'nz']

# Accumulates the article records returned for every country.
all_articles = []

for country_code in country_codes:
    response = client.news_api(country=country_code, language=language)

    # Anything other than an explicit success status is reported and skipped.
    if response.get('status') != 'success':
        print("Failed to retrieve data:", response.get('message', 'Unknown Error'))
        continue

    # Keep every article from this response, in the order it was returned.
    articles = response['results']
    all_articles.extend(articles)

In [19]:
# Inspect one raw article record (index 15) to see which fields a record carries.
sample_article = all_articles[15]
print(sample_article)

{'article_id': '4a3e7c5b0356439cd887c443334e82c4', 'title': 'Madison Beer: “I can’t spend forever trying to beg people who refuse to give me a shot”', 'link': 'https://www.nme.com/features/madison-beer-i-cant-spend-forever-trying-to-beg-people-who-refuse-to-give-me-a-shot-3559521?utm_source=rss&utm_medium=rss&utm_campaign=madison-beer-i-cant-spend-forever-trying-to-beg-people-who-refuse-to-give-me-a-shot', 'keywords': ['Features', 'Music Features', 'Music Interviews'], 'creator': ['Paul Bugler'], 'video_url': None, 'description': 'Madison Beer on new album ‘Silence Between Songs’, her memoir ‘The Half of It” and love for 60’s music The post Madison Beer: “I can’t spend forever trying to beg people who refuse to give me a shot” appeared first on NME.', 'content': 'Madison Beer on new album ‘Silence Between Songs’, her memoir ‘The Half of It” and love for 60’s music. Madison Beer is an artist who has experienced the best – and very worst – of the internet. Her origin story reads like a G

## Create dataframe

In [20]:
import pandas as pd
# Build a table from the collected article records: one row per article.
news_df = pd.DataFrame(data=all_articles)
# Report how many rows (articles) were loaded.
print(news_df.shape[0])

60


## Remove null values (to avoid errors with Hopsworks)

In [21]:
# Count the missing values in each column (`isnull` is an alias of `isna`).
news_df.isnull().sum()

article_id          0
title               0
link                0
keywords           26
creator            28
video_url          60
description        15
content             0
pubDate             0
image_url           8
source_id           0
source_priority     0
country             0
category            0
language            0
dtype: int64

In [22]:
# Remove the sparsely populated columns before the remaining null rows are handled.
# `errors='ignore'` skips any listed name that is not present, so re-running this
# cell (or receiving a response without one of these fields) does not raise KeyError.
sparse_columns = ['keywords', 'creator', 'video_url', 'image_url']
news_df = news_df.drop(columns=sparse_columns, errors='ignore')

In [23]:
# Re-check the per-column missing-value counts on the current frame.
news_df.isnull().sum()

article_id          0
title               0
link                0
description        15
content             0
pubDate             0
source_id           0
source_priority     0
country             0
category            0
language            0
dtype: int64

In [24]:
# Discard every row that still contains at least one missing value,
# then report how many rows remain.
news_df = news_df.dropna(axis=0, how='any')
print(news_df.shape[0])

45


In [25]:
# Parse the publication timestamps, then keep only the calendar date part.
published_at = pd.to_datetime(news_df['pubDate'])
news_df['pubDate'] = published_at.dt.date

In [26]:
# Display the converted publication dates.
news_df.loc[:, 'pubDate']

0     2023-12-16
1     2023-12-16
2     2023-12-16
3     2023-12-16
4     2023-12-16
5     2023-12-16
6     2023-12-16
7     2023-12-16
8     2023-12-16
9     2023-12-16
10    2023-12-16
11    2023-12-16
12    2023-12-16
13    2023-12-16
14    2023-12-16
15    2023-12-16
16    2023-12-16
17    2023-12-16
18    2023-12-16
19    2023-12-16
26    2023-12-16
30    2023-12-16
31    2023-12-16
34    2023-12-16
35    2023-12-16
36    2023-12-16
37    2023-12-16
38    2023-12-16
39    2023-12-16
40    2023-12-16
41    2023-12-16
42    2023-12-16
43    2023-12-16
44    2023-12-16
45    2023-12-16
46    2023-12-16
47    2023-12-16
49    2023-12-16
51    2023-12-16
52    2023-12-16
54    2023-12-16
55    2023-12-16
56    2023-12-16
57    2023-12-16
59    2023-12-16
Name: pubDate, dtype: object