### Extract and transform data

In [1]:
from datetime import timedelta
import json
import os
import pandas as pd
from newsapi import NewsApiClient
from dotenv import load_dotenv
import requests
import ast
import boto3

# Let python-dotenv populate the process environment before the keys are read
load_dotenv()

# Credentials for the two news providers; each is None when the variable is unset
nd_api_key = os.environ.get("NEWSDATA_API_KEY")
na_api_key = os.environ.get("NEWSAPI_API_KEY")

def get_data_from_newsdata_api():
    """Fetch English-language news matching "pegasus" from newsdata.io.

    :return: The decoded JSON body when the response status is 200,
        otherwise None (the status code is printed).
    :raises requests.exceptions.RequestException: On a network failure or
        when the server does not answer within the timeout.
    """
    params = {
        "apiKey": nd_api_key,
        "q": "pegasus",
        "language": "en",
    }

    # Without a timeout, requests waits indefinitely on an unresponsive
    # server, which would hang the whole notebook run.
    response = requests.get(
        "https://newsdata.io/api/1/news",
        params=params,
        timeout=30,
    )
    if response.status_code != 200:
        print("Error fetching data. Status code:", response.status_code)
        return None
    return response.json()

def get_data_from_newsapi_api():
    """Query NewsApi for "bitcoin" articles published in the last 28 days.

    The search is restricted to the ``bbc-news`` and ``the-verge`` sources,
    the ``bbc.co.uk`` and ``techcrunch.com`` domains, and English-language
    articles, sorted by relevancy.

    :return: Whatever ``NewsApiClient.get_everything`` returns; the caller in
        this file reads its ``"articles"`` entry.
    """
    # Imported locally: the file-level import of this cell only brings in
    # ``timedelta``, so referring to ``datetime`` here raised NameError when
    # this cell was executed before the later cell that imports it.
    from datetime import date

    newsapi = NewsApiClient(api_key=na_api_key)

    # Start of the search window, formatted as YYYY-MM-DD.
    from_date = (date.today() - timedelta(days=28)).isoformat()

    return newsapi.get_everything(
        q='bitcoin',
        sources='bbc-news,the-verge',
        domains='bbc.co.uk,techcrunch.com',
        from_param=from_date,
        language='en',
        sort_by='relevancy',
    )

# One DataFrame per provider, built from the list of articles in each payload
na_df = pd.DataFrame(get_data_from_newsapi_api()["articles"])

# get_data_from_newsdata_api() returns None on a non-200 response; stop with a
# clear message instead of "'NoneType' object is not subscriptable".
newsdata_payload = get_data_from_newsdata_api()
if newsdata_payload is None:
    raise RuntimeError("newsdata.io request failed; cannot build nd_df")
nd_df = pd.DataFrame(newsdata_payload["results"])

# Reduce a newsApi source entry to its name
def extract_name(row):
    """Return the ``name`` field of a newsApi ``source`` entry.

    :param row: Either a dict with a ``name`` key, or a string holding such
        a mapping as JSON or as a Python dict literal.
    :return: The value stored under ``name``.
    :raises ValueError: If a string row cannot be parsed by any strategy.
    :raises KeyError: If the parsed mapping has no ``name`` key.
    """
    # Rows built straight from the API response are already dicts.
    if isinstance(row, dict):
        return row['name']

    try:
        data = json.loads(row)
    except ValueError:
        try:
            # A stringified Python dict: single quotes, None, and apostrophes
            # inside values are all handled safely by literal_eval.
            data = ast.literal_eval(row)
        except (ValueError, SyntaxError):
            # Last resort: the original quote-swapping approach.
            data = json.loads(row.replace("'", "\""))
    return data['name']

# Overwrite the source column with the name that extract_name yields per row
na_df["source"] = na_df["source"].apply(func=extract_name)

# Format creator rows for newsData dataframe
def format_creator(row):
    """Return the first creator of a newsData row, or "Unknown".

    :param row: A list/tuple of creator names, a string containing the
        Python literal of such a sequence (e.g. ``"['Alice', 'Bob']"``),
        or a missing value (None, NaN, ``'nan'``).
    :return: The first element of the sequence, or "Unknown" when the value
        is missing, unparsable, empty, or not a sequence.
    """
    if isinstance(row, str):
        try:
            # 'nan' and other free text are not valid literals and land in
            # the except branch.
            row = ast.literal_eval(row)
        except (ValueError, TypeError, SyntaxError):
            return "Unknown"

    # Real lists are used directly; empty sequences and non-sequence values
    # (None, NaN, numbers, ...) have no first creator.
    if isinstance(row, (list, tuple)) and row:
        return row[0]
    return "Unknown"

# Reduce every creator entry to the single value format_creator returns
nd_df["creator"] = nd_df["creator"].apply(format_creator)

# Give the newsData columns the same names as their newsApi counterparts
# so that the two frames can be merged
nd_df = nd_df.rename(columns=dict(
    pubDate="publishedAt",
    source_id="source",
    creator="author",
    link="url",
    image_url="urlToImage",
))

# Replace missing values in both frames with a placeholder
na_df = na_df.fillna(value="Unknown")
nd_df = nd_df.fillna(value="Unknown")



### Upload to Amazon S3

In [2]:
from datetime import datetime
import logging
from botocore.exceptions import ClientError

# Outer-join the two provider frames on the columns they share, then drop the
# six right-most columns of the result.
# NOTE(review): the -6 is positional and presumably targets the newsData-only
# columns; confirm it still matches the API's current column set.
final_data_frame = na_df.merge(
    nd_df,
    how='outer',
    on=[
        'source',
        'author',
        'title',
        'description',
        'url',
        'urlToImage',
        'publishedAt',
        'content',
    ],
).iloc[:, :-6]

# Persist the merged frame locally so it can be uploaded
final_data_frame.to_csv('final_dataframe.csv')

# Upload file to Amazon S3
def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object key. If not specified, a key built from
        today's date is used, in the form ``YYYY/MM/DD.csv``
    :return: True if file was uploaded, else False
    """
    # boto3's managed upload wraps client errors in S3UploadFailedError, which
    # is not a ClientError subclass, so it has to be caught explicitly.
    from boto3.exceptions import S3UploadFailedError

    # If S3 object_name was not specified, derive it from the current date
    if object_name is None:
        object_name = datetime.today().date().strftime('%Y/%m/%d') + '.csv'

    # Credentials are read from the environment at call time
    s3_client = boto3.client(
        's3',
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    )

    # Upload the file
    try:
        s3_client.upload_file(file_name, bucket, object_name)
    except (ClientError, S3UploadFailedError) as e:
        logging.error(e)
        return False
    return True

# Send the locally written CSV to the news-collectors bucket
upload_file(file_name='final_dataframe.csv', bucket='news-collectors')


True