In [70]:
import json
import os
import pandas as pd
from newsapi import NewsApiClient
from dotenv import load_dotenv
import requests
import ast

# Load environment configuration first (load_dotenv presumably reads a local
# .env file -- confirm against python-dotenv), then read the two API keys.
# A key is None when its variable is not set.
load_dotenv()

nd_api_key = os.environ.get("NEWSDATA_API_KEY")
na_api_key = os.environ.get("NEWSAPI_API_KEY")

def get_data_from_newsdata_api():
    """Fetch English-language articles matching "pegasus" from newsdata.io.

    Returns:
        The decoded JSON body when the server answers with HTTP 200.
        ``None`` when the request itself fails (connection error, timeout, ...)
        or the server answers with any other status; a short diagnostic is
        printed in both failure cases.
    """
    # NOTE(review): newsdata.io's documentation spells this parameter "apikey";
    # confirm that the mixed-case "apiKey" is accepted by the service.
    params = {
        "apiKey": nd_api_key,
        "q": "pegasus",
        "language": "en",
    }

    try:
        # requests waits forever by default; bound the wait so a stalled
        # connection cannot hang the notebook kernel.
        response = requests.get(
            "https://newsdata.io/api/1/news", params=params, timeout=30
        )
    except requests.RequestException as exc:
        # Report network-level failures the same way as bad status codes.
        print("Error fetching data. Request failed:", exc)
        return None

    if response.status_code == 200:
        return response.json()

    print("Error fetching data. Status code:", response.status_code)
    return None

def get_data_from_newsapi_api():
    """Run the fixed "bitcoin" search through ``NewsApiClient.get_everything``.

    Returns:
        Whatever ``get_everything`` returns for the query below, unmodified.
    """
    # Fixed search: sources, domains, start date, language and ordering
    # are all hard-coded here.
    search_kwargs = {
        "q": 'bitcoin',
        "sources": 'bbc-news,the-verge',
        "domains": 'bbc.co.uk,techcrunch.com',
        "from_param": '2023-07-14',
        "language": 'en',
        "sort_by": 'relevancy',
    }

    client = NewsApiClient(api_key=na_api_key)
    return client.get_everything(**search_kwargs)

# The live API calls are kept for reference but disabled; the frames are
# loaded from CSV files in the working directory instead.
#na_df = pd.DataFrame(get_data_from_newsapi_api()["articles"])
#nd_df = pd.DataFrame(get_data_from_newsdata_api()["results"])

# newsApi articles
na_df = pd.read_csv(filepath_or_buffer='newsapi.csv')

# newsData results
nd_df = pd.read_csv(filepath_or_buffer='newsdata.csv')

# Parse the newsApi source value and keep only the source name
def extract_name(row):
    """Return the ``name`` field of a newsApi ``source`` value.

    Args:
        row: Either a dict, or the string form of one. Strings may be a
            Python literal (e.g. ``"{'id': None, 'name': 'BBC News'}"``)
            or a JSON object.

    Returns:
        The value stored under the ``'name'`` key.

    Raises:
        json.JSONDecodeError: If a string is neither a valid Python literal
            nor parseable by the quote-swapping JSON fallback.
        KeyError: If the parsed mapping has no ``'name'`` key.
    """
    if isinstance(row, dict):
        # Already parsed (e.g. a frame built directly from an API response).
        return row['name']

    try:
        # literal_eval copes with ``None`` and with apostrophes inside quoted
        # values; blindly swapping ' for " and parsing as JSON does not.
        data = ast.literal_eval(row)
    except (ValueError, SyntaxError):
        # Not a Python literal (e.g. real JSON containing null/true/false):
        # keep the original quote-swapping JSON behaviour as a fallback.
        correct_json = row.replace("'", "\"")
        data = json.loads(correct_json)
    return data['name']

# Replace each newsApi source value with just the name extract_name returns for it
na_df["source"] = na_df["source"].map(extract_name)

# Format creator rows for newsData dataframe
def format_creator(row):
    """Return the first creator listed in a newsData ``creator`` value.

    Args:
        row: A list/tuple of creators, the string form of one
            (e.g. ``"['Jane Doe']"``), or a missing value (``None``/NaN).

    Returns:
        The first element of the creator list, or ``"Unknown"`` when the
        value is missing, cannot be parsed as a Python literal, is not a
        list/tuple, or is an empty list.
    """
    if isinstance(row, (list, tuple)):
        # Already parsed (e.g. a frame built directly from an API response).
        creators = row
    elif isinstance(row, str):
        try:
            creators = ast.literal_eval(row)
        except (ValueError, SyntaxError):
            # Plain text such as "nan" or an unquoted name is not a literal.
            return "Unknown"
    else:
        # None, float NaN and any other non-string value count as missing.
        return "Unknown"

    # Guard against "[]" (IndexError) and non-sequence literals such as
    # dicts, sets or numbers, which cannot be indexed with [0] meaningfully.
    if isinstance(creators, (list, tuple)) and creators:
        return creators[0]
    return "Unknown"

# Collapse each newsData creator value to a single value via format_creator
nd_df = nd_df.assign(creator=nd_df["creator"].apply(format_creator))

# Give the newsData columns the same names as their newsApi counterparts
# so both frames can be merged on them
nd_df = nd_df.rename(
    columns={
        "pubDate": "publishedAt",
        "source_id": "source",
        "creator": "author",
        "link": "url",
        "image_url": "urlToImage",
    }
)

# Replace every remaining missing value in both frames with a placeholder
na_df = na_df.fillna("Unknown")
nd_df = nd_df.fillna("Unknown")




In [71]:
# Combine both sources: an outer merge on every shared column keeps rows
# from both frames.
final_data_frame = pd.merge(
    na_df,
    nd_df,
    on=['source', 'author', 'title', 'description', 'url', 'urlToImage', 'publishedAt', 'content'],
    how='outer',
)

# merge() lists the left frame's columns first, followed by the right frame's
# non-key columns. Slicing to na_df's width therefore drops the newsData-only
# columns without hard-coding how many of them there are (previously a fixed
# "last 6", which silently breaks if newsdata.csv gains or loses a column).
final_data_frame = final_data_frame.iloc[:, :len(na_df.columns)]

final_data_frame.head()







Unnamed: 0,Unnamed: 0_x,source,author,title,description,url,urlToImage,publishedAt,content
0,0.0,The Verge,Emma Roth,PayPal launches PYUSD stablecoin backed by the...,PayPal has launched a stablecoin called PayPal...,https://www.theverge.com/2023/8/7/23822752/pay...,https://cdn.vox-cdn.com/thumbor/AzUxs8UmwIY2lO...,2023-08-07T14:07:51Z,PayPal launches PYUSD stablecoin backed by the...
1,1.0,BBC News,https://www.facebook.com/bbcnews,Razzlekhan and husband guilty of $4.5bn Bitcoi...,Heather Morgan and husband Ilya Lichtenstein p...,https://www.bbc.co.uk/news/technology-66390639,https://ichef.bbci.co.uk/news/1024/branded_new...,2023-08-03T18:25:11Z,A husband and wife cyber-crime team have plead...
2,2.0,BBC News,https://www.facebook.com/bbcnews,Scottish university allegedly targeted in rans...,Data supposedly belonging to the University of...,https://www.bbc.co.uk/news/uk-scotland-glasgow...,https://ichef.bbci.co.uk/news/1024/branded_new...,2023-07-27T21:48:39Z,Data supposedly belonging to the University of...
3,3.0,TechCrunch,Jacquelyn Melinek,Monthly NFT sales fell for fifth consecutive m...,"In July, NFT sales totaled $495.6 million, dow...",https://techcrunch.com/2023/08/03/monthly-nft-...,https://s.yimg.com/ny/api/res/1.2/haVeaRLpzcQz...,2023-08-03T19:00:40Z,Welcome back to Chain Reaction.\r\nTo get a ro...
4,4.0,BBC News,Unknown,Oceans hit hottest recorded temperature,The average global sea surface temperature rea...,https://www.bbc.co.uk/programmes/p0g4wcky,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,2023-08-04T14:54:00Z,The average global sea surface temperature rea...
