# Imports

In [26]:
import requests
import pandas as pd
from pprint import pprint
import json
from dotenv import load_dotenv
import os
import time
import datetime
import pymongo
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# Use Plotly's dark template for every figure created in this notebook.
pio.templates.default = "plotly_dark"
# Load variables from a .env file into the process environment.
load_dotenv()
# New York Times API key, read from the APIKEY environment variable.
KEY=os.getenv("APIKEY")
# MongoDB credentials used for the database connection.
# NOTE(review): USERNAME is often already set by the operating system, and
# load_dotenv() presumably does not override existing variables by default --
# confirm this resolves to the database user and not the OS login name.
USERNAME=os.getenv("USERNAME")
USERPWD=os.getenv("USERPWD")
# Absolute project folder on the author's machine; not referenced by the
# code visible in this notebook.
FOLDER_PATH = '/home/guillaume/Python_Projects/DST_DE_project'


# Articles Search

In [27]:
def get_articles_years(filename, year='full', clean=True):
    """Request Covid-19 articles from the New York Times API month by month.

    The Article Search endpoint is queried once per calendar month, page by
    page (pages 0 to 100 at most). Paging for a month stops at the first
    empty page or the first failed request, and the search then moves on to
    the next month. The collected articles are written to ``src/<filename>``
    as JSON and returned.

    Args:
        filename (str): Name of the json file to save the articles to.
        year (int | str, optional): Year of the articles search; 2020, 2021
            or 2022, given either as a number or as a numeric string. Any
            other value gets the full data from Jan 2020 to Dec 2022.
            Defaults to 'full'.
        clean (bool, optional): When True, the 'multimedia' key is removed
            from every article before saving. Defaults to True.

    Returns:
        list: a list object containing dictionaries of articles data.
    """
    import calendar  # local import: only needed for each month's last day

    available_years = (2020, 2021, 2022)

    # Accept both 2020 and '2020'; anything that is not one of the available
    # years (including the default 'full') selects the whole range.
    try:
        requested_year = int(year)
    except (TypeError, ValueError, OverflowError):
        requested_year = None

    if requested_year in available_years:
        years = [requested_year]
    else:
        years = list(available_years)

    # One (begin_date, end_date) pair per month, formatted as YYYYMMDD.
    periods = [
        (f"{y}{m:02d}01", f"{y}{m:02d}{calendar.monthrange(y, m)[1]:02d}")
        for y in years
        for m in range(1, 13)
    ]

    results_list = []
    request_headers = {"Accept": "application/json"}

    for begin, end in periods:
        for page in range(101):
            url = (
                "https://api.nytimes.com/svc/search/v2/articlesearch.json"
                f"?begin_date={begin}&end_date={end}"
                "&fq=headline%3A(%22covid%22%20%22coronavirus%22)"
                f"&page={page}&sort=oldest&api-key={KEY}"
            )

            try:
                response = requests.get(url, headers=request_headers, timeout=30)
                docs = response.json()['response']['docs']
            except (requests.RequestException, ValueError, KeyError, TypeError) as err:
                # Only the exception type is printed: request errors can embed
                # the full URL, which contains the API key.
                print(f"Stopped fetching {begin}-{end} at page {page}: {type(err).__name__}")
                docs = None

            # Pause after every request, successful or not (presumably to stay
            # under the API rate limit).
            time.sleep(6.1)

            if not docs:
                # Failed request or empty page: nothing more for this month.
                break

            results_list.extend(docs)

    if clean:
        for article in results_list:
            article.pop('multimedia', None)

    with open(f"src/{filename}", 'w') as outfile:
        json.dump(results_list, outfile, indent=4)

    return results_list

In [28]:
def get_articles_update(filename, begin_date, end_date, clean=True):
    """Request Covid-19 articles from the New York Times API for one date range.

    The Article Search endpoint is queried page by page (pages 0 to 100 at
    most). Paging stops at the first empty page or the first failed request.
    The collected articles are written to ``src/<filename>`` as JSON and
    returned.

    Args:
        filename (str): Name of the json file to save the articles to.
        begin_date (str): Begin date of the articles search, in the format YYYYMMDD.
        end_date (str): End date of the articles search, in the format YYYYMMDD.
        clean (bool, optional): When True, the 'multimedia' key is removed
            from every article before saving. Defaults to True.

    Returns:
        list: a list object containing dictionaries of articles data.
    """

    results_list = []
    request_headers = {"Accept": "application/json"}

    for page in range(101):
        url = (
            "https://api.nytimes.com/svc/search/v2/articlesearch.json"
            f"?begin_date={begin_date}&end_date={end_date}"
            "&fq=headline%3A(%22covid%22%20%22coronavirus%22)"
            f"&page={page}&sort=oldest&api-key={KEY}"
        )

        try:
            response = requests.get(url, headers=request_headers, timeout=30)
            docs = response.json()['response']['docs']
        except (requests.RequestException, ValueError, KeyError, TypeError) as err:
            # Only the exception type is printed: request errors can embed
            # the full URL, which contains the API key.
            print(f"Stopped fetching {begin_date}-{end_date} at page {page}: {type(err).__name__}")
            break

        if not docs:
            # An empty page means the previous page was the last one.
            break

        results_list.extend(docs)

        # Pause between requests (presumably to stay under the API rate limit).
        time.sleep(6.1)

    if clean:
        for article in results_list:
            article.pop('multimedia', None)

    with open(f"src/{filename}", 'w') as outfile:
        json.dump(results_list, outfile, indent=4)

    return results_list

In [29]:
def clean_articles(lst):
    """Return copies of the article dictionaries without unwanted keys.

    The input list and its dictionaries are left untouched: each article is
    rebuilt as a new dictionary without the 'multimedia' key. Values are not
    deep-copied, so nested objects are shared with the input.

    Args:
        lst (list[dict]): The input list of dictionaries to clean.

    Returns:
        list[dict]: A new list of dictionaries with the unwanted keys removed.
    """

    unwanted_keys = {'multimedia'}

    return [
        {key: value for key, value in article.items() if key not in unwanted_keys}
        for article in lst
    ]

# MongoDB

In [30]:
# Connect to the MongoDB cluster. The credentials are passed as keyword
# arguments rather than interpolated into the URI, so reserved characters in
# the user name or password (such as '@', ':' or '/') cannot corrupt the
# connection string.
client = MongoClient(
    "mongodb+srv://nyt-de.ganwi.mongodb.net/?retryWrites=true&w=majority",
    username=USERNAME,
    password=USERPWD,
    server_api=ServerApi('1'),
    serverSelectionTimeoutMS=5000,
)
db = client.test

# server_info() forces a round trip to the server, so a bad host or bad
# credentials surface here instead of on the first query.
try:
    pprint(client.server_info())
except Exception:
    pprint("Unable to connect to the server.")


{'$clusterTime': {'clusterTime': Timestamp(1669211556, 1),
                  'signature': {'hash': b'\xf7)>\xb9\xf6\x0f\xe2\xa7'
                                        b'\xee\xd3\x86=\xbe9E\xd5'
                                        b'\xee\x84\x1e\xcc',
                                'keyId': 7156162285394722821}},
 'allocator': 'tcmalloc',
 'bits': 64,
 'debug': False,
 'gitVersion': '0ca11aca38c75d3c8fb5bac5bd103b950718a896',
 'javascriptEngine': 'mozjs',
 'maxBsonObjectSize': 16777216,
 'modules': ['enterprise'],
 'ok': 1.0,
 'operationTime': Timestamp(1669211556, 1),
 'storageEngines': ['devnull', 'inMemory', 'queryable_wt', 'wiredTiger'],
 'sysInfo': 'deprecated',
 'version': '6.1.0',
 'versionArray': [6, 1, 0, 0]}


In [31]:
# Select the database that holds the New York Times articles.
db_nyt = client.nyt

# List the collections of the database selected above (the original listed
# the collections of `db`, which is bound to a different database).
pprint(db_nyt.list_collection_names())

col_nyt = db_nyt['articles']

['articles']


In [32]:
# Rebind the short names used by the analysis cells: the `nyt` database and
# its `articles` collection.
db = client['nyt']
col = db.articles

In [33]:
# Fetch only the publication date and news desk of every stored article.
results = list(col.find(projection={'pub_date': 1, 'news_desk': 1, '_id': 0}))

df_full = pd.DataFrame(results)

# Drop the rows carrying this exact timestamp.
# NOTE(review): the reason for excluding it is not visible here -- confirm.
# The explicit .copy() makes `df` an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning.
df = df_full[df_full['pub_date'] != '2021-01-27T17:00:00+0000'].copy()

# Keep only the calendar date so that articles can be grouped per day.
df['pub_date'] = pd.to_datetime(df['pub_date']).dt.date
# Helper column: summing it after a groupby yields the number of articles.
df['count'] = 1

In [74]:
# Number of articles per day and per news desk. Only the `count` column is
# summed: summing every column relies on non-numeric columns being dropped
# silently, which newer pandas versions no longer do.
df_grouped = df.groupby('pub_date')['count'].sum().reset_index()
df_category = df.groupby('news_desk')['count'].sum().reset_index()
df_category

Unnamed: 0,news_desk,count
0,,485
1,Arts&Leisure,8
2,AtHome,3
3,BookReview,12
4,Books,6
...,...,...
61,Washington,428
62,Weekend,7
63,Well,222
64,World,14


In [35]:
# Line chart of the daily article counts, sized for the notebook output.
fig = px.line(
    data_frame=df_grouped,
    x='pub_date',
    y='count',
    labels={'pub_date': 'Date', 'count': 'Number of Articles'},
    color_discrete_sequence=['lightskyblue'],
    title='Number of Covid-19 Articles Published by the New York Times Per Day',
).update_layout(width=800, height=500)
fig.show()

In [75]:
# Keep the named news desks with more than 100 articles, largest first.
df_category = (
    df_category
    .loc[lambda frame: (frame['news_desk'] != '') & (frame['count'] > 100)]
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)
df_category

Unnamed: 0,news_desk,count
0,Foreign,1315
1,NYTNow,834
2,Science,789
3,OpEd,586
4,National,532
5,Business,436
6,Washington,428
7,Metro,272
8,Well,222
9,Sports,204


In [69]:
# Horizontal bar chart of article counts per news desk, colored by count.
fig = px.bar(
    data_frame=df_category,
    x='count',
    y='news_desk',
    orientation='h',
    color='count',
    color_continuous_scale='Redor',
    labels={'news_desk': 'News Desk', 'count': 'Number of Articles'},
    title='Number of Covid-19 Articles Published by the New York Times Per Category',
).update_layout(width=785, height=550)
fig.show()