# Imports

In [27]:
import requests
import pandas as pd
from pprint import pprint
import json
from dotenv import load_dotenv
import os
import time
import datetime
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# Load variables from a local .env file into the process environment.
load_dotenv()
# API key interpolated into the NYT Article Search request URLs.
KEY=os.getenv("APIKEY")
# Credentials interpolated into the MongoDB connection string.
# NOTE(review): USERNAME is commonly pre-set by the operating system or shell,
# and load_dotenv() does not override existing variables by default, so the
# value from .env may be ignored -- confirm, or use a more specific name.
USERNAME=os.getenv("USERNAME")
USERPWD=os.getenv("USERPWD")


# Articles Search

In [4]:
def get_articles_years(filename, year='full', clean=True):
    """Fetch Covid-19 articles from the New York Times Article Search API, month by month.

    The search is split into one request window per calendar month. Each window
    is paged through until the API returns an empty page, the page limit is
    reached, or a request fails. The collected articles are written to
    ``src/<filename>`` as JSON and returned.

    Args:
        filename (str): Name of the json file (inside ``src/``) to save the articles to.
        year (int or str, optional): Year to fetch: 2020, 2021 or 2022, given
            either as a number or as a string. Any other value fetches the full
            range starting from Jan 2020. Defaults to 'full'.
        clean (bool, optional): When True, the 'multimedia' key is removed from
            every article. Defaults to True.

    Returns:
        list: a list object containing dictionaries of articles data.
    """

    available_years = (2020, 2021, 2022)

    # Accept both 2020 and '2020'; anything that is not a known year means "full range".
    try:
        requested_year = int(year)
    except (TypeError, ValueError, OverflowError):
        requested_year = None
    years = (requested_year,) if requested_year in available_years else available_years

    # Build one (begin, end) pair per calendar month instead of hand-typed tables;
    # the last day of a month is the day before the first day of the next month.
    one_day = datetime.timedelta(days=1)
    date_ranges = []
    for current_year in years:
        for month in range(1, 13):
            first_day = datetime.date(current_year, month, 1)
            if month == 12:
                next_month_start = datetime.date(current_year + 1, 1, 1)
            else:
                next_month_start = datetime.date(current_year, month + 1, 1)
            last_day = next_month_start - one_day
            date_ranges.append((first_day.strftime('%Y%m%d'), last_day.strftime('%Y%m%d')))

    results_list = []
    request_headers = {"Accept": "application/json"}

    for begin, end in date_ranges:
        for page in range(101):
            url = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?begin_date={begin}&end_date={end}&fq=headline%3A(%22covid%22%20%22coronavirus%22)&page={page}&sort=oldest&api-key={KEY}"

            try:
                response = requests.get(url, headers=request_headers, timeout=30)
                docs = response.json()['response']['docs']
            except (requests.RequestException, ValueError, KeyError, TypeError) as err:
                # Network failure, non-JSON body, or a payload without
                # response/docs (e.g. an API fault): give up on this month.
                # Only the exception type is printed because the message may
                # contain the request URL, which includes the API key.
                print(f"Stopped fetching {begin}-{end} at page {page}: {type(err).__name__}")
                break

            results_list.extend(docs)

            # Pause after every completed request to space out API calls.
            time.sleep(6.1)

            if not docs:
                # An empty page means this month has no further results.
                break

    if clean:
        for article in results_list:
            article.pop('multimedia', None)

    with open(f"src/{filename}", 'w') as outfile:
        json.dump(results_list, outfile, indent=4)

    return results_list

In [5]:
def get_articles_update(filename, begin_date, end_date, clean=True):
    """Fetch Covid-19 articles from the New York Times Article Search API for one date window.

    The window is paged through until the API returns an empty page, the page
    limit is reached, or a request fails. The collected articles are written to
    ``src/<filename>`` as JSON and returned.

    Args:
        filename (str): Name of the json file (inside ``src/``) to save the articles to.
        begin_date (str): Begin date of the articles search, in the format YYYYMMDD.
        end_date (str): End date of the articles search, in the format YYYYMMDD.
        clean (bool, optional): When True, the 'multimedia' key is removed from
            every article. Defaults to True.

    Returns:
        list: a list object containing dictionaries of articles data.
    """

    results_list = []
    request_headers = {"Accept": "application/json"}

    for page in range(101):
        url = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?begin_date={begin_date}&end_date={end_date}&fq=headline%3A(%22covid%22%20%22coronavirus%22)&page={page}&sort=oldest&api-key={KEY}"

        try:
            response = requests.get(url, headers=request_headers, timeout=30)
            docs = response.json()['response']['docs']
        except (requests.RequestException, ValueError, KeyError, TypeError) as err:
            # Network failure, non-JSON body, or a payload without
            # response/docs (e.g. an API fault): stop paging.
            # Only the exception type is printed because the message may
            # contain the request URL, which includes the API key.
            print(f"Stopped fetching {begin_date}-{end_date} at page {page}: {type(err).__name__}")
            break

        if not docs:
            # An empty page means there are no further results.
            break

        results_list.extend(docs)

        # Pause between requests to space out API calls.
        time.sleep(6.1)

    if clean:
        for article in results_list:
            article.pop('multimedia', None)

    with open(f"src/{filename}", 'w') as outfile:
        json.dump(results_list, outfile, indent=4)

    return results_list

In [6]:
def clean_articles(lst):
    """Return copies of the article dictionaries with unwanted keys removed.

    The input list and its dictionaries are left untouched: every article is
    copied (shallowly) before the 'multimedia' key is dropped.

    Args:
        lst (list[dict]): The input list of article dictionaries to clean.

    Returns:
        list[dict]: A new list of new dictionaries without the 'multimedia' key.
    """

    lst_clean = []
    for article in lst:
        # dict(article) makes a per-article copy so the caller's data is not mutated.
        trimmed = dict(article)
        trimmed.pop('multimedia', None)
        lst_clean.append(trimmed)

    return lst_clean

In [22]:
# Bounds (YYYYMMDD) of the article search window used for the update.
begin_date, end_date = '20221017', '20221028'

In [11]:
# Fetch the articles for the update window and save them to src/articles_update.json.
articles_update = get_articles_update(
    'articles_update.json',
    begin_date,
    end_date,
    clean=True,
)

In [13]:
# Number of articles returned for the update window.
len(articles_update)

11

In [12]:
# Load the previously saved article archive from disk.
with open('src/articles_full_clean.json', "r") as infile:
    articles_full_clean = json.loads(infile.read())

# Number of articles in the archive before the update is appended.
len(articles_full_clean)

6997

In [14]:
# Append the freshly fetched articles to the in-memory archive (in place).
articles_full_clean += articles_update

# Number of articles in the archive after the update.
len(articles_full_clean)

7008

In [18]:

# Write the extended archive back to disk, overwriting the previous file.
with open('src/articles_full_clean.json', "w") as outfile:
    outfile.write(json.dumps(articles_full_clean, indent=4))

# Updating the query date bounds

In [40]:
# Read the stored begin date (YYYYMMDD). strip() tolerates a trailing newline
# or stray whitespace, which would otherwise make strptime() raise.
with open('src/begin_date.txt', 'r') as infile:
    begin_date = infile.read().strip()

# Move the window forward: the new begin date is 8 days after the stored one,
# and the new end date is 7 days after the new begin date.
begin_dt = datetime.datetime.strptime(begin_date, '%Y%m%d')
new_begin_dt = begin_dt + datetime.timedelta(days=8)
new_end_dt = new_begin_dt + datetime.timedelta(days=7)
new_begin_date = new_begin_dt.strftime('%Y%m%d')
new_end_date = new_end_dt.strftime('%Y%m%d')

# Persist the new begin date so the next run starts from it.
with open('src/begin_date.txt', 'w') as outfile:
    outfile.write(new_begin_date)

new_end_date

'20221117'

In [31]:
# End of the window: one week after the stored begin date.
end_dt = begin_dt + datetime.timedelta(weeks=1)
end_dt

datetime.datetime(2022, 10, 24, 0, 0)

In [33]:
# Render the end of the window as a YYYYMMDD string.
end_date = format(end_dt, '%Y%m%d')
end_date

'20221024'

In [None]:
def update_dates(path='src/begin_date.txt', step_days=8, window_days=7):
    """Advance the stored begin date of the articles search and return the new bounds.

    The begin date is read from ``path`` (format YYYYMMDD), moved forward by
    ``step_days``, and written back to the same file. The matching end date is
    ``window_days`` after the new begin date.

    Args:
        path (str, optional): File holding the current begin date.
            Defaults to 'src/begin_date.txt'.
        step_days (int, optional): Days to advance the begin date by. Defaults to 8.
        window_days (int, optional): Days between the new begin date and the
            new end date. Defaults to 7.

    Returns:
        tuple[str, str]: The new begin date and new end date, both as YYYYMMDD.

    Raises:
        ValueError: If the file content is not a date in the format YYYYMMDD.
    """

    with open(path, 'r') as infile:
        # strip() tolerates a trailing newline, which strptime() would reject.
        begin_date = infile.read().strip()

    begin_dt = datetime.datetime.strptime(begin_date, '%Y%m%d')
    new_begin_dt = begin_dt + datetime.timedelta(days=step_days)
    new_end_dt = new_begin_dt + datetime.timedelta(days=window_days)
    new_begin_date = new_begin_dt.strftime('%Y%m%d')
    new_end_date = new_end_dt.strftime('%Y%m%d')

    with open(path, 'w') as outfile:
        outfile.write(new_begin_date)

    return new_begin_date, new_end_date

# MongoDB

In [15]:
# Credentials must be percent-encoded before being placed in a MongoDB URI;
# otherwise reserved characters such as '@', ':' or '/' in the password break it.
from urllib.parse import quote_plus

client = MongoClient(
    f"mongodb+srv://{quote_plus(str(USERNAME))}:{quote_plus(str(USERPWD))}@nyt-de.ganwi.mongodb.net/?retryWrites=true&w=majority",
    server_api=ServerApi('1'),
    serverSelectionTimeoutMS=5000,
)
db = client.test

# server_info() forces a round trip to the server, so it doubles as a connectivity check.
try:
    pprint(client.server_info())
except Exception:
    pprint("Unable to connect to the server.")


{'$clusterTime': {'clusterTime': Timestamp(1666962217, 1),
                  'signature': {'hash': b'+\xc7f\xbb~\x1d\xf7^\xe5\xdc\x9f\xce'
                                        b'0\xd6&b\x87FI\x0b',
                                'keyId': 7156162285394722821}},
 'allocator': 'tcmalloc',
 'bits': 64,
 'debug': False,
 'gitVersion': '0ca11aca38c75d3c8fb5bac5bd103b950718a896',
 'javascriptEngine': 'mozjs',
 'maxBsonObjectSize': 16777216,
 'modules': ['enterprise'],
 'ok': 1.0,
 'operationTime': Timestamp(1666962217, 1),
 'storageEngines': ['devnull', 'inMemory', 'queryable_wt', 'wiredTiger'],
 'sysInfo': 'deprecated',
 'version': '6.1.0',
 'versionArray': [6, 1, 0, 0]}


In [16]:
# Handle to the 'nyt' database.
db_nyt = client.nyt

# NOTE(review): this lists the collections of `db` (client.test), not of
# `db_nyt` -- confirm which database was meant to be inspected here.
pprint(db.list_collection_names())

# Handle to the 'articles' collection of the 'nyt' database.
col_nyt = db_nyt['articles']

['articles']


In [17]:
# insert_many() rejects an empty document list, so skip the call (results is
# None) when the update window returned no articles.
results = col_nyt.insert_many(articles_update) if articles_update else None

# Covid Cases Data

In [None]:
# Covid US cumulative data: national, per-state and per-county tables,
# downloaded in that order from the NYT covid-19-data repository.
df_us, df_states, df_counties = map(pd.read_csv, (
    'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv',
    'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv',
    'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv',
))

df_us

Unnamed: 0,date,cases,deaths
0,2020-01-21,1,0
1,2020-01-22,1,0
2,2020-01-23,1,0
3,2020-01-24,2,0
4,2020-01-25,3,0
...,...,...,...
992,2022-10-09,96447636,1058245
993,2022-10-10,96471883,1058408
994,2022-10-11,96522454,1059005
995,2022-10-12,96596469,1059792


In [None]:
# Display the per-state dataframe.
df_states

Unnamed: 0,date,state,fips,cases,deaths
0,2020-01-21,Washington,53,1,0
1,2020-01-22,Washington,53,1,0
2,2020-01-23,Washington,53,1,0
3,2020-01-24,Illinois,17,1,0
4,2020-01-24,Washington,53,1,0
...,...,...,...,...,...
52921,2022-10-13,Virginia,51,2101702,22012
52922,2022-10-13,Washington,53,1826131,14468
52923,2022-10-13,West Virginia,54,603859,7457
52924,2022-10-13,Wisconsin,55,1879656,15344


In [None]:
# Display the per-county dataframe.
df_counties

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0.0
1,2020-01-22,Snohomish,Washington,53061.0,1,0.0
2,2020-01-23,Snohomish,Washington,53061.0,1,0.0
3,2020-01-24,Cook,Illinois,17031.0,1,0.0
4,2020-01-24,Snohomish,Washington,53061.0,1,0.0
...,...,...,...,...,...,...
2502827,2022-05-13,Sweetwater,Wyoming,56037.0,11088,126.0
2502828,2022-05-13,Teton,Wyoming,56039.0,10074,16.0
2502829,2022-05-13,Uinta,Wyoming,56041.0,5643,39.0
2502830,2022-05-13,Washakie,Wyoming,56043.0,2358,44.0


In [None]:
# Covid US live data: national, per-state and per-county tables,
# downloaded in that order from the NYT covid-19-data repository.
df_us, df_states, df_counties = map(pd.read_csv, (
    'https://raw.githubusercontent.com/nytimes/covid-19-data/master/live/us.csv',
    'https://raw.githubusercontent.com/nytimes/covid-19-data/master/live/us-states.csv',
    'https://raw.githubusercontent.com/nytimes/covid-19-data/master/live/us-counties.csv',
))

df_us

Unnamed: 0,date,cases,deaths
0,2022-10-14,96671641,1060430
