In [1]:
# Initial imports
import os
import pandas as pd
from dotenv import load_dotenv
import nltk as nltk
# Download the 'vader_lexicon' NLTK resource before building the analyzer below.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Module-level analyzer instance, created once in this setup cell.
analyzer = SentimentIntensityAnalyzer()

%matplotlib inline

import time
import requests
import datetime
import dateutil
from dateutil.relativedelta import relativedelta

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\GuilleMGN\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Collection window: from LOOKBACK_YEARS before today up to today.
# NOTE: date.today() makes the window depend on the day the notebook is run.
LOOKBACK_YEARS = 22
end = datetime.date.today()
start = end - relativedelta(years=LOOKBACK_YEARS)

In [3]:
# One [year, month] pair of strings per month start between `start` and `end`,
# e.g. ['2000', '4']. The month is written without a leading zero.
# Built from Timestamp.year / Timestamp.month rather than strftime("%#m"):
# the '#' flag is a Windows-only extension and is not portable to other platforms.
months_in_range = [
    [str(month_start.year), str(month_start.month)]
    for month_start in pd.date_range(start, end, freq='MS')
]
months_in_range

[['2000', '4'],
 ['2000', '5'],
 ['2000', '6'],
 ['2000', '7'],
 ['2000', '8'],
 ['2000', '9'],
 ['2000', '10'],
 ['2000', '11'],
 ['2000', '12'],
 ['2001', '1'],
 ['2001', '2'],
 ['2001', '3'],
 ['2001', '4'],
 ['2001', '5'],
 ['2001', '6'],
 ['2001', '7'],
 ['2001', '8'],
 ['2001', '9'],
 ['2001', '10'],
 ['2001', '11'],
 ['2001', '12'],
 ['2002', '1'],
 ['2002', '2'],
 ['2002', '3'],
 ['2002', '4'],
 ['2002', '5'],
 ['2002', '6'],
 ['2002', '7'],
 ['2002', '8'],
 ['2002', '9'],
 ['2002', '10'],
 ['2002', '11'],
 ['2002', '12'],
 ['2003', '1'],
 ['2003', '2'],
 ['2003', '3'],
 ['2003', '4'],
 ['2003', '5'],
 ['2003', '6'],
 ['2003', '7'],
 ['2003', '8'],
 ['2003', '9'],
 ['2003', '10'],
 ['2003', '11'],
 ['2003', '12'],
 ['2004', '1'],
 ['2004', '2'],
 ['2004', '3'],
 ['2004', '4'],
 ['2004', '5'],
 ['2004', '6'],
 ['2004', '7'],
 ['2004', '8'],
 ['2004', '9'],
 ['2004', '10'],
 ['2004', '11'],
 ['2004', '12'],
 ['2005', '1'],
 ['2005', '2'],
 ['2005', '3'],
 ['2005', '4'],
 ['2005',

In [4]:
# Load variables from a local .env file, then look up the API key.
# api_key is None when NEWS_API_KEY is not set in the environment.
load_dotenv()
api_key = os.environ.get("NEWS_API_KEY")

In [6]:
def send_request(date):
    '''Sends a request to the NYT Archive API for given date.

    Parameters
    ----------
    date : sequence of str
        ``[year, month]`` as strings, e.g. ``['2021', '4']``.

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status, so a failed request is
        reported here instead of as a confusing KeyError while parsing.
    '''
    base_url = 'https://api.nytimes.com/svc/archive/v1/'
    # base_url already ends with '/', so no extra separator before the year.
    url = base_url + date[0] + '/' + date[1] + '.json'
    # Pass the key as a query parameter so it is URL-encoded; the timeout
    # keeps a stalled connection from hanging the notebook indefinitely.
    response = requests.get(url, params={'api-key': api_key}, timeout=60)
    response.raise_for_status()
    payload = response.json()
    # Pause between calls -- presumably to stay under the API rate limit.
    time.sleep(6)
    return payload


def is_valid(article, date):
    '''An article is only worth checking if it is in range, and has a headline.

    Parameters
    ----------
    article : dict
        One article record from the API response.
    date : datetime.date
        Publication date of the article.

    Returns
    -------
    bool
        True when ``date`` lies strictly between the module-level ``start``
        and ``end`` and ``article['headline']`` is a dict with a 'main' entry.
        An article with no 'headline' key at all is reported as invalid
        instead of raising KeyError.
    '''
    # Both bounds are exclusive, as in the original comparison.
    is_in_range = start < date < end
    headline = article.get('headline')
    has_headline = isinstance(headline, dict) and 'main' in headline
    return is_in_range and has_headline


def parse_response(response):
    '''Parses and returns response as pandas data frame.

    Parameters
    ----------
    response : dict
        Decoded JSON from the archive endpoint; the articles are read from
        ``response['response']['docs']``.

    Returns
    -------
    pandas.DataFrame
        One row per article accepted by ``is_valid``, with columns
        headline, date, doc_type, material_type, section, keywords.
        ``material_type`` and ``section`` are None when the article lacks
        the corresponding field; ``keywords`` is the list of keyword values
        whose name is 'subject'.

    Raises
    ------
    KeyError
        If the response does not have the ``response``/``docs`` structure,
        or an accepted article lacks 'pub_date' or 'document_type'.
    '''
    columns = ['headline', 'date', 'doc_type', 'material_type', 'section', 'keywords']
    records = []
    for article in response['response']['docs']:
        # Only keep articles that fall within our date range and have a headline.
        date = dateutil.parser.parse(article['pub_date']).date()
        if not is_valid(article, date):
            continue
        subject_keywords = [
            keyword['value']
            for keyword in (article.get('keywords') or [])
            if keyword['name'] == 'subject'
        ]
        records.append({
            'headline': article['headline']['main'],
            'date': date,
            'doc_type': article['document_type'],
            'material_type': article.get('type_of_material'),
            # The original tested for a 'section' key but read 'section_name',
            # so the column was never filled correctly; use one key for both.
            'section': article.get('section_name'),
            'keywords': subject_keywords,
        })
    # Passing columns= keeps the header stable even when no article qualified.
    return pd.DataFrame(records, columns=columns)


def get_data(dates, output_dir='headlines'):
    '''Sends and parses request/response to/from NYT Archive API for given dates.

    Parameters
    ----------
    dates : list of [year, month] string pairs
        Months to download, e.g. ``[['2021', '4'], ['2021', '5']]``.
    output_dir : str, optional
        Directory that receives one ``<year>-<month>.csv`` per month.
        Created if missing. Defaults to 'headlines'.

    Returns
    -------
    None
        Results are written to disk and progress is printed.
    '''
    # Nothing to download: report it instead of failing on dates[0].
    if len(dates) == 0:
        print('Number of articles collected: 0')
        return
    total = 0
    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    # exist_ok avoids the separate exists() check and tolerates re-runs.
    os.makedirs(output_dir, exist_ok=True)
    for date in dates:
        response = send_request(date)
        df = parse_response(response)
        total += len(df)
        csv_path = output_dir + '/' + date[0] + '-' + date[1] + '.csv'
        df.to_csv(csv_path, index=False)
        print('Saving ' + csv_path + '...')
    print('Number of articles collected: ' + str(total))

In [7]:
# Download every month listed in months_in_range. send_request sleeps 6 s
# after each call, so expect at least 6 s per month.
# NOTE(review): the saved output below covers 2021-4 to 2022-3, which does not
# match the 22-year window defined for `start` -- presumably this cell was last
# run with a different `start`; confirm before relying on the output.
get_data(months_in_range)

Date range: ['2021', '4'] to ['2022', '3']
Saving headlines/2021-4.csv...
Saving headlines/2021-5.csv...
Saving headlines/2021-6.csv...
Saving headlines/2021-7.csv...
Saving headlines/2021-8.csv...
Saving headlines/2021-9.csv...
Saving headlines/2021-10.csv...
Saving headlines/2021-11.csv...
Saving headlines/2021-12.csv...
Saving headlines/2022-1.csv...
Saving headlines/2022-2.csv...
Saving headlines/2022-3.csv...
Number of articles collected: 47861


In [9]:
import glob

# Work inside the headlines directory, as before, but only change directory
# when we are not already there so that re-running this cell does not fail.
if os.path.basename(os.getcwd()) != 'headlines':
    os.chdir("headlines")

extension = 'csv'
combined_name = "combined_csv.csv"
# Skip a previously written combined file: otherwise a re-run would read it
# back in and duplicate every row. Sorting (lexicographic by file name) makes
# the row order deterministic instead of depending on directory listing order.
all_filenames = sorted(
    name for name in glob.glob('*.{}'.format(extension)) if name != combined_name
)
#combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
#export to csv
combined_csv.to_csv(combined_name, index=False, encoding='utf-8-sig')