In [1]:
import os
import time

import numpy as np
import pandas as pd
import regex as re
import requests


In [14]:
# Read the NYT API key from the NYT_API_KEY environment variable rather than
# pasting it into the notebook; falls back to an empty string when it is unset.
apikey = os.environ.get('NYT_API_KEY', '')

In [15]:
# NOTE(review): this removes `apikey` from the kernel namespace, but later cells
# in this notebook still reference `apikey`; run top-to-bottom they would raise
# NameError. Presumably this was executed only to scrub the key before sharing
# - confirm, and consider dropping the cell.
del apikey

Article Search documentation 

https://developer.nytimes.com/docs/articlesearch-product/1/overview

Article Search example, parameters, and testing  

https://developer.nytimes.com/docs/articlesearch-product/1/routes/articlesearch.json/get

### Testing API

In [26]:
# Smoke test: request the first page of results and make sure the call succeeds.
url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json'

res = requests.get(url, params={
    # date window, ordering and page
    'begin_date': '20120101', 'end_date': '20200101',
    'sort': 'newest', 'page': '0',
    # query terms and subject filter
    'type': 'article', 'q': 'flood', 'fq': 'subject:("Floods")',
    'api-key': apikey,
})
assert res.status_code == 200

In [27]:
res.json()['response']['meta']

{'hits': 1244, 'offset': 0, 'time': 298}

In [159]:
res.json()['response']['docs'][0].keys()

dict_keys(['abstract', 'web_url', 'snippet', 'lead_paragraph', 'print_section', 'print_page', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'subsection_name', 'byline', 'type_of_material', '_id', 'word_count', 'uri'])

In [137]:
res.json()['response']['docs'][0]

{'abstract': 'A series of winter storms unleashed heavy rain and strong winds across Portugal, Spain, France and Britain.',
 'web_url': 'https://www.nytimes.com/2019/12/22/world/europe/storm-elsa-fabien-floods.html',
 'snippet': 'A series of winter storms unleashed heavy rain and strong winds across Portugal, Spain, France and Britain.',
 'lead_paragraph': 'VIGO, Spain — A one-two punch of winter storms has socked Europe with strong winds and rain over several days, leaving at least nine people dead. ',
 'print_section': 'A',
 'print_page': '9',
 'source': 'The New York Times',
 'multimedia': [{'rank': 0,
   'subtype': 'xlarge',
   'caption': None,
   'credit': None,
   'type': 'image',
   'url': 'images/2019/12/22/world/22europe-weather/merlin_166255482_a49b6b8c-dae1-4c7e-babd-de79ddb1e167-articleLarge.jpg',
   'height': 400,
   'width': 600,
   'legacy': {'xlarge': 'images/2019/12/22/world/22europe-weather/merlin_166255482_a49b6b8c-dae1-4c7e-babd-de79ddb1e167-articleLarge.jpg',
    '

In [138]:
res.json()['response']['docs'][0]['multimedia']

[{'rank': 0,
  'subtype': 'xlarge',
  'caption': None,
  'credit': None,
  'type': 'image',
  'url': 'images/2019/12/22/world/22europe-weather/merlin_166255482_a49b6b8c-dae1-4c7e-babd-de79ddb1e167-articleLarge.jpg',
  'height': 400,
  'width': 600,
  'legacy': {'xlarge': 'images/2019/12/22/world/22europe-weather/merlin_166255482_a49b6b8c-dae1-4c7e-babd-de79ddb1e167-articleLarge.jpg',
   'xlargewidth': 600,
   'xlargeheight': 400},
  'subType': 'xlarge',
  'crop_name': 'articleLarge'},
 {'rank': 0,
  'subtype': 'popup',
  'caption': None,
  'credit': None,
  'type': 'image',
  'url': 'images/2019/12/22/world/22europe-weather/merlin_166255482_a49b6b8c-dae1-4c7e-babd-de79ddb1e167-popup.jpg',
  'height': 433,
  'width': 650,
  'legacy': {},
  'subType': 'popup',
  'crop_name': 'popup'},
 {'rank': 0,
  'subtype': 'blog480',
  'caption': None,
  'credit': None,
  'type': 'image',
  'url': 'images/2019/12/22/world/22europe-weather/merlin_166255482_a49b6b8c-dae1-4c7e-babd-de79ddb1e167-blog480.

### Function to query articles
with subject "Floods"

In [26]:
def get_nyt_api_articles(begin_date, end_date, page, key):
    """Fetch one page of flood articles and return their 'master675' images.

    Sends a single GET request to the NYT Article Search endpoint with
    ``q=flood`` and the subject filter ``subject:("Floods")``, sorted newest
    first, and flattens the matching image entries into a DataFrame.

    Parameters
    ----------
    begin_date, end_date : str
        Passed through unchanged as the ``begin_date`` / ``end_date`` query
        parameters (the notebook calls this with ``'YYYYMMDD'`` strings).
    page : int or str
        Passed through unchanged as the ``page`` query parameter.
    key : str
        API key, sent as the ``api-key`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per multimedia entry whose ``crop_name`` equals
        ``'master675'``, with columns ``pub_date``, ``width``, ``height``,
        ``image_url``, ``headline`` and ``web_url``. The columns are present
        even when nothing matches.

    Raises
    ------
    AssertionError
        If the response status code is not 200.
    """
    url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json'
    res = requests.get(
        url,
        params={
            'begin_date': begin_date,
            'end_date': end_date,
            'sort': 'newest',
            'page': page,

            'type': 'article',
            'q': 'flood',
            'fq': 'subject:("Floods")',
            'api-key': key
        },
        timeout=30,  # do not hang indefinitely on a stalled connection
    )
    assert res.status_code == 200, f'Article Search returned HTTP {res.status_code}'

    columns = ['pub_date', 'width', 'height', 'image_url', 'headline', 'web_url']
    rows = []
    for document in res.json()['response']['docs']:
        headline = document['headline']['main']
        web_url = document['web_url']
        pub_date = pd.to_datetime(document['pub_date'])

        # Tolerate a missing or null `multimedia` field, and entries that
        # carry no `crop_name`, instead of failing the whole page.
        for image in document.get('multimedia') or []:
            if image.get('crop_name') != 'master675':  # keep only the master version
                continue
            rows.append({
                'pub_date': pub_date,
                'width': int(image['width']),
                'height': int(image['height']),
                'image_url': image['url'],
                'headline': headline,
                'web_url': web_url,
            })

    # Passing `columns` keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [164]:
# Fetch the first result page for 2012-2020 as a quick check of the helper.
df = get_nyt_api_articles(begin_date='20120101', end_date='20200101', page=0, key=apikey)

In [166]:
df.dtypes

pub_date     object
width         int64
height        int64
image_url    object
headline     object
web_url      object
dtype: object

In [165]:
df

Unnamed: 0,pub_date,width,height,image_url,headline,web_url
0,2019-12-22T18:43:45+0000,600,400,images/2019/12/22/world/22europe-weather/merli...,Severe Weather Across Europe Leaves at Least 9...,https://www.nytimes.com/2019/12/22/world/europ...
1,2019-12-22T18:43:45+0000,650,433,images/2019/12/22/world/22europe-weather/merli...,Severe Weather Across Europe Leaves at Least 9...,https://www.nytimes.com/2019/12/22/world/europ...
2,2019-12-22T18:43:45+0000,480,320,images/2019/12/22/world/22europe-weather/merli...,Severe Weather Across Europe Leaves at Least 9...,https://www.nytimes.com/2019/12/22/world/europ...
3,2019-12-22T18:43:45+0000,533,355,images/2019/12/22/world/22europe-weather/merli...,Severe Weather Across Europe Leaves at Least 9...,https://www.nytimes.com/2019/12/22/world/europ...
4,2019-12-22T18:43:45+0000,427,285,images/2019/12/22/world/22europe-weather/merli...,Severe Weather Across Europe Leaves at Least 9...,https://www.nytimes.com/2019/12/22/world/europ...
...,...,...,...,...,...,...
652,2019-11-21T10:00:28+0000,1050,550,images/2019/11/22/climate/00CLI-AGRICULTURE-pr...,A Wet Year Causes Farm Woes Far Beyond the Flo...,https://www.nytimes.com/2019/11/21/climate/far...
653,2019-11-21T10:00:28+0000,312,348,images/2019/11/22/climate/00CLI-AGRICULTURE-pr...,A Wet Year Causes Farm Woes Far Beyond the Flo...,https://www.nytimes.com/2019/11/21/climate/far...
654,2019-11-21T10:00:28+0000,272,303,images/2019/11/22/climate/00CLI-AGRICULTURE-pr...,A Wet Year Causes Farm Woes Far Beyond the Flo...,https://www.nytimes.com/2019/11/21/climate/far...
655,2019-11-21T10:00:28+0000,735,1103,images/2019/11/22/climate/00CLI-AGRICULTURE-pr...,A Wet Year Causes Farm Woes Far Beyond the Flo...,https://www.nytimes.com/2019/11/21/climate/far...


### Get all 1000 items returned by query

In [36]:
def get_nyt_api_images(begin_date_, end_date_, n_pages=100, delay=12, key=None):
    """Collect image rows for a date range by requesting consecutive result pages.

    Calls ``get_nyt_api_articles`` once per page (pages ``0 .. n_pages - 1``),
    pausing ``delay`` seconds after every request, and concatenates the
    per-page frames into one DataFrame with a fresh 0-based index.

    Parameters
    ----------
    begin_date_, end_date_ : str
        Date bounds forwarded to ``get_nyt_api_articles``.
    n_pages : int, default 100
        Number of pages to request; must be at least 1, since concatenating
        zero frames raises ``ValueError``.
    delay : float, default 12
        Seconds to sleep after each request.
    key : str, optional
        API key; when omitted, the notebook-level ``apikey`` is used.

    Returns
    -------
    pandas.DataFrame
        All per-page frames stacked together, re-indexed from 0.
    """
    if key is None:
        key = apikey  # fall back to the key defined at notebook level

    start_time = time.time()

    print(f'Beginning requests for date range {begin_date_}-{end_date_}.')

    df_list = []
    for i in range(n_pages):
        if i % 10 == 0:
            print(f'Sending request {i} at time {round(time.time() - start_time)} s...')

        df_list.append(get_nyt_api_articles(
            begin_date = begin_date_,
            end_date = end_date_,
            page = i,
            key = key
        ))

        # Pause after every request, including the last one, so back-to-back
        # calls of this function stay spaced out as well. Original author's
        # note: never < 6.
        time.sleep(delay)

    print(f'Image links received. Took {round(time.time() - start_time)} s.')

    # ignore_index renumbers rows 0..n-1 in one step.
    return pd.concat(df_list, ignore_index=True)

### Get 1000 items from several different date ranges

each of which has ~1000 hits

In [37]:
def get_n_hits(begin_date, end_date):
    """Print the search metadata (hit count etc.) for a date range.

    Requests page 0 of the flood / subject "Floods" article query for the
    given ``begin_date``-``end_date`` window, using the notebook-level
    ``apikey``, and prints the date range followed by the ``meta`` section of
    the response. Returns nothing. Raises AssertionError when the response
    status code is not 200.
    """
    query = {
        'begin_date': begin_date,
        'end_date': end_date,

        'sort': 'newest',
        'page': '0',
        'type': 'article',
        'q': 'flood',
        'fq': 'subject:("Floods")',
        'api-key': apikey,
    }
    res = requests.get(
        'https://api.nytimes.com/svc/search/v2/articlesearch.json',
        params=query,
    )
    assert res.status_code == 200

    date_range = f'{begin_date}-{end_date}'
    meta = res.json()['response']['meta']
    print(date_range, meta)

In [7]:
get_n_hits('20120101', '20200101')

20120101-20200101 {'hits': 1022, 'offset': 0, 'time': 145}


In [33]:
get_n_hits('20040101', '20120101')

20040101-20120101 {'hits': 1042, 'offset': 0, 'time': 89}


In [15]:
get_n_hits('19900101', '20040101')

19900101-20040101 {'hits': 1026, 'offset': 0, 'time': 252}


In [19]:
get_n_hits('19770101', '19900101')

19770101-19900101 {'hits': 1052, 'offset': 0, 'time': 109}


In [21]:
get_n_hits('19700101', '19770101')

19700101-19770101 {'hits': 1044, 'offset': 0, 'time': 187}


In [22]:
df_list = []

In [29]:
df_list.append(get_nyt_api_images('20120101', '20200101'))

Beginning requests for date range 20120101-20200101.
Sending request 0 at time 0 s...
Sending request 10 at time 114 s...
Sending request 20 at time 226 s...
Sending request 30 at time 340 s...
Sending request 40 at time 453 s...
Sending request 50 at time 566 s...
Sending request 60 at time 681 s...
Sending request 70 at time 793 s...
Sending request 80 at time 906 s...
Sending request 90 at time 1017 s...
Sending request 100 at time 1127 s...
Image links received. Took 1138 s.


In [31]:
df_list[0]

Unnamed: 0,pub_date,width,height,image_url,headline,web_url
0,2019-12-22 18:43:45+00:00,675.0,450.0,images/2019/12/22/world/22europe-weather/merli...,Severe Weather Across Europe Leaves at Least 9...,https://www.nytimes.com/2019/12/22/world/europ...
1,2019-12-19 10:00:24+00:00,675.0,450.0,images/2019/12/20/nyregion/20queenssewer3-prin...,Raw Sewage Flooded Their Homes. They Finally K...,https://www.nytimes.com/2019/12/19/nyregion/qu...
2,2019-12-18 10:04:31+00:00,675.0,420.0,images/2019/12/18/world/18xp-harvey-1/merlin_1...,Government Liable to Some Hurricane Harvey Vic...,https://www.nytimes.com/2019/12/18/us/harvey-f...
3,2019-12-04 17:55:29+00:00,675.0,450.0,images/2019/12/04/climate/04CLI-KEYS1/04CLI-KE...,Florida Keys Deliver a Hard Message: As Seas R...,https://www.nytimes.com/2019/12/04/climate/flo...
4,2019-12-03 07:37:28+00:00,675.0,450.0,images/2019/12/03/world/03phylippines-typhoon1...,Typhoon Kammuri Kills at Least 17 as It Powers...,https://www.nytimes.com/2019/12/03/world/asia/...
...,...,...,...,...,...,...
653,2013-05-29 23:47:51+00:00,675.0,450.0,images/2013/05/30/world/swiss/swiss-master675.jpg,"As Glaciers Melt, Alpine Mountains Lose Their ...",https://www.nytimes.com/2013/05/30/world/europ...
654,2013-05-28 23:30:58+00:00,675.0,450.0,images/2013/05/29/nyregion/29cityroom-coned2/2...,"As Hurricane Season Nears, Con Ed Shows Off Eq...",https://cityroom.blogs.nytimes.com/2013/05/28/...
655,2013-05-11 00:46:19+00:00,675.0,416.0,images/2013/05/11/business/Money1/Money1-maste...,"Rebuilding After Sandy, but With Costly New Rules",https://www.nytimes.com/2013/05/11/your-money/...
656,2012-10-31 01:35:07+00:00,675.0,481.0,images/2012/10/31/nyregion/FLOODZONE/FLOODZONE...,"For Years, Warnings That It Could Happen Here",https://www.nytimes.com/2012/10/31/nyregion/fo...


In [38]:
df_list.append(get_nyt_api_images('20040101', '20120101'))

Beginning requests for date range 20040101-20120101.
Sending request 0 at time 0 s...
Sending request 10 at time 131 s...
Sending request 20 at time 262 s...
Sending request 30 at time 393 s...
Sending request 40 at time 524 s...
Sending request 50 at time 654 s...
Sending request 60 at time 786 s...
Sending request 70 at time 917 s...
Sending request 80 at time 1048 s...
Sending request 90 at time 1179 s...
Image links received. Took 1312 s.


In [40]:
df = pd.concat(df_list).reset_index(drop = True)

In [41]:
df

Unnamed: 0,pub_date,width,height,image_url,headline,web_url
0,2019-12-22 18:43:45+00:00,675.0,450.0,images/2019/12/22/world/22europe-weather/merli...,Severe Weather Across Europe Leaves at Least 9...,https://www.nytimes.com/2019/12/22/world/europ...
1,2019-12-19 10:00:24+00:00,675.0,450.0,images/2019/12/20/nyregion/20queenssewer3-prin...,Raw Sewage Flooded Their Homes. They Finally K...,https://www.nytimes.com/2019/12/19/nyregion/qu...
2,2019-12-18 10:04:31+00:00,675.0,420.0,images/2019/12/18/world/18xp-harvey-1/merlin_1...,Government Liable to Some Hurricane Harvey Vic...,https://www.nytimes.com/2019/12/18/us/harvey-f...
3,2019-12-04 17:55:29+00:00,675.0,450.0,images/2019/12/04/climate/04CLI-KEYS1/04CLI-KE...,Florida Keys Deliver a Hard Message: As Seas R...,https://www.nytimes.com/2019/12/04/climate/flo...
4,2019-12-03 07:37:28+00:00,675.0,450.0,images/2019/12/03/world/03phylippines-typhoon1...,Typhoon Kammuri Kills at Least 17 as It Powers...,https://www.nytimes.com/2019/12/03/world/asia/...
...,...,...,...,...,...,...
657,2012-10-28 14:14:53+00:00,675.0,449.0,images/2012/10/28/us/29storm_earlyss-slide-EW0...,Sharp Warnings as Hurricane Churns In,https://www.nytimes.com/2012/10/29/us/east-coa...
658,2010-09-11 20:32:44+00:00,675.0,1013.0,images/2010/08/11/opinion/brisbane/brisbane-ma...,Readers’ Views: Equality Among the Dead,https://www.nytimes.com/2010/09/12/opinion/12p...
659,2007-12-16 05:00:00+00:00,675.0,338.0,images/2018/04/02/opinion/thomas-l-friedman/th...,It’s Too Late for Later,https://www.nytimes.com/2007/12/16/opinion/16f...
660,2006-09-29 04:00:00+00:00,675.0,463.0,images/2006/09/28/world/29water.1.ready/29wate...,"In Teeming India, Water Crisis Means Dry Pipes...",https://www.nytimes.com/2006/09/29/world/asia/...


In [45]:
df.to_csv('../data/nyt_images.csv')

In [8]:
df = pd.read_csv('../data/nyt_images.csv', index_col = 0)

In [9]:
df.iloc[0]['image_url']

'images/2019/12/22/world/22europe-weather/merlin_166255482_a49b6b8c-dae1-4c7e-babd-de79ddb1e167-master675.jpg'

### Download all images

In [12]:
def download_image(image_url, image_number, start_time, out_dir='../images', delay=3):
    """Download one NYT image and save it as ``nyt_image_<image_number>.jpg``.

    Parameters
    ----------
    image_url : str
        Path appended to ``https://www.nytimes.com/`` to form the request URL.
    image_number : int
        Used in the output file name; every 10th number also prints a
        progress line.
    start_time : float
        ``time.time()`` value the progress line measures elapsed seconds from.
    out_dir : str, default '../images'
        Directory the file is written to; it is not created here.
    delay : float, default 3
        Seconds to sleep after a successful download.

    Raises
    ------
    AssertionError
        If the response status code is not 200; nothing is written and no
        sleep happens in that case.
    """
    if image_number % 10 == 0:
        print(f'Getting image {image_number} at time {round(time.time() - start_time)} s...')

    res = requests.get(
        'https://www.nytimes.com/' + image_url,
        timeout=30,  # do not hang indefinitely on a stalled connection
    )
    assert res.status_code == 200, f'HTTP {res.status_code} for image {image_number}'

    with open(f'{out_dir}/nyt_image_{image_number}.jpg', 'wb') as file: # write, binary
        file.write(res.content)

    time.sleep(delay)

In [13]:
# Download every image listed in `df`, stopping at the first non-200 response
# (download_image signals that with an AssertionError).
start_time = time.time()
print('Beginning requests for images.')
for i, image_url in enumerate(df['image_url']):
    try:
        download_image(image_url, i, start_time)
    except AssertionError:
        # f-string so the index of the failing image is actually interpolated
        print(f'Bad status code at image {i}. Stopping.')
        break
print(f'Images received. Took {round(time.time() - start_time)} s.')

Beginning requests for images.
Getting image 0 at time 0 s...
Getting image 10 at time 37 s...
Getting image 20 at time 73 s...
Getting image 30 at time 110 s...
Getting image 40 at time 146 s...
Getting image 50 at time 183 s...
Getting image 60 at time 219 s...
Getting image 70 at time 256 s...
Getting image 80 at time 292 s...
Getting image 90 at time 328 s...
Getting image 100 at time 364 s...
Getting image 110 at time 401 s...
Getting image 120 at time 437 s...
Getting image 130 at time 473 s...
Getting image 140 at time 509 s...
Getting image 150 at time 545 s...
Getting image 160 at time 581 s...
Getting image 170 at time 617 s...
Getting image 180 at time 653 s...
Getting image 190 at time 690 s...
Getting image 200 at time 727 s...
Getting image 210 at time 764 s...
Getting image 220 at time 800 s...
Getting image 230 at time 836 s...
Getting image 240 at time 872 s...
Getting image 250 at time 908 s...
Getting image 260 at time 945 s...
Getting image 270 at time 982 s...
Gett