In [23]:
import numpy as np
import pandas as pd

In [1]:
pip install --upgrade pynytimes

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import json
import time
import requests
import datetime
import dateutil
import pandas as pd
from dateutil.relativedelta import relativedelta

In [16]:
# функция, проверяющая наличие нужных слов для фильтрации статей
def is_rus(s):
    d = ['Russia', 'Putin', 'Kremlin', 'Vladimir', 'Kremlin', 'Yeltsyn', 'Medvedev']
    flag = False
    for el in d:
        if el in s:
            flag = True
            break
    return flag

In [17]:
# функция, отправляющая запрос архиву за определенную дату
def send_request(date):
    base_url = 'https://api.nytimes.com/svc/archive/v1/'
    url = base_url + '/' + date[0] + '/' + date[1] + '.json?api-key=' + 'B9KoXybdjViG9meU21nn9QAvrDbJdx26'
    response = requests.get(url).json()
    time.sleep(6)
    return response

# функция, проверяющая наличие заголовка и наличие необходимых слов в заголовке или абстракте
def is_valid(article, date):
    has_headline = type(article['headline']) == dict and 'main' in article['headline'].keys()
    return has_headline and (is_rus(article['headline']) or is_rus(article['abstract']))


# функция, осуществляющая парсинг данных и создание датафрейма
def parse_response(response):
    data = {'headline': [],  
        'date': [], 
        'doc_type': [],
        'material_type': [],
        'section': [],
        'keywords': [],
        'abstract': [],
        'lead': [],
        'word_count': []}
    
    articles = response['response']['docs'] 
    for article in articles: # For each article, make sure it falls within our date range
        date = dateutil.parser.parse(article['pub_date']).date()
        if is_valid(article, date):
            data['date'].append(date)
            data['headline'].append(article['headline']['main']) 
            if 'section' in article:
                data['section'].append(article['section_name'])
            else:
                data['section'].append(None)
            data['doc_type'].append(article['document_type'])
            if 'type_of_material' in article: 
                data['material_type'].append(article['type_of_material'])
            else:
                data['material_type'].append(None)
            keywords = [keyword['value'] for keyword in article['keywords'] if keyword['name'] == 'subject']
            data['keywords'].append(keywords)
            data['abstract'].append(article['abstract'])
            data['lead'].append(article['lead_paragraph'])
            if 'word_count' in article:
                data['word_count'].append(article['word_count'])
    return pd.DataFrame(data) 

# отправление и парсинг запроса
def get_data(dates):
    total = 0
    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    if not os.path.exists('headlines'):
        os.mkdir('headlines')
    for date in dates:
        response = send_request(date)
        df = parse_response(response)
        total += len(df)
        df.to_csv('headlines/' + date[0] + '-' + date[1] + '.csv', index=False)
        print('Saving headlines/' + date[0] + '-' + date[1] + '.csv...')
    print('Number of articles collected: ' + str(total))

In [21]:
date_range = []
for i in range(1991, 2023):
    date_range.append([str(i), str(1)])
    date_range.append([str(i), str(6)])
    date_range.append([str(i), str(12)])

In [22]:
get_data(date_range)

Date range: ['1991', '1'] to ['2022', '12']
Saving headlines/1991-1.csv...
Saving headlines/1991-6.csv...
Saving headlines/1991-12.csv...
Saving headlines/1992-1.csv...
Saving headlines/1992-6.csv...
Saving headlines/1992-12.csv...
Saving headlines/1993-1.csv...
Saving headlines/1993-6.csv...
Saving headlines/1993-12.csv...
Saving headlines/1994-1.csv...
Saving headlines/1994-6.csv...
Saving headlines/1994-12.csv...
Saving headlines/1995-1.csv...
Saving headlines/1995-6.csv...
Saving headlines/1995-12.csv...
Saving headlines/1996-1.csv...
Saving headlines/1996-6.csv...
Saving headlines/1996-12.csv...
Saving headlines/1997-1.csv...
Saving headlines/1997-6.csv...
Saving headlines/1997-12.csv...
Saving headlines/1998-1.csv...
Saving headlines/1998-6.csv...
Saving headlines/1998-12.csv...
Saving headlines/1999-1.csv...
Saving headlines/1999-6.csv...
Saving headlines/1999-12.csv...
Saving headlines/2000-1.csv...
Saving headlines/2000-6.csv...
Saving headlines/2000-12.csv...
Saving headlines

In [29]:
df = pd.read_csv('headlines/1991-1.csv')
for i in range(1991, 2023):
    for el in [1, 6, 12]:
        if el != 1 and i != 1991:
            df1 = pd.read_csv(f'headlines/{i}-{el}.csv')
            df = pd.concat([df, df1], ignore_index=True)
df

Unnamed: 0,headline,date,doc_type,material_type,section,keywords,abstract,lead,word_count
0,NEWS SUMMARY,1991-01-01,article,Summary,,['NO INDEX TERMS'],INTERNATIONAL 2-7 Iraq would launch missi...,INTERNATIONAL 2-7,1042
1,Review/Television; A Maestro in Exile Is Welco...,1991-01-02,article,Review,,"['Music', 'Reviews', 'Television', 'Documentar...","""Soldiers of Music: Rostropovich Returns to ...","""Soldiers of Music: Rostropovich Returns to Ru...",708
2,Restive Soviet Republic Yields to the Kremlin,1991-01-02,article,News,,"['Language and Languages', 'MINORITIES (ETHNIC...",Acting to defuse one of the Soviet Union's m...,Acting to defuse one of the Soviet Union's mos...,382
3,NEWS SUMMARY,1991-01-02,article,Summary,,['NO INDEX TERMS'],International A2-10 U.S. military and pol...,International A2-10,1088
4,Detailing the Bolshevik Enormity,1991-01-03,article,Review,,['BOOK REVIEWS'],The Russian Revolution By Richard Pipes 944 ...,The Russian Revolution By Richard Pipes 944 pa...,1075
...,...,...,...,...,...,...,...,...,...
6066,"Xi and Putin Meet Again, Two Strongmen in a We...",2022-12-30,article,News,,"['International Relations', 'Russian Invasion ...",Russia is isolated by its invasion of Ukraine ...,"When China’s top leader, Xi Jinping, and Presi...",1562
6067,One Man Flees Putin’s Draft: An Update,2022-12-30,article,News,,['Draft and Recruitment (Military)'],"Kirill, a 24-year-old from the Moscow region, ...","This week, The Daily is revisiting some of our...",420
6068,‘The Daily’ checks in with a Russian man who f...,2022-12-30,article,News,,[],"Kirill, 24, worked at a nonprofit for homeless...","Kirill, 24, worked at a nonprofit for homeless...",99
6069,Putin and Zelensky will give dueling addresses...,2022-12-31,article,News,,['Russian Invasion of Ukraine (2022)'],Volodymyr Zelensky of Ukraine and Vladimir V. ...,With their troops locked in a bloody stalemate...,426


In [30]:
df.to_csv('nyt.csv')