In [113]:
import json
import time
import requests
import datetime
import dateutil
import pandas as pd
import module
from dateutil.relativedelta import relativedelta

#### Get data from NYT API

Send request to NYT

In [114]:
# Year and Month
date = ['2020', '1']

In [115]:
def send_request(date):
    base_url = 'https://api.nytimes.com/svc/archive/v1'
    url = base_url + '/' + date[0] + '/' + date[1] + '.json?api-key=' + '3YIxSGJ8fF20rV8LAKKKPx05mNoB9AFl'
    response = requests.get(url).json()
    time.sleep(6)
    return response

In [116]:
response = send_request(date)

Parse data and turn to data frame

In [117]:
def parse_response(response):
    data = {
        'date': [],
        'url' : [],
        'headline': [],  
        'articles' : [],
        'doc_type': [],
        'material_type': [],
        'section': [],
        'keywords': []
        }
    articles = response['response']['docs'] 
    for article in articles: # For each article, make sure it falls within our date range
        date = dateutil.parser.parse(article['pub_date']).date()
        data['date'].append(date)
        data['headline'].append(article['headline']['main']) 
        data['url'].append(article['web_url'])
        data['articles'].append(article['snippet'])
        if 'section' in article:
            data['section'].append(article['section_name'])
        else:
            data['section'].append(None)
        data['doc_type'].append(article['document_type'])
        if 'type_of_material' in article: 
            data['material_type'].append(article['type_of_material'])
        else:
            data['material_type'].append(None)
        keywords = [keyword['value'] for keyword in article['keywords'] if keyword['name'] == 'subject']
        data['keywords'].append(keywords)
    return pd.DataFrame(data) 

In [118]:
df = parse_response(response)

In [119]:
df

Unnamed: 0,date,url,headline,articles,doc_type,material_type,section,keywords
0,2020-01-01,https://www.nytimes.com/2019/12/31/us/texas-ch...,‘Battling a Demon’: Drifter Sought Help Before...,The gunman who shot two parishioners at the We...,article,News,,"[Churches (Buildings), Murders, Attempted Murd..."
1,2020-01-01,https://www.nytimes.com/2019/12/31/opinion/for...,Protect Veterans From Fraud,Congress could do much more to protect America...,article,Editorial,,"[Veterans, For-Profit Schools, Financial Aid (..."
2,2020-01-01,https://www.nytimes.com/2019/12/31/health/e-ci...,F.D.A. Plans to Ban Most E-Cigarette Flavors b...,The tobacco and vaping industries and conserva...,article,News,,"[E-Cigarettes, Recalls and Bans of Products, M..."
3,2020-01-01,https://www.nytimes.com/2019/12/31/crosswords/...,‘It’s Green and Slimy’,Christina Iverson and Jeff Chen ring in the Ne...,article,News,,[Crossword Puzzles]
4,2020-01-01,https://www.nytimes.com/2019/12/31/pageoneplus...,"Corrections: Jan. 1, 2020",Corrections that appeared in print on Wednesda...,article,Correction,,[]
...,...,...,...,...,...,...,...,...
4475,2020-01-31,https://www.nytimes.com/2020/01/31/sports/bask...,Lakers Fall to Blazers on Emotional Night Hono...,It was the Lakers’ first game since Bryant and...,article,News,,[Basketball]
4476,2020-01-31,https://www.nytimes.com/2020/01/31/sports/olym...,Alberto Salazar Is Suspended by SafeSport Afte...,The famed running coach was already barred fro...,article,News,,"[Running, Coaches and Managers]"
4477,2020-01-31,https://www.nytimes.com/2020/01/31/health/cpr-...,"CPR, by Default","When very old patients suffer cardiac arrest, ...",article,News,,"[Hospitals, Defibrillators, Living Wills and H..."
4478,2020-01-31,https://www.nytimes.com/video/us/politics/1000...,Impeachment Trial Highlights: A Showdown Over ...,Senators rejected a call for additional witnes...,multimedia,Video,,"[Impeachment, Trump-Ukraine Whistle-blower Com..."


In [120]:
def extract_label(x):
    '''To extract label in url'''
    df[x] = df['url']
    df[x] = df[x].str.replace(r'(https?:\/\/www.nytimes.com\/(interactive)\/\d+\/\d+\/\d+\/)','', regex=True)
    df[x] = df[x].str.replace(r'(https?:\/\/www.nytimes.com\/\d+\/\d+\/\d+\/)','', regex=True)
    df[x] = df[x].str.replace(r'(https?:\/\/www.nytimes.com\/(slideshow)\/\d+\/\d+\/\d+\/)','', regex=True)
    df[x] = df[x].str.replace(r'(https?:\/\/www.nytimes.com\/(interactive)\/\d+\/)','', regex=True)
    df[x] = df[x].str.replace(r'(https?:\/\/www.nytimes.com\/(video)\/)','', regex=True)
    df[x] = df[x].str.replace(r'(https?:\/\/www.nytimes.com\/)','', regex=True)
    df[x] = df[x].str.replace(r'(https?:\/\/brandedplaylist.nytimes.com\/)','', regex=True)
    df[x] = df[x].str.replace(r'((us)\/)','', regex=True)
    df[x] = df[x].str.replace(r'(\/.+)','', regex=True)
    df[x] = df[x].str.replace(r'\s+','', regex=True)
    df[x] = df[x].str.replace(r'(.+(.html))','us', regex=True)
    return df[x]

In [121]:
df['label'] = extract_label('label')

In [122]:
df

Unnamed: 0,date,url,headline,articles,doc_type,material_type,section,keywords,label
0,2020-01-01,https://www.nytimes.com/2019/12/31/us/texas-ch...,‘Battling a Demon’: Drifter Sought Help Before...,The gunman who shot two parishioners at the We...,article,News,,"[Churches (Buildings), Murders, Attempted Murd...",us
1,2020-01-01,https://www.nytimes.com/2019/12/31/opinion/for...,Protect Veterans From Fraud,Congress could do much more to protect America...,article,Editorial,,"[Veterans, For-Profit Schools, Financial Aid (...",opinion
2,2020-01-01,https://www.nytimes.com/2019/12/31/health/e-ci...,F.D.A. Plans to Ban Most E-Cigarette Flavors b...,The tobacco and vaping industries and conserva...,article,News,,"[E-Cigarettes, Recalls and Bans of Products, M...",health
3,2020-01-01,https://www.nytimes.com/2019/12/31/crosswords/...,‘It’s Green and Slimy’,Christina Iverson and Jeff Chen ring in the Ne...,article,News,,[Crossword Puzzles],crosswords
4,2020-01-01,https://www.nytimes.com/2019/12/31/pageoneplus...,"Corrections: Jan. 1, 2020",Corrections that appeared in print on Wednesda...,article,Correction,,[],us
...,...,...,...,...,...,...,...,...,...
4475,2020-01-31,https://www.nytimes.com/2020/01/31/sports/bask...,Lakers Fall to Blazers on Emotional Night Hono...,It was the Lakers’ first game since Bryant and...,article,News,,[Basketball],sports
4476,2020-01-31,https://www.nytimes.com/2020/01/31/sports/olym...,Alberto Salazar Is Suspended by SafeSport Afte...,The famed running coach was already barred fro...,article,News,,"[Running, Coaches and Managers]",sports
4477,2020-01-31,https://www.nytimes.com/2020/01/31/health/cpr-...,"CPR, by Default","When very old patients suffer cardiac arrest, ...",article,News,,"[Hospitals, Defibrillators, Living Wills and H...",health
4478,2020-01-31,https://www.nytimes.com/video/us/politics/1000...,Impeachment Trial Highlights: A Showdown Over ...,Senators rejected a call for additional witnes...,multimedia,Video,,"[Impeachment, Trump-Ukraine Whistle-blower Com...",politics


In [124]:
for col in df.columns:
    print(df[col])

0       2020-01-01
1       2020-01-01
2       2020-01-01
3       2020-01-01
4       2020-01-01
           ...    
4475    2020-01-31
4476    2020-01-31
4477    2020-01-31
4478    2020-01-31
4479    2020-01-31
Name: date, Length: 4480, dtype: object
0       https://www.nytimes.com/2019/12/31/us/texas-ch...
1       https://www.nytimes.com/2019/12/31/opinion/for...
2       https://www.nytimes.com/2019/12/31/health/e-ci...
3       https://www.nytimes.com/2019/12/31/crosswords/...
4       https://www.nytimes.com/2019/12/31/pageoneplus...
                              ...                        
4475    https://www.nytimes.com/2020/01/31/sports/bask...
4476    https://www.nytimes.com/2020/01/31/sports/olym...
4477    https://www.nytimes.com/2020/01/31/health/cpr-...
4478    https://www.nytimes.com/video/us/politics/1000...
4479    https://www.nytimes.com/2020/01/31/nyregion/pr...
Name: url, Length: 4480, dtype: object
0       ‘Battling a Demon’: Drifter Sought Help Before...
1               

In [127]:
pd.date_range('20220206', periods = 20)

DatetimeIndex(['2022-02-06', '2022-02-07', '2022-02-08', '2022-02-09',
               '2022-02-10', '2022-02-11', '2022-02-12', '2022-02-13',
               '2022-02-14', '2022-02-15', '2022-02-16', '2022-02-17',
               '2022-02-18', '2022-02-19', '2022-02-20', '2022-02-21',
               '2022-02-22', '2022-02-23', '2022-02-24', '2022-02-25'],
              dtype='datetime64[ns]', freq='D')

In [129]:
df.columns

Index(['date', 'url', 'headline', 'articles', 'doc_type', 'material_type',
       'section', 'keywords', 'label'],
      dtype='object')

#### Export data to directory

In [123]:
df.to_csv('../data/raw/raw-data.csv', index=None)