## New York Times API

#### This notebook queries the New York Times Books API for manually selected categories within a chosen start and end date

In [4]:
import pandas as pd
import requests
import json
import time
import datetime
import getpass
import os

# Prompt for the NYT API key; getpass reads it without echoing the typed
# characters. The key is appended to every request made below.
apikey = getpass.getpass()

········


In [None]:
# Categories selected for the analysis (chosen because they are still
# displayed to date). The values are matched against the 'list_name_encoded'
# column of the list catalogue returned by the API.
validListNames = ['combined-print-and-e-book-fiction']

# adding month
def addOneMonth(date):
    """Return ``date`` moved forward by one calendar month.

    December rolls over to January of the following year. When the target
    month is shorter than the current day of month (e.g. January 31st has no
    "February 31st"), the day is clamped to the last day of the target month
    instead of raising ``ValueError``.

    Parameters
    ----------
    date : object with ``year``/``month``/``day`` attributes and a
        ``replace(year=..., month=..., day=...)`` method, e.g.
        ``datetime.date``, ``datetime.datetime`` or ``pandas.Timestamp``.

    Returns
    -------
    A new object of the same type; any time-of-day component is left
    untouched by ``replace``.
    """
    # Local import keeps this helper self-contained.
    import calendar

    if date.month == 12:
        targetYear, targetMonth = date.year + 1, 1
    else:
        targetYear, targetMonth = date.year, date.month + 1

    # monthrange() returns (weekday of first day, number of days in month).
    lastDayOfTarget = calendar.monthrange(targetYear, targetMonth)[1]

    return date.replace(year=targetYear,
                        month=targetMonth,
                        day=min(date.day, lastDayOfTarget))

#/lists/names.json
# Endpoint that returns the catalogue of all bestseller lists (categories).
listNamesEndpoint = "https://api.nytimes.com/svc/books/v3/lists/names.json"

# Only the endpoint is printed: the API key was entered through getpass and
# must not end up in the notebook output.
print(listNamesEndpoint)

# get all lists available; the key is passed via `params` so that requests
# builds and encodes the query string itself
try:
    listResponse = requests.get(listNamesEndpoint,
                                params={"api-key": apikey},
                                timeout=30)
    # fail on a 4xx/5xx answer here instead of with a KeyError on "results"
    listResponse.raise_for_status()
except requests.exceptions.HTTPError as err:
    # str(err) contains the full request URL including the API key, so only
    # the status line and the key-free endpoint are reported.
    raise SystemExit("{} {} for url: {}".format(err.response.status_code,
                                                err.response.reason,
                                                listNamesEndpoint)) from None

# create dataframe from the "results" entry of the json response
listDataFrame = pd.DataFrame(listResponse.json()["results"])

# change datatype (string to date) so the columns can be compared with
# datetime objects later on
for dateColumn in ('oldest_published_date', 'newest_published_date'):
    listDataFrame[dateColumn] = pd.to_datetime(listDataFrame[dateColumn])

# optional: save list of all available categories to csv
#listDataFrame.to_csv("lists.csv", index=False)

# filter dataframe with selected categories
filteredListDataFrame = listDataFrame[listDataFrame['list_name_encoded'].isin(validListNames)]

# accumulator for the bestseller rows collected below (None until the first
# month has been fetched)
result = None

# set date boundaries (the same for every category, so defined once)
customStartDate = datetime.datetime(2007, 2, 1)
customEndDate   = datetime.datetime(2020, 1, 1)

# columns delivered by the API that are not useful for the analysis
unusedColumns = ['asterisk', 'dagger', 'contributor', 'contributor_note',
                 'book_image', 'book_image_width', 'book_image_height',
                 'amazon_product_url', 'first_chapter_link',
                 'sunday_review_link', 'article_chapter_link',
                 'book_review_link', 'buy_links', 'book_uri', 'isbns',
                 'rank_last_week', 'weeks_on_list', 'primary_isbn10',
                 'publisher', 'description', 'price', 'age_group']

# iterate through dataframe with selected categories
for index, row in filteredListDataFrame.iterrows():

    # get list name from row
    list_name_encoded     = row['list_name_encoded']

    # get published dates from row
    oldest_published_date = row['oldest_published_date']
    newest_published_date = row['newest_published_date']

    # adjust publish dates to entire months (to have a monthly overview):
    # first day of the month after the oldest date, and first day of the month
    # of the newest date. The day is set to 1 *before* adding the month so the
    # intermediate date always exists (e.g. Jan 31 + 1 month would not).
    oldest_published_date_cleaned = addOneMonth(oldest_published_date.replace(day=1))
    newest_published_date_cleaned = newest_published_date.replace(day=1)

    # set starting date for request and the (exclusive) upper bound
    requestDate = max(oldest_published_date_cleaned, customStartDate)
    maxDate     = min(newest_published_date_cleaned, customEndDate)

    print (oldest_published_date, oldest_published_date_cleaned, customStartDate,
           requestDate)
    print(newest_published_date, newest_published_date_cleaned, customEndDate, maxDate)

    # iterate over all months between oldest and newest published dates for selected categories
    while requestDate < maxDate:

        #/lists/{date}/{list}.json
        bestsellerEndpoint = ("https://api.nytimes.com/svc/books/v3/lists/"
                              + requestDate.strftime('%Y-%m-%d')
                              + "/" + list_name_encoded + ".json")

        # progress output; the API key is deliberately left out of it
        print(bestsellerEndpoint)

        try:
            bestsellerResponse = requests.get(bestsellerEndpoint,
                                              params={"api-key": apikey},
                                              timeout=30)
            bestsellerResponse.raise_for_status()
        except requests.exceptions.HTTPError as err:
            # str(err) contains the full request URL including the API key,
            # so only the status line and the key-free endpoint are reported.
            raise SystemExit("{} {} for url: {}".format(err.response.status_code,
                                                        err.response.reason,
                                                        bestsellerEndpoint)) from None

        # new dataframe with bestseller books from selected categories
        bestsellerDataFrame = pd.DataFrame(bestsellerResponse.json()["results"]["books"])

        # add new columns for list(category) and date when book is ranked
        bestsellerDataFrame["list"] = list_name_encoded
        bestsellerDataFrame["date"] = requestDate

        # delete columns that are not useful; errors='ignore' keeps the run
        # alive if a response lacks one of them
        bestsellerDataFrame = bestsellerDataFrame.drop(columns=unusedColumns, errors='ignore')

        # collect rows; pd.concat replaces DataFrame.append, which was removed
        # in pandas 2.0
        if result is None:
            result = bestsellerDataFrame
        else:
            result = pd.concat([result, bestsellerDataFrame], ignore_index=True)

        # count on how many monthly lists each book (ISBN-13) appears so far.
        # Rows without an ISBN stay NaN; values are stored as float, as before,
        # so the csv format does not change.
        isbnCounts = result['primary_isbn13'].value_counts()
        result['month_on_list'] = result['primary_isbn13'].map(isbnCounts).astype(float)

        # write csv with the results collected so far (rewritten after every
        # month, so an aborted run keeps the data fetched up to that point)
        result.to_csv("result_selectedcategory.csv", index=False)

        # set timer to avoid rate limit
        time.sleep(6)

        # add one month
        requestDate = addOneMonth(requestDate)