## Retrieving data through the NYT API and structuring the JSON files

In [None]:
# Needed libraries
import os # to use functions of the operating system
import pandas as pd # for data analysis and visualizations
import datetime # to process dates and times
import time # to process time related functions
import requests # to send HTTP requests
import json # to process JSON data
import signal # to handle signals to stop the program when it is unresponsive
import sys # for access to the interpreter


### Retrieving data through API

In [None]:
# Read the API key from a local text file so the secret is never hard-coded here
with open('NYT_apikey.txt') as key_file:
    api_key = key_file.read().strip()

# Search window (datetime objects) --> NEEDS TO BE ADJUSTED MANUALLY FOR EACH MONTH
begin_date = datetime.datetime(year=2012, month=5, day=1)
end_date = datetime.datetime(year=2012, month=5, day=31)

# Module-level list that collects the articles retrieved during this run
articles = []

# Request budget: maximum of 500 requests per day (--> limitation by the NYT).
# The counter starts at 0 on every run, so the cap is only enforced properly when
# all data (even if it spans several months) is fetched in a single sequence of requests.
max_requests_per_day = 500
request_count = 0

# SIGINT handler: store what has been collected so far, then stop the program
def signal_handler(sig, frame):
    """
    Reacts to an interrupt signal by saving the collected articles and exiting.

    @param sig: Number of the received signal (not used).
    @param frame: Stack frame that was active when the signal arrived (not used).
    """
    print('Interrupt signal received, terminating the program.')
    save_results_and_exit()


# Register the handler for SIGINT
signal.signal(signal.SIGINT, signal_handler)

# Function for API query with retry mechanism
def fetch_data_with_retry(session, url, params, max_retries=3, timeout=10):
    """
    Attempts to retrieve JSON data from a specified URL using provided parameters,
    retrying with exponential backoff in case of failures.

    @param session: requests.Session instance used for HTTP requests.
    @param url: URL of the API from which data is being fetched.
    @param params: Dictionary of parameters to be used in the API request.
    @param max_retries: Maximum number of attempts before giving up (default 3).
    @param timeout: Timeout for each request in seconds (default 10).
    @return: Decoded JSON body of the first successful response, or None if all attempts fail.
    """
    for attempt in range(max_retries):
        try:
            response = session.get(url, params=params, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.Timeout:
            print(f"Timeout occurred. Retrying {attempt + 1}/{max_retries}...")
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}. Retrying {attempt + 1}/{max_retries}...")
        else:
            try:
                return response.json()
            except ValueError as e:
                # A body that is not valid JSON is treated like any other failed
                # attempt instead of crashing the whole run.
                print(f"Response was not valid JSON: {e}. Retrying {attempt + 1}/{max_retries}...")

        # Exponential backoff (1s, 2s, 4s, ...), skipped when no further attempt follows
        if attempt + 1 < max_retries:
            time.sleep(2 ** attempt)

    print("Max retries exceeded. Aborting.")
    return None

# Function to search for articles with pagination and direct API requests including error handling
def search_articles(api_key, begin_date, end_date, max_pages=100):
    """
    Searches and collects articles day by day within a specified date range.

    Each day is queried page by page until a page holds fewer than 10 articles,
    the response carries no results, or max_pages is reached. Results are appended
    to the module-level articles list so that they can still be saved when the
    run is stopped early.

    @param api_key: Individual and secret API key for accessing the New York Times API.
    @param begin_date: Start date of the search interval as a datetime object.
    @param end_date: End date of the search interval (inclusive) as a datetime object.
    @param max_pages: Maximum number of pages that can be queried per day (set to 100 --> limitation by the NYT).
    @return: A list of articles found within the specified period.
    """
    global articles  # Referencing the global articles list
    global request_count
    base_url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json'
    one_day = datetime.timedelta(days=1)
    current_day = begin_date  # separate cursor so the begin_date argument is not rebound

    with requests.Session() as session:
        while current_day <= end_date:
            day_stamp = current_day.strftime('%Y%m%d')

            for page in range(max_pages):
                if request_count >= max_requests_per_day:
                    print("Daily limit reached.")
                    save_results_and_exit()

                if request_count > 0:
                    print("Waiting 12 seconds before the next request...") # limitation by the NYT: max. 5 requests per minute
                    time.sleep(12)

                # begin_date and end_date are identical: exactly one day is queried per request
                params = {
                    'begin_date': day_stamp,
                    'end_date': day_stamp,
                    'api-key': api_key,
                    'page': page,
                    'fq': 'section_name:("U.S.") AND type_of_material:("News")'
                }

                print(f"Sending request for page {page + 1}...")
                data = fetch_data_with_retry(session, base_url, params, timeout=10)

                # Count the request as soon as it has been sent, whatever it returned,
                # so that the daily budget and the 12-second throttle stay accurate.
                request_count += 1

                if data is None:
                    print("Max retries exceeded for current request. Daily limit possibly reached.")
                    save_results_and_exit()

                if 'response' not in data or 'docs' not in data['response']:
                    print("No (further) results found.")
                    break

                docs = data['response']['docs']
                articles.extend(docs)
                print(f"Request {request_count}: {len(docs)} articles found on page {page + 1}")

                if len(docs) < 10:
                    print("Less than 10 articles found on this page, moving to the next day.") # limitation by the NYT: max. 10 articles per page
                    break

            current_day += one_day
            print(f"Moving to the next day: {current_day.strftime('%Y-%m-%d')}")

    return articles

# Function to save results when the program has to stop early (e.g. via the signal handler)
def save_results_and_exit(filename='articles_May_12_partial.json'):
    """
    Saves the collected articles in a JSON file and terminates the program.

    @param filename: Path of the JSON file to write. When no name is passed, the
                     default "partial" name is used and NEEDS TO BE ADJUSTED FOR EACH MONTH.
    """
    print("Saving results to a JSON file...")
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(articles, file)
    print(f'{len(articles)} articles saved')
    # Exit code 0: stopping early after saving is treated as a regular termination
    sys.exit(0)

# Run the search for the configured date range
print("Starting article search...")
articles = search_articles(api_key, begin_date, end_date, max_pages=100)

# Write the complete result list to a JSON file --> NAME NEEDS TO BE ADJUSTED FOR EACH MONTH
print("Saving results to a JSON file...")
with open('articles_May_12.json', 'w') as file:
    file.write(json.dumps(articles))

print(f'{len(articles)} articles saved')

Starte die Artikelsuche...
Beginne Suche für Datum: 2012-05-01
Sende Anfrage für Seite 1...
Anfrage 1: 10 Artikel gefunden auf Seite 1
Warte 12 Sekunden vor der nächsten Anfrage...
Sende Anfrage für Seite 2...
Anfrage 2: 10 Artikel gefunden auf Seite 2
Warte 12 Sekunden vor der nächsten Anfrage...
Sende Anfrage für Seite 3...
Anfrage 3: 5 Artikel gefunden auf Seite 3
Weniger als 10 Artikel auf dieser Seite gefunden, gehe zum nächsten Tag über.
Wechsle zum nächsten Tag: 2012-05-02
Beginne Suche für Datum: 2012-05-02
Warte 12 Sekunden vor der nächsten Anfrage...
Sende Anfrage für Seite 1...
Anfrage 4: 10 Artikel gefunden auf Seite 1
Warte 12 Sekunden vor der nächsten Anfrage...
Sende Anfrage für Seite 2...
Anfrage 5: 10 Artikel gefunden auf Seite 2
Warte 12 Sekunden vor der nächsten Anfrage...
Sende Anfrage für Seite 3...
Anfrage 6: 1 Artikel gefunden auf Seite 3
Weniger als 10 Artikel auf dieser Seite gefunden, gehe zum nächsten Tag über.
Wechsle zum nächsten Tag: 2012-05-03
Beginne Suc

In [None]:
# Checking for consistency in generated data

def _print_unique_values(column, heading):
    """Prints the heading followed by the distinct values of one df column and returns them."""
    values = df[column].unique()
    print(heading)
    print(values)
    return values

unique_section_names = _print_unique_values('section_name', "Einzigartige Werte im section_name-Feld:")
unique_document_type = _print_unique_values('document_type', "Einzigartige Werte im document_type-Feld:")
unique_material_type = _print_unique_values('type_of_material', "Einzigartige Werte im type_of_material-Feld:")

### Sorting the JSON files

In [None]:
# Counting the articles found for each month

"""
# needed packages:
import os
import json
import pandas as pd
"""

def count_articles_in_files(directory):
    """
    Counts the number of articles in JSON files within a specified directory.
    This function expects that the JSON files contain lists of article data,
    where each file represents a batch of articles retrieved through an API.

    Only files whose name starts with "articles" and ends with ".json" are read.
    Year and month are taken from the second and third underscore-separated part
    of the filename (e.g. "articles_2012_05.json"); files that do not follow this
    naming, or that do not contain a list, are reported and skipped.

    @param directory: The path to the directory containing JSON files.
                      Each file is expected to be a JSON file with a list of articles.

    @return: DataFrame with the columns "Year", "Month" and "Article Count",
             one row per file, sorted by the filename.
    """
    article_counts = []

    # os.listdir() returns entries in arbitrary order; sort to get the documented ordering
    for filename in sorted(os.listdir(directory)):
        if not (filename.startswith("articles") and filename.endswith(".json")):
            continue

        # Extracting year and month from filename
        parts = filename.split('_')
        try:
            year = int(parts[1])
            month = int(parts[2].split('.')[0])
        except (IndexError, ValueError):
            print(f"File {filename} does not follow the articles_<year>_<month>.json naming, skipping.")
            continue

        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            data = json.load(file)

        if isinstance(data, list):
            article_counts.append((year, month, len(data)))
        else:
            print(f"File {filename} is not structured as expected.")

    return pd.DataFrame(article_counts, columns=["Year", "Month", "Article Count"])

# Paths are built relative to the current working directory
cwd = os.getcwd()
articles_directory = os.path.join(cwd, 'articles')
csv_file_path = os.path.join(cwd, 'article_counts.csv')

# Count the articles per file and save the resulting DataFrame to a CSV file (without the index)
article_counts_df = count_articles_in_files(articles_directory)
article_counts_df.to_csv(csv_file_path, index=False)
print("Article counts saved to CSV.")


### Checking the structure of retrieved JSON files

In [1]:
# Inspecting the structure of a retrieved JSON file to know how to extract requested data for further analyses

# needed packages:
# import json
# import os

# path to a single JSON file
file_path = 'articles/articles_2012_05.json'

# number of articles that should be displayed for an overview
num_results = 3

if os.path.exists(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        articles = json.load(file)

        # pretty-print only the first num_results articles
        for i, article in enumerate(articles[:num_results]):
            print(f"Artikel {i+1}:")
            print(json.dumps(article, indent=4))
else:
    print("File not found.")


Artikel 1:
{
    "abstract": "The ad is the latest example of the president directly engaging with Mitt Romney.",
    "web_url": "https://thecaucus.blogs.nytimes.com/2012/05/01/obama-ad-attacks-romney-on-job-creation/",
    "snippet": "The ad is the latest example of the president directly engaging with Mitt Romney.",
    "lead_paragraph": "The Obama campaign on Tuesday released a new ad that will run in three crucial swing states, taking aim at Mitt Romney\u2019s job-creation record and accusing him of shipping jobs overseas.",
    "source": "The New York Times",
    "multimedia": [],
    "headline": {
        "main": "Obama Ad Attacks Romney on Swiss Bank Account",
        "kicker": null,
        "content_kicker": null,
        "print_headline": null,
        "name": null,
        "seo": null,
        "sub": null
    },
    "keywords": [
        {
            "name": "subject",
            "value": "Presidential Election of 2012",
            "rank": 1,
            "major": "N"
     