In [8]:
import requests
from pathlib import Path
import json
import pandas as pd
import urllib.request
import urllib.error
import time
from loguru import logger
from IPython.display import display, clear_output

In [12]:
# Base URL of the NHTSA complaints API; the fetch loop appends the
# "/odinumber?odinumber=<n>" query to it for each request.
api_prefix = "https://api.nhtsa.gov/complaints"

# The fetch loop stops once the running complaint count reaches this value.
num_complaints_needed = 100000

# Resume point for the fetch loop: the first ODI number to request and the
# initial value of the running complaint counter
# (presumably the count already collected by an earlier run -- confirm).
start_from = 10005719
count_from = 5521

# Directory that receives the raw JSON files; created here if it is missing.
dir_raw = Path("../raw/Complaints")
dir_raw.mkdir(parents=True, exist_ok=True)

In [13]:
def fetch_complaints(url, timeout=30):
    """Request one API address and return its decoded JSON body.

    Parameters
    ----------
    url : str
        Fully formed API address to request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` may block forever on a stalled
        connection.

    Returns
    -------
    object or None
        The decoded JSON payload, or ``None`` when the request fails or the
        response body is not valid JSON. Callers must handle ``None``.
    """
    logger.info("Fetching %s" % url)
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # requests raises its own exception hierarchy; urllib's URLError is
        # never raised here, so catching it would let every failure escape.
        logger.warning("Request failed for %s: %s" % (url, e))
        return None

    logger.info(response.status_code)
    try:
        return json.loads(response.text)
    except ValueError as e:
        # json.JSONDecodeError is a ValueError subclass; this covers bodies
        # such as HTML error pages or empty responses.
        logger.warning("Response from %s is not valid JSON: %s" % (url, e))
        return None

def write_out_raw(complaint_count,data):
    """Dump one API result list to a JSON file inside ``dir_raw``.

    The file is named ``[<complaint_count>]<odiNumber>.json``, where the ODI
    number is read from the first entry of ``data``.

    Parameters
    ----------
    complaint_count : int
        Running complaint counter, used as the filename prefix.
    data : list of dict
        Non-empty result list; ``data[0]['odiNumber']`` must be numeric
        because it is formatted with ``%d``.

    Raises
    ------
    IndexError, KeyError
        If ``data`` is empty or its first entry has no ``odiNumber``.
    """
    odi_number = data[0]['odiNumber']
    # File name format e.g. [complaint_count]odi_number.json
    fname = "[%d]%d.json" % (complaint_count, odi_number)
    out_path = dir_raw / fname
    logger.info("Writing data to %s" % out_path)
    # The context manager closes the handle even if json.dump raises.
    # json.dump emits ASCII-only text by default, so the explicit encoding
    # only pins the behavior across platforms.
    with open(out_path, "w", encoding="utf-8") as fout:
        json.dump(data, fout, indent=4, sort_keys=True)
    
def check_status(data):
    """Decide whether an API payload carries usable complaint data.

    Parameters
    ----------
    data : dict or None
        Decoded API payload. ``None`` (or any non-dict) is treated as a
        failed request.

    Returns
    -------
    list
        ``[True, count]`` when the payload reports success and ``count`` is
        positive; ``[False, 0]`` when it reports success but holds no
        complaints; ``[False, -1]`` when the payload is missing or its
        ``message`` is not the success message.
    """
    # Validate the payload before indexing into it: a failed fetch may hand
    # over None, and an error payload may lack the "count" key.
    if not isinstance(data, dict):
        logger.warning("Fail to get from API, %s" % ("no valid response",))
        return [False, -1]

    if data.get("message") != "Results returned successfully":
        # Prefer "status_message" when present, otherwise report the
        # "message" that failed the check, so logging never raises KeyError.
        reason = data.get("status_message", data.get("message"))
        logger.warning("Fail to get from API, %s" % (reason,))
        return [False, -1]

    # Number of complaints in this ODI; a missing or null count means none.
    complaint_num = data.get("count") or 0
    if complaint_num <= 0:
        logger.warning("No info from current API address.")
        return [False, 0]
    return [True, complaint_num]

In [None]:
odi_num = start_from
complaint_count = count_from

# Give up after this many failed requests in a row, so that a network outage
# does not silently skip an unbounded run of ODI numbers.
max_consecutive_failures = 10
consecutive_failures = 0

# Optional: uncomment to also persist the log to a file.
#logger.add("fetch_log.log", encoding="utf-8", level="DEBUG")

while complaint_count < num_complaints_needed:
    # Advance the ODI number on every pass so each request targets a new one.
    api = "%s/odinumber?odinumber=%d" % (api_prefix,odi_num)
    odi_num+=1

    # wait=True defers clearing until new output arrives, avoiding flicker.
    clear_output(wait=True)

    # Fetch data.
    data = fetch_complaints(api)

    # A failed fetch yields None; skip it rather than crash while
    # inspecting the payload.
    if data is None:
        consecutive_failures += 1
        if consecutive_failures >= max_consecutive_failures:
            logger.error("Aborting after %d consecutive failed requests." % (consecutive_failures))
            break
        continue
    consecutive_failures = 0

    # Optional throttle between requests.
    #time.sleep(2)

    # Only payloads that report success and hold complaints are written out.
    status,num_in_odi = check_status(data)
    if status:
        logger.info("Complaint Num: %d" % (complaint_count))
        write_out_raw(complaint_count,data["results"])
        # One ODI number may hold several complaints; count them all.
        complaint_count+=num_in_odi

logger.info("Retrieved %d complaints in total." % (complaint_count))

2022-04-04 21:45:17.616 | INFO     | __main__:fetch_complaints:2 - Fetching https://api.nhtsa.gov/complaints/odinumber?odinumber=10007363
