# Accessing the NHTSA API (crawler)
This notebook focuses on recall data retrieved from the NHTSA API.
## Recall Data 

In [1]:
import requests
from pathlib import Path
import json
import pandas as pd
import urllib.request
import urllib.error
import time
from loguru import logger
import pygame
from IPython.display import display, clear_output

pygame 2.1.2 (SDL 2.0.18, Python 3.9.7)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# Base URL shared by every NHTSA endpoint used in this notebook.
api_prefix = "https://api.nhtsa.gov"

# Endpoint path (appended to api_prefix) that lists the available model years.
# NOTE(review): issueType=r presumably restricts the listing to recalls -- confirm against the API docs.
api_get_year = "/products/vehicle/modelYears?issueType=r"

# Output directories; both are created up front (including parents) if missing.
# dir_raw is where write_out_raw stores one JSON file per crawled (year, make, model).
dir_raw = Path("../Raw_Data/Raw_API/Recalls")
dir_raw.mkdir(parents=True, exist_ok=True)
# out_dir is where write_out stores the intermediate lookups (years, makes, models).
out_dir = Path("../Raw_Data/Raw_API/Recalls_buffer")
out_dir.mkdir(parents=True, exist_ok=True)

In [3]:
def fetch_recalls(url, timeout=30):
    """Fetch ``url`` and return its decoded JSON payload.

    Parameters
    ----------
    url : str
        Fully built API address.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the crawl forever.

    Returns
    -------
    list
        ``[1, payload]`` when the server answered 200 with valid JSON,
        ``[-1, None]`` for any other outcome (non-200 status, network error,
        timeout, or a body that is not JSON). Always a two-element list, so
        callers can safely unpack ``status, data = fetch_recalls(url)``.
    """
    logger.info("Fetching %s" % url)
    headers = {
      'User-Agent':"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # requests raises its own exception hierarchy, not urllib.error.URLError.
        logger.warning("Request failed: %s" % e)
        return [-1, None]

    status = response.status_code
    logger.info(status)

    # Stop if the API cannot be accessed properly.
    if status != 200:
        return [-1, None]

    # Otherwise return retrieved data.
    try:
        return [1, json.loads(response.text)]
    except ValueError as e:
        # json.JSONDecodeError is a ValueError; e.g. an HTML block page served with 200.
        logger.warning("Response is not valid JSON: %s" % e)
        return [-1, None]

# Write the recalls of one (year, make, model) combination to its own JSON file.
def write_out_raw(recalls_count,year,make,model,data):
    """Dump ``data`` as JSON into ``dir_raw``.

    Parameters
    ----------
    recalls_count : int
        Running file counter; becomes the ``[N]`` prefix of the file name.
    year, make, model : str
        Identify the vehicle; any '/' in ``make`` or ``model`` is replaced by
        '_or_' so it is not read as a directory separator.
    data : JSON-serialisable object
        Payload to store (indented, keys sorted).
    """
    make = make.replace("/", "_or_")
    model = model.replace("/", "_or_")
    # File name format [recalls_count]year_make_model.json
    #             e.g. [802]1994_MERCEDES BENZ_E.json
    fname = "[%d]%s_%s_%s.json" % (recalls_count,year,make,model)
    out_path = dir_raw / fname
    logger.info("Writing data to %s" % out_path)
    # The context manager closes the file even if json.dump raises.
    with open(out_path, "w", encoding="utf-8") as fout:
        json.dump(data, fout, indent=4, sort_keys=True)
    
# Check API status.
def check_status(data):
    """Classify an API payload.

    Reads ``data["count"]`` and ``data["message"]`` (and
    ``data["status_message"]`` when the message is not the success text).

    Returns ``[False, -1]`` when the message is not the success text,
    ``[False, 0]`` when the call succeeded but the count is zero or negative,
    and ``[True, count]`` otherwise.
    """
    count = data["count"]
    succeeded = data["message"] == "Results returned successfully"

    if not succeeded:
        logger.warning("Fail to get from API, %s" % (data["status_message"]))
        return [False, -1]

    if count <= 0:
        logger.warning("No info from current API address.")
        return [False, 0]

    return [True, count]

# Write one of the intermediate lookups ("years", "makes", "models") collected in the first three steps to a JSON file.
def write_out(name,data):
    """Dump ``data`` as JSON to ``<out_dir>/<name>.json`` (indented, keys sorted).

    Parameters
    ----------
    name : str
        File name without the ``.json`` extension.
    data : JSON-serialisable object
        Payload to store.
    """
    fname = "%s.json" % (name)
    out_path = out_dir / fname
    logger.info("Writing data to %s" % out_path)
    # The context manager closes the file even if json.dump raises.
    with open(out_path, "w", encoding="utf-8") as fout:
        json.dump(data, fout, indent=4, sort_keys=True)

## Step-1: Get all Model Years 

In [4]:
# Construct API url.
api = api_prefix + api_get_year
# Fetch data.
status, data = fetch_recalls(api)
# fetch_recalls reports success with status 1; on failure there is no payload,
# so stop here with a clear message instead of failing on data["results"].
if status != 1:
    raise RuntimeError("Could not fetch model years from %s" % api)
# Convert result data into list format.
years = [each["modelYear"] for each in data["results"]]
logger.info("Completed the task of getting all model years!")
# Store years data.
write_out("years",years)

2022-04-28 13:20:07.333 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/products/vehicle/modelYears?issueType=r
2022-04-28 13:20:08.006 | INFO     | __main__:fetch_recalls:9 - 200
2022-04-28 13:20:08.007 | INFO     | __main__:<module>:7 - Completed the task of getting all model years!
2022-04-28 13:20:08.008 | INFO     | __main__:write_out:55 - Writing data to ../raw/Recalls_buffer/years.json


## Step-2: Get all Makes for the Model Year

In [5]:
# Maps each model year to the list of makes reported for it.
makes = {}
for year in years:
    clear_output()
    # Construct API url.
    api = api_prefix + "/products/vehicle/makes?modelYear=%d&issueType=r" % (int(year))
    # Fetch data.
    status, data = fetch_recalls(api)
    # fetch_recalls reports success with status 1; on failure there is no payload,
    # so stop with the offending year instead of failing on data["results"].
    if status != 1:
        raise RuntimeError("Could not fetch makes for model year %s (%s)" % (year, api))
    # Store result data into a dict by year (years without results get no key).
    for each in data["results"]:
        makes.setdefault(year, []).append(each["make"])
logger.info("Completed the task of getting all makes for the model year!")
# Store makes data.
write_out("recall_makes",makes)

2022-04-28 13:22:00.650 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/products/vehicle/makes?modelYear=9999&issueType=r
2022-04-28 13:22:01.664 | INFO     | __main__:fetch_recalls:9 - 200
2022-04-28 13:22:01.667 | INFO     | __main__:<module>:14 - Completed the task of getting all makes for the model year!
2022-04-28 13:22:01.669 | INFO     | __main__:write_out:55 - Writing data to ../raw/Recalls_buffer/recall_makes.json


## Step-3: Get all Models for the Make and Model Year 
Because the website restricts crawlers, we have to change IP addresses frequently to get past its restrictions. The following function stops automatically when it receives an access error (typically a 403), returns the data collected so far, and logs the position from which to restart.

In [6]:
def Get_model(years,skip):
    """Collect the models of every make for the given model years.

    Reads the notebook-level ``makes`` dict (model year -> list of makes).

    Parameters
    ----------
    years : list
        Model years to walk, in order; must not be empty.
    skip : str
        Make that must not be requested (pass "" to skip nothing). A skipped
        make is not fetched and gets no entry in the result.

    Returns
    -------
    dict
        Make -> list of model names, accumulated over all years (duplicates
        are possible when a model appears in several years). If a request
        fails, the models gathered so far are returned and the position is
        logged so the crawl can be restarted from there.
    """
    models = {}
    start_year = years[0]
    for year_index, year in enumerate(years):
        for make_index, make in enumerate(makes[year]):
            # Skip before fetching, so no result from a previous make is
            # processed again under the skipped make's name.
            if make == skip:
                continue
            clear_output()
            api = api_prefix + "/products/vehicle/models?modelYear=%d&make=%s&issueType=r" % (int(year),make)
            status, data = fetch_recalls(api)
            # Status equal -1 means we are failing to use this API at this moment.
            if status == -1:
                logger.warning("Start year:%s, Current Year:%s(index:%d) Current Make:%s(index:%d)" % (start_year,year,year_index,make,make_index))
                return models
            # Otherwise, store the model names in a dict keyed by make.
            for each in data["results"]:
                models.setdefault(make, []).append(each["model"])
    end_year = years[-1]
    logger.info("Finished (%s-%s)" % (start_year,end_year))
    return models

In [7]:
# First pass over all model years; "" as the skip value is not expected to match any make.
models = Get_model(years,"")

2022-04-28 13:34:37.925 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/products/vehicle/models?modelYear=1984&make=EMERGENCY ONE&issueType=r
2022-04-28 13:34:38.139 | INFO     | __main__:fetch_recalls:9 - 403


#### The URL for **"EMERGENCY ONE"** cannot be accessed (403), so we skip that make.

In [8]:
# Second pass: restart at model year 1984 and tell Get_model to skip "EMERGENCY ONE".
start = years.index("1984")
model_1 = Get_model(years[start:],"EMERGENCY ONE")

2022-04-28 14:36:03.006 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/products/vehicle/models?modelYear=9999&make=ZEMCO&issueType=r
2022-04-28 14:36:03.344 | INFO     | __main__:fetch_recalls:9 - 200
2022-04-28 14:36:03.348 | INFO     | __main__:Get_model:23 - Finished (1984-9999)


In [10]:
# Number of makes (dict keys) collected by the first and the second pass.
len(models),len(model_1)

(393, 1319)

#### Define a function that merges two dictionaries without replacing existing entries or adding duplicates.

In [11]:
def merge_2_dict(dict_1,dict_2):
    """Merge the list values of ``dict_2`` into ``dict_1`` without duplicates.

    For every key of ``dict_2``, each value not already listed under that key
    in ``dict_1`` is appended, in order. Keys missing from ``dict_1`` are
    created with a fresh list, so ``dict_1`` never shares list objects with
    ``dict_2`` and duplicates inside ``dict_2``'s own lists are dropped too.

    ``dict_1`` is updated in place and also returned; ``dict_2`` is not modified.
    """
    for key, values in dict_2.items():
        merged = dict_1.setdefault(key, [])
        for v in values:
            # List membership (not a set) keeps this usable for unhashable items.
            if v not in merged:
                merged.append(v)
    return dict_1

In [12]:
# Fold the second pass into the first; merge_2_dict updates `models` in place and returns it.
models = merge_2_dict(models,model_1)
len(models)

1520

#### Delete every key whose value is empty.

In [13]:
# Keep only the makes that have at least one model; `models` itself is left untouched.
clean_models = {make: found for make, found in models.items() if found}
len(clean_models)

1520

#### Remove duplicate models

In [40]:
# Deduplicate each make's model list while keeping first-seen order.
# list(set(...)) would give an order that changes between kernels (string hashing
# is randomized per process), but Get_recalls resumes by list position and the
# lists are written to recall_models.json, so the order must be reproducible.
for key, val in clean_models.items():
    clean_models[key] = list(dict.fromkeys(val))

#### Delete the special 403 cases from **makes** and **models**.

In [14]:
#Retrieve back from file

#f = open('../raw/Complaints-Plus/makes.json')
#data_m = json.load(f)
#f.close()
#makes = data_m.copy()

In [41]:
# NOTE: dict.copy() is shallow, so the per-year lists are shared with `makes`;
# removing a make below removes it from `makes` as well. This is kept on purpose:
# Get_recalls is later called with `makes`, not with `makes_clean_403`.
makes_clean_403 = makes.copy()
for k, v in makes_clean_403.items():
    for blocked_make in ("EMERGENCY ONE", "SUNLIGHT"):
        if blocked_make in v:
            # list.remove drops the first occurrence, without binding a notebook-level
            # variable named `id` that would shadow the builtin.
            v.remove(blocked_make)
# Both lengths count model years (dict keys), not makes, so they are always equal.
logger.info( "%d -> %d" %(len(makes),len(makes_clean_403)))

2022-04-28 15:13:15.293 | INFO     | __main__:<module>:9 - 76 -> 76


In [42]:
# Drop the makes that cannot be crawled. pop(..., None) keeps this cell re-runnable
# and tolerant of a make that is not present, where `del` would raise KeyError.
clean_models = clean_models.copy()
for blocked_make in ("EMERGENCY ONE", "SUNLIGHT"):
    clean_models.pop(blocked_make, None)
logger.info( "%d -> %d" %(len(models),len(clean_models)))

2022-04-28 15:13:46.527 | INFO     | __main__:<module>:4 - 1520 -> 1518


In [43]:
# Persist the cleaned make -> models lookup as recall_models.json in out_dir.
write_out("recall_models",clean_models)

2022-04-28 15:13:50.937 | INFO     | __main__:write_out:55 - Writing data to ../raw/Recalls_buffer/recall_models.json


## Step-4: Get all recalls for the selected Model Year, Make, Model

In [44]:
def _fetch_and_store_recalls(recalls_count, year, make, model):
    """Fetch the recalls of one (make, model, year) and store them if non-empty.

    Returns the (possibly incremented) file counter, or None when the API
    could not be reached; in that case the stop position has been logged and
    the alarm has been triggered.
    """
    clear_output()
    api = api_prefix + "/recalls/recallsByVehicle?make=%s&model=%s&modelYear=%d" % (make,model,int(year))
    # Fetch data.
    status, data = fetch_recalls(api)

    # Status equal -1 means we are failing to use this API at this moment.
    if status == -1:
        logger.warning("Stopped! (%s_%s_%s_%s)" % (recalls_count,year,make,model))
        alarm_clock()
        return None

    # Skip if data is empty
    if data["results"] != []:
        write_out_raw(recalls_count,year,make,model,data["results"])
        recalls_count += 1
    else:
        logger.info("Empty data")
    return recalls_count


def Get_recalls(recalls_count,years,makes,models,continue_):
    """Crawl the recalls of every (year, make, model) combination.

    Parameters
    ----------
    recalls_count : int
        Counter used as the prefix of the next output file; incremented after
        every file written.
    years : list
        Model years to walk, in order; must not be empty.
    makes : dict
        Model year -> list of makes.
    models : dict
        Make -> list of models.
    continue_ : bool
        When True, the crawl resumes inside the first year: the make and then
        the model to restart at are read from ``input()``. The resume make is
        crawled from the resume model onwards; every later make and every
        later year are crawled in full.

    Returns
    -------
    -1 when a request fails (the position is logged and the alarm is played),
    None after a complete run.
    """
    start_year = years[0]
    # Pre-bound so the final log line works even if no combination was visited.
    make = model = None
    for year in years:
        year_makes = makes[year]
        first_make = 0
        first_model = 0
        # "continue_" means we've suffered an interruption,
        # and we're going back to that position.
        if continue_:
            make_input = input("Make to resume from (model year %s): " % year)
            first_make = year_makes.index(make_input)
            model_input = input("Model to resume from (make %s): " % make_input)
            first_model = models[make_input].index(model_input)
            # Only the first year is resumed part-way.
            continue_ = False

        for offset, make in enumerate(year_makes[first_make:]):
            make_models = models[make]
            # Only the resume make starts part-way through its models; every
            # other make is crawled with its own, complete model list.
            if offset == 0:
                make_models = make_models[first_model:]
            for model in make_models:
                updated_count = _fetch_and_store_recalls(recalls_count, year, make, model)
                if updated_count is None:
                    return -1
                recalls_count = updated_count
    end_year = year
    logger.info("Finished (%s_%s_%s_%s) from %s to %s." % (recalls_count,year,make,model,start_year,end_year))

# Set an alert to remind me to reset when our IP is blocked.
def alarm_clock(file='alarm_music.mp3', duration=10):
    """Play an alert sound for ``duration`` seconds.

    Parameters
    ----------
    file : str, optional
        Sound file to play (default 'alarm_music.mp3').
    duration : float, optional
        Seconds to let the sound play before stopping it (default 10).

    The alert is best effort: if pygame cannot initialise audio or load the
    file, a warning is logged and the function returns, so the caller can
    still report the interruption.
    """
    try:
        pygame.init()
        pygame.mixer.init()
        pygame.mixer.music.load(file)
        pygame.mixer.music.play()
    except pygame.error as e:
        logger.warning("Could not play alarm sound %s: %s" % (file, e))
        return
    try:
        time.sleep(duration)
    finally:
        # Stop playback even if the wait is interrupted.
        pygame.mixer.music.stop()

In [None]:
# Fresh crawl over all model years: file counter starts at 0, no resume (continue_=False).
Get_recalls(0,years,makes,clean_models,False)

In [None]:
# Resume at model year 1967 with the file counter at 658; with continue_=True,
# Get_recalls reads the make and model to restart at from input().
start = years.index("1967")
Get_recalls(658,years[start:],makes,clean_models,True)

2022-04-28 20:04:04.906 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=PONTIAC&model=PHOENIX&modelYear=1971
2022-04-28 20:04:05.359 | INFO     | __main__:fetch_recalls:9 - 200


In [48]:
# Resume at model year 1988 with the file counter at 12586 (make and model are read from input()).
start = years.index("1988")
Get_recalls(12586,years[start:],makes,clean_models,True)

2022-04-29 11:05:16.713 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=GLAVAL&model=EASY ON&modelYear=1990
2022-04-29 11:05:16.811 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [54]:
# Resume at model year 1990 with the file counter at 13813 (make and model are read from input()).
start = years.index("1990")
Get_recalls(13813,years[start:],makes,clean_models,True)

2022-04-29 11:51:37.500 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=GLAVAL&model=EASY ON&modelYear=1991
2022-04-29 11:51:37.613 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [55]:
# Resume at model year 1991 with the file counter at 14125 (make and model are read from input()).
start = years.index("1991")
Get_recalls(14125,years[start:],makes,clean_models,True)

2022-04-29 13:20:52.110 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=GLAVAL&model=EASY ON&modelYear=1993
2022-04-29 13:20:52.220 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [60]:
# Resume at model year 1995 with the file counter at 15771 (make and model are read from input()).
start = years.index("1995")
Get_recalls(15771,years[start:],makes,clean_models,True)

2022-04-29 17:13:22.051 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=GLAVAL&model=EASY ON&modelYear=1996
2022-04-29 17:13:22.226 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [61]:
# Resume at model year 1996 with the file counter at 16150 (make and model are read from input()).
start = years.index("1996")
Get_recalls(16150,years[start:],makes,clean_models,True)

2022-04-29 18:12:55.240 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=GLAVAL&model=EASY ON&modelYear=1997
2022-04-29 18:12:55.348 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [64]:
# Resume at model year 1998 with the file counter at 17357 (make and model are read from input()).
start = years.index("1998")
Get_recalls(17357,years[start:],makes,clean_models,True)

2022-04-29 20:35:09.834 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=GLAVAL&model=EASY ON&modelYear=1999
2022-04-29 20:35:09.943 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [66]:
# Resume at model year 1999 with the file counter at 17946 (make and model are read from input()).
start = years.index("1999")
Get_recalls(17946,years[start:],makes,clean_models,True)

2022-04-29 21:26:34.109 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=GLAVAL&model=EASY ON&modelYear=2000
2022-04-29 21:26:34.274 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [161]:
# Resume at model year 2023 with the file counter at 32107 (make and model are read from input()).
start = years.index("2023")
Get_recalls(32107,years[start:],makes,clean_models,True)

2022-05-01 19:55:05.094 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=MG&model=MG&modelYear=9999
2022-05-01 19:55:05.354 | INFO     | __main__:fetch_recalls:9 - 200
2022-05-01 19:55:05.355 | INFO     | __main__:Get_recalls:67 - Empty data
2022-05-01 19:55:05.356 | INFO     | __main__:Get_recalls:69 - Finished (32108_9999_MG_MG) from 2023 to 9999.
