# Accessing the NHTSA API (crawler)
## Recall Data 

In [2]:
import requests
from pathlib import Path
import json
import pandas as pd
import urllib.request
import urllib.error
import time
from loguru import logger
import pygame
from IPython.display import display, clear_output

In [12]:
# Set API prefix because our api headers are same.
api_prefix = "https://api.nhtsa.gov"

# Set API suffix for get year.
api_get_year = "/products/vehicle/modelYears?issueType=r"

# Set output path.
dir_raw = Path("../raw/Recalls")
dir_raw.mkdir(parents=True, exist_ok=True)
out_dir = Path("../raw/Recalls_buffer")
out_dir.mkdir(parents=True, exist_ok=True)

In [33]:
def fetch_recalls(url):
    logger.info("Fetching %s" % url)
    try:
        headers = {
          'User-Agent':"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
        }
        response = requests.get(url,headers=headers)
        status = response.status_code
        logger.info(status)

        # Stop if the API cannot be accessed properly.
        if status != 200:
            return [-1,None]
        
        # Otherwise return retrieved data.
        raw_data = response.text
        return [1,json.loads(raw_data)]
    except urllib.error.URLError as e:
        e.reason 

# Output each recall for a json file.
def write_out_raw(recalls_count,year,make,model,data):
    # Check if '/' is already in make or model, replaced by '_or_'.
    if '/' in make:
        make = make.replace("/", "_or_")
    if '/' in model:
        model = model.replace("/", "_or_")
    # File name format [complaint_num]year_make_model.json
    #             e.g. [802]1994_MERCEDES BENZ_E.json
    fname = "[%d]%s_%s_%s.json" % (recalls_count,year,make,model)
    out_path = dir_raw / fname
    logger.info("Writing data to %s" % out_path)
    fout = open(out_path, "w")
    json.dump(data, fout, indent=4, sort_keys=True)
    fout.close()
    
# Check API status.
def check_status(data):
    #number of recall in this time.
    recall_num = data["count"]
    # Are results returned successfully?
    if data["message"] != "Results returned successfully":
        logger.warning("Fail to get from API, %s" % (data["status_message"]))
        return [False,-1]
    elif recall_num <= 0:
        logger.warning("No info from current API address.")
        return [False,0]
    else:
        return [True,recall_num]

# Output for "years", "makes", "models" which are dictionaries we get from first 3 steps.  
def write_out(name,data):
    fname = "%s.json" % (name)
    out_path = out_dir / fname
    logger.info("Writing data to %s" % out_path)
    fout = open(out_path, "w")
    json.dump(data, fout, indent=4, sort_keys=True)
    fout.close()

## Step-1: Get all Model Years 

In [11]:
# Construct API url.
api = api_prefix + api_get_year
# Fetch data.
status,data = fetch_recalls(api)
# Convert result data into list format.
years = (list(each["modelYear"] for each in data["results"]))
logger.info("Completed the task of getting all model years!")
# Store years data.
write_out("years",years)

2022-04-06 23:53:20.869 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/products/vehicle/modelYears?issueType=r
2022-04-06 23:53:21.227 | INFO     | __main__:fetch_recalls:9 - 200
2022-04-06 23:53:21.227 | INFO     | __main__:<module>:7 - Completed the task of getting all model years!
2022-04-06 23:53:21.228 | INFO     | __main__:write_out:55 - Writing data to ../raw/Recalls_buffer/years.json


## Step-2: Get all Makes for the Model Year

In [13]:
makes={}
for year in years:
    clear_output()
    # Construct API url.
    api = api_prefix + "/products/vehicle/makes?modelYear=%d&issueType=r" % (int(year))
    # Fetch data.
    status,data = fetch_recalls(api)
    # Convert result data into list format then store in a dict by year.
    makes[year] = (list(each["make"] for each in data["results"]))
logger.info("Completed the task of getting all makes for the model year!")
# Store makes data.
write_out("recall_makes",makes)

2022-04-06 23:55:07.139 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/products/vehicle/makes?modelYear=9999&issueType=r
2022-04-06 23:55:08.055 | INFO     | __main__:fetch_recalls:9 - 200
2022-04-06 23:55:08.056 | INFO     | __main__:<module>:10 - Completed the task of getting all makes for the model year!
2022-04-06 23:55:08.057 | INFO     | __main__:write_out:55 - Writing data to ../raw/Recalls_buffer/recall_makes.json


## Step-3: Get all Models for the Make and Model Year 
Due to the website's restrictions on crawlers, we have to change IP frequently to bypass its restrictions. The following function will automatically stop when it received an access error (Typically a 403 error) and return the data we got so far. And it will give us a hint for next start.

In [14]:
def Get_model(years,skip):
    models={}
    start_year = years[0]
    for year in years:
        for make in makes[year]:
            # If meet some special case we need skip it.
            if make != skip:
                clear_output()
                api = api_prefix + "/products/vehicle/models?modelYear=%d&make=%s&issueType=r" % (int(year),make)
                status,data = fetch_recalls(api)
            # Status equal -1 means we are failing to use this API at this moment.
            if status == -1:
                logger.warning("Start year:%s, Current Year:%s(index:%d) Current Make:%s(index:%d)" % (start_year,year,years.index(year),make,makes[year].index(make)))
                return models
            # Otherwise, convert result data into list format then store in a dict by make.
            else:
                models[make] = (list(each["model"] for each in data["results"]))
    end_year = year
    logger.info("Finished (%s-%s)" % (start_year,end_year))
    return models

In [15]:
models = Get_model(years,"")

2022-04-07 00:03:56.655 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/products/vehicle/models?modelYear=1975&make=CADILLAC&issueType=r
2022-04-07 00:03:56.843 | INFO     | __main__:fetch_recalls:9 - 403


In [22]:
start = years.index("1975")
end = years.index("1984")
model_1 = Get_model(years[start:end],"")

2022-04-07 09:45:44.208 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/products/vehicle/models?modelYear=1983&make=YAMAHA&issueType=r
2022-04-07 09:45:44.413 | INFO     | __main__:fetch_recalls:9 - 200
2022-04-07 09:45:44.414 | INFO     | __main__:Get_model:19 - Finished (1975-1983)


#### We cannot access url related to **"EMERGENCY ONE"**, therefore we skip it.

In [17]:
start = years.index("1984")
model_2 = Get_model(years[start:],"EMERGENCY ONE")

2022-04-07 01:13:30.072 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/products/vehicle/models?modelYear=9999&make=ZEMCO&issueType=r
2022-04-07 01:13:30.312 | INFO     | __main__:fetch_recalls:9 - 200
2022-04-07 01:13:30.313 | INFO     | __main__:Get_model:19 - Finished (1984-9999)


In [23]:
len(models),len(model_1),len(model_2)

(263, 259, 1485)

#### Define a function for merge two dictionary with out replacement & duplication.

In [24]:
def merge_2_dict(dict_1,dict_2):
    for key, value in dict_2.items():
        if key in dict_1:
            for v in value:
                if v not in dict_1[key]:
                    dict_1[key].append(v)
        else:
            dict_1[key] = value
    return dict_1

In [25]:
models = merge_2_dict(models,model_1)
models = merge_2_dict(models,model_2)
len(models)

1671

#### Delete pair if the value of this key is empty.

In [26]:
clean_models = models.copy()
for key in list(clean_models.keys()):
    if not clean_models.get(key):
        del clean_models[key]
len(clean_models)

1502

#### Delect special 403 case from **makes** and **models**.

In [27]:
#Retrieve back from file

#f = open('../raw/Complaints-Plus/makes.json')
#data_m = json.load(f)
#f.close()
#makes = data_m.copy()

In [28]:
makes_clean_403 = makes.copy()
for k,v in makes_clean_403.items():
    if "EMERGENCY ONE" in v:
        id = v.index("EMERGENCY ONE")
        del v[id]
logger.info( "%d -> %d" %(len(makes),len(makes_clean_403)))

2022-04-07 09:58:01.143 | INFO     | __main__:<module>:6 - 76 -> 76


In [29]:
models_clean_403 = models.copy()
del models_clean_403["EMERGENCY ONE"]
logger.info( "%d -> %d" %(len(models),len(models_clean_403)))

2022-04-07 09:58:10.452 | INFO     | __main__:<module>:3 - 1671 -> 1670


In [30]:
write_out("recall_models",models)

2022-04-07 09:58:13.426 | INFO     | __main__:write_out:55 - Writing data to ../raw/Recalls_buffer/recall_models.json


## Step-4: Get all recalls for the selected Model Year, Make, Model

In [31]:
def Get_recalls(recalls_count,years,makes,models,continue_):
    recalls_count = recalls_count
    start_year = years[0]
    for year in years:
        # "continue_" means we've suffered an interruption, 
        # and we're going back to that position.
        if continue_ : 
            #Get user input of "make" which want to continue.
            make_input = input()
            
            #Find correspond index position in the "makes" dict.
            position_make = makes[year].index(make_input)
            
            #Get user input of "model" which want to continue.
            model_input = input()
            
            #Find correspond index position in the "models" dict.
            position_model = models[make_input].index(model_input)
            
            # Find a list of rest of makes by current year. 
            make_position = makes[year].index(make_input)
            
            # Find a list of rest of models by current year & make. 
            model_position = models[make_input].index(model_input)

            for make in (makes[year])[make_position:]:
                for model in (models[make_input])[model_position:]:
                    
                    clear_output()
                    api = api_prefix + "/recalls/recallsByVehicle?make=%s&model=%s&modelYear=%d" % (make,model,int(year))
                    # Fetch data.
                    status, data = fetch_recalls(api)

                    # Status equal -1 means we are failing to use this API at this moment.
                    if status == -1:
                        logger.warning("Stopped! (%s_%s_%s_%s)" % (recalls_count,year,make,model))                    
                        alarm_clock()
                        return -1
                    
                    # Skip if data is empty
                    if data["results"] != []:
                        #Write out
                        write_out_raw(recalls_count,year,make,model,data["results"])
                        recalls_count += 1
                    else:
                        logger.info("Empty data")
            continue_ = False
        else:
            for make in makes[year]:
                for model in models[make]:
                    clear_output()
                    api = api_prefix + "/recalls/recallsByVehicle?make=%s&model=%s&modelYear=%d" % (make,model,int(year))
                    status, data = fetch_recalls(api)

                    # Status equal -1 means we are failing to use this API at this moment.
                    if status == -1:
                        logger.warning("Stopped! (%s_%s_%s_%s)" % (recalls_count,year,make,model)) 
                        alarm_clock()
                        return -1
                    
                    # Skip if data is empty
                    if data["results"] != []:
                        #Write out
                        write_out_raw(recalls_count,year,make,model,data["results"])
                        recalls_count += 1
                    else:
                        logger.info("Empty data")
    end_year = year
    logger.info("Finished (%s_%s_%s_%s) from %s to %s." % (recalls_count,year,make,model,start_year,end_year))

# Set an alert to remind me to reset when our IP is blocked.
def alarm_clock():
    file = 'alarm_music.mp3'
    pygame.init()
    pygame.mixer.init()
    pygame.mixer.music.load(file)
    pygame.mixer.music.play()
    time.sleep(10)
    pygame.mixer.music.stop()

In [34]:
Get_recalls(0,years,makes,models,False)

2022-04-07 10:08:22.943 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=GMC&model=P2500&modelYear=1963
2022-04-07 10:08:23.113 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [35]:
start = years.index("1963")
Get_recalls(151,years[start:],makes,models,True)

2022-04-07 10:18:36.296 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=MOTOR COACH&model=MC9&modelYear=1966
2022-04-07 10:18:36.660 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [36]:
start = years.index("1966")
Get_recalls(275,years[start:],makes,models,True)

2022-04-07 10:28:49.443 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=INTERNATIONAL HARVESTER&model=S1600&modelYear=1969
2022-04-07 10:28:49.616 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [37]:
start = years.index("1969")
Get_recalls(645,years[start:],makes,models,True)

2022-04-07 10:38:50.466 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=KENWORTH&model=W900B&modelYear=1971
2022-04-07 10:38:50.637 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [39]:
start = years.index("1971")
Get_recalls(1101,years[start:],makes,models,True)

2022-04-07 10:48:41.209 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=PREVOST&model=LE MIRAGE&modelYear=1973
2022-04-07 10:48:41.378 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [40]:
start = years.index("1973")
Get_recalls(1882,years[start:],makes,models,True)

2022-04-07 11:13:59.646 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=MERCEDES BENZ&model=450SL&modelYear=1975
2022-04-07 11:13:59.814 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [41]:
start = years.index("1975")
Get_recalls(2855,years[start:],makes,models,True)

2022-04-07 11:29:10.958 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=OSHKOSH&model=A1823&modelYear=1978
2022-04-07 11:29:11.126 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [42]:
start = years.index("1978")
Get_recalls(3907,years[start:],makes,models,True)

2022-04-07 11:39:02.514 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=GMC&model=TZE063&modelYear=1980
2022-04-07 11:39:02.685 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [43]:
start = years.index("1980")
Get_recalls(4434,years[start:],makes,models,True)

2022-04-07 11:53:57.667 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=EZ LOADER&model=ADJUSTABLE STEEL TRAILER&modelYear=1981
2022-04-07 11:53:57.841 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [45]:
start = years.index("1981")
Get_recalls(4541,years[start:],makes,models,True)

2022-04-07 12:04:13.028 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=TOYOTA&model=CELICA&modelYear=1983
2022-04-07 12:04:13.214 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [46]:
start = years.index("1983")
Get_recalls(5490,years[start:],makes,models,True)

2022-04-07 12:29:12.591 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=SAAB&model=99LE&modelYear=1985
2022-04-07 12:29:12.818 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [48]:
start = years.index("1985")
Get_recalls(6317,years[start:],makes,models,True)

2022-04-07 13:08:04.915 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=VOLVO&model=DC&modelYear=1987
2022-04-07 13:08:05.151 | INFO     | __main__:fetch_recalls:9 - 403


-1

In [49]:
start = years.index("1987")
Get_recalls(7013,years[start:],makes,models,True)

2022-04-07 13:19:18.098 | INFO     | __main__:fetch_recalls:2 - Fetching https://api.nhtsa.gov/recalls/recallsByVehicle?make=INTERNATIONAL HARVESTER&model=1210&modelYear=1989
2022-04-07 13:19:18.285 | INFO     | __main__:fetch_recalls:9 - 403


-1