# Load, Aggregate And Clean Recalls Data

In [30]:
import json
from glob import glob
import pandas as pd
import numpy as np
import itertools

from loguru import logger
from IPython.display import display, clear_output

## Path to recalls dataset

In [33]:
recalls_dataset = "../Raw_Data/Raw_API/Recalls/Recalls"
output_path = '../Processed_Data/API/Recalls.pkl'

## Aggregate all json file in this directory.

In [34]:
files = glob(recalls_dataset+'/*.json')
len(files)

32146

In [35]:
data = []

for i, file in enumerate(files):
    
    clear_output()
    logger.info((i, file))
    
    with open(file, 'r') as f: data.append(json.load(f))
        
df = pd.DataFrame(itertools.chain.from_iterable(data))
df.shape

2022-05-01 20:20:22.920 | INFO     | __main__:<module>:6 - (32145, '../Raw_Data/Raw_API/Recalls/Recalls/[31587]2022_BUGATTI_CHIRON PUR SPORT.json')


(63599, 14)

In [36]:
df

Unnamed: 0,Component,Consequence,Make,Manufacturer,Model,ModelYear,NHTSACampaignNumber,Notes,Remedy,ReportReceivedDate,Summary,parkIt,parkOutSide,NHTSAActionNumber
0,POWER TRAIN:AXLE ASSEMBLY:AXLE SHAFT,,VOLVO,VOLVO OF AMERICA CORP.,245,1978,78V062000,VEHICLE DESCRIPTION: PASSENGER VEHICLES.SYSTEM...,THE DEALER WILL EITHER LUBRICATE THE BEARINGS ...,24/03/1978,THE REAR WHEEL BEARINGS ON THE INVOLVED VEHICL...,False,False,
1,SEAT BELTS:FRONT:ANCHORAGE,THE SEAT BELT BUCKLE ANCHORAGE CAN SEPARATE FR...,MONACO COACH,MONACO COACH CORPORATION,SIGNATURE,2004,04V080000,"MONACO RECALL NO. R04001. ALSO, CUSTOMERS CAN...",DEALERS WILL INSPECT THE BELT LATCH BOLT AND I...,17/02/2004,CERTAIN MOTORHOMES FAIL TO COMPLY WITH THE REQ...,False,False,
2,ENGINE AND ENGINE COOLING:ENGINE:DIESEL,REPROGRAMMING THE ECM WILL PREVENT THE POTENTI...,MONACO COACH,MONACO COACH CORPORATION,SIGNATURE,2004,06V182000,CUSTOMERS MAY ALSO CONTACT THE NATIONAL HIGHWA...,DETROIT DIESEL IS CONDUCTING THIS RECALL (PLEA...,25/05/2006,ON CERTAIN CLASS A MOTOR HOMES EQUIPPED WITH D...,False,False,
3,"FUEL SYSTEM, OTHER:DELIVERY:HOSES, LINES/PIPIN...",HOSE SEPARATION BETWEEN THE BOTTLE AND THE REG...,WESTERN,"WESTERN RV, INC.",ALPENLITE,1989,92V151000,SYSTEM: EQUIPMENT; LIQUID PETROLEUM GAS (LPG) ...,REPLACE THE TWO PIGTAIL HOSES ON EVERY VEHICLE.,20/10/1992,THE LPG PIGTAIL HOSE WHICH CONNECTS TO LPG BOT...,False,False,
4,STRUCTURE:BODY:DOOR,"IF THE WINDOW SEPARATES, THE OCCUPANT CAN FALL...",PREVOST,"PREVOST CAR, INCORPORATED",LE MIRAGE 96,1979,95V013000,SYSTEM: STRUCTURE; DOOR ASSEMBLY. VEHICLE DES...,DEALERS WILL INSTALL A RAIL IN THE LAVATORY CO...,03/01/1995,IF A PASSENGER TRIES TO EXIT THE LAVATORY COMP...,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63594,EQUIPMENT,"In the event of a fire, if the fire extinguish...",CROSSROADS,Keystone RV Company,CAMEO,2018,17V755000,Owners may also contact the National Highway T...,Keystone will notify owners and instruct them ...,22/11/2017,Keystone RV Company (Keystone) is recalling va...,False,False,
63595,"SERVICE BRAKES, AIR:DRUM:SHOES/LININGS",,GILLIG,GILLIG CORPORATION,GILLIG,1976,78V050000,VEHICLE DESCRIPTION: SCHOOL BUSES.SYSTEM: SERV...,DEALER WILL REPLACE DEFECTIVE BRAKE SHOE ASSEM...,09/03/1978,THE BRAKE SHOE WELDS ON REAR BRAKE ASSEMBLIES ...,False,False,
63596,AIR BAGS:FRONTAL,"IF THIS HAPPENS, THE CONTACT RING WIRING COULD...",BMW,BAYERISCHE MOTOREN WERKE,M5,1992,96V110000,SYSTEM: INTERIOR; PASSIVE RESTRAINT; AIR BAG; ...,DEALERS WILL REPLACE THE LOCKING TAB WITH ONE ...,01/07/1996,THE AIR BAG CONTACT RING LOCKING TAB LOCATED I...,False,False,PE94088
63597,ENGINE AND ENGINE COOLING:COOLING SYSTEM,IF A CRACK WERE TO FORM OR A SEAM WERE TO SEPA...,BMW,BAYERISCHE MOTOREN WERKE,M5,1992,98V178000,"OWNER NOTIFICATION BEGAN MARCH 31, 1999. OWN...",DEALERS WILL INSTALL A NEW DESIGN RADIATOR CAP...,04/08/1998,VEHICLE DESCRIPTION: PASSENGER VEHICLES. A M...,False,False,


## Drop some unused columns

In [37]:
df_clean = df.drop(['Consequence','NHTSACampaignNumber','Notes','Remedy','Summary','NHTSAActionNumber','parkIt','parkOutSide'], axis=1)
df_clean.head()

Unnamed: 0,Component,Make,Manufacturer,Model,ModelYear,ReportReceivedDate
0,POWER TRAIN:AXLE ASSEMBLY:AXLE SHAFT,VOLVO,VOLVO OF AMERICA CORP.,245,1978,24/03/1978
1,SEAT BELTS:FRONT:ANCHORAGE,MONACO COACH,MONACO COACH CORPORATION,SIGNATURE,2004,17/02/2004
2,ENGINE AND ENGINE COOLING:ENGINE:DIESEL,MONACO COACH,MONACO COACH CORPORATION,SIGNATURE,2004,25/05/2006
3,"FUEL SYSTEM, OTHER:DELIVERY:HOSES, LINES/PIPIN...",WESTERN,"WESTERN RV, INC.",ALPENLITE,1989,20/10/1992
4,STRUCTURE:BODY:DOOR,PREVOST,"PREVOST CAR, INCORPORATED",LE MIRAGE 96,1979,03/01/1995


## Convert date columns to Dateformat

In [38]:
df_clean['ReportReceivedDate'] = pd.to_datetime(df_clean['ReportReceivedDate'], format='%d/%m/%Y').dt.strftime("%Y")
df_clean.head()

Unnamed: 0,Component,Make,Manufacturer,Model,ModelYear,ReportReceivedDate
0,POWER TRAIN:AXLE ASSEMBLY:AXLE SHAFT,VOLVO,VOLVO OF AMERICA CORP.,245,1978,1978
1,SEAT BELTS:FRONT:ANCHORAGE,MONACO COACH,MONACO COACH CORPORATION,SIGNATURE,2004,2004
2,ENGINE AND ENGINE COOLING:ENGINE:DIESEL,MONACO COACH,MONACO COACH CORPORATION,SIGNATURE,2004,2006
3,"FUEL SYSTEM, OTHER:DELIVERY:HOSES, LINES/PIPIN...",WESTERN,"WESTERN RV, INC.",ALPENLITE,1989,1992
4,STRUCTURE:BODY:DOOR,PREVOST,"PREVOST CAR, INCORPORATED",LE MIRAGE 96,1979,1995


## Output cleaned version

In [39]:
df_clean.to_pickle(output_path) 