# Load, Aggregate And Clean Rating Data

In [1]:
import json
from glob import glob
import pandas as pd
import numpy as np
import itertools

from loguru import logger
from IPython.display import display, clear_output

## Path to rating dataset

In [2]:
ratings_dataset = "../Raw_Data/Raw_API/Ratings/Ratings"
output_path = '../Processed_Data/API/Ratings.pkl'

## Aggregate all json file in this directory.

In [3]:
files = glob(ratings_dataset+'/*.json')
len(files)

7314

In [4]:
data = []

for i, file in enumerate(files):
    
    clear_output()
    logger.info((i, file))
    
    with open(file, 'r') as f: data.append(json.load(f))
        
df = pd.DataFrame(itertools.chain.from_iterable(data))
df.shape

2022-04-26 19:25:17.075 | INFO     | __main__:<module>:6 - (7313, '../Raw_Data/Raw_API/Ratings/Ratings/[6390]2002_BMW_7 SERIES.json')


(7314, 34)

In [9]:
df.columns

Index(['ComplaintsCount', 'FrontCrashDriversideRating',
       'FrontCrashPassengersideRating', 'InvestigationCount', 'Make', 'Model',
       'ModelYear', 'NHTSAElectronicStabilityControl',
       'OverallFrontCrashRating', 'OverallRating', 'OverallSideCrashRating',
       'RecallsCount', 'RolloverPossibility', 'RolloverPossibility2',
       'RolloverRating', 'RolloverRating2', 'SideCrashDriversideRating',
       'SideCrashPassengersideRating', 'SidePoleCrashRating',
       'VehicleDescription', 'VehicleId',
       'combinedSideBarrierAndPoleRating-Front',
       'combinedSideBarrierAndPoleRating-Rear', 'dynamicTipResult',
       'sideBarrierRating-Overall', 'VehiclePicture', 'FrontCrashPicture',
       'FrontCrashVideo', 'SideCrashPicture', 'SideCrashVideo',
       'SidePolePicture', 'SidePoleVideo'],
      dtype='object')

## Drop some unused columns

In [32]:
df_clean = df.drop(['VehicleId',
                    'RolloverPossibility',
                    'RolloverPossibility2',
                    'combinedSideBarrierAndPoleRating-Front',
                    'combinedSideBarrierAndPoleRating-Rear',
                    'dynamicTipResult',
                    'VehiclePicture',
                    'FrontCrashPicture',
                    'FrontCrashVideo',
                    'SideCrashPicture',
                    'SideCrashVideo',
                    'SidePolePicture',
                    'SidePoleVideo'], axis=1)
df_clean.head()

Unnamed: 0,ComplaintsCount,FrontCrashDriversideRating,FrontCrashPassengersideRating,InvestigationCount,Make,Model,ModelYear,NHTSAElectronicStabilityControl,NHTSAForwardCollisionWarning,NHTSALaneDepartureWarning,...,OverallRating,OverallSideCrashRating,RecallsCount,RolloverRating,RolloverRating2,SideCrashDriversideRating,SideCrashPassengersideRating,SidePoleCrashRating,VehicleDescription,sideBarrierRating-Overall
0,56,4,4,3,MITSUBISHI,ECLIPSE,1992,No,No,No,...,Not Rated,Not Rated,3,Not Rated,Not Rated,Not Rated,Not Rated,Not Rated,1992 Mitsubishi Eclipse 2-DR.,Not Rated
1,0,Not Rated,Not Rated,1,BMW,6 SERIES,2017,Standard,No,No,...,Not Rated,Not Rated,1,Not Rated,Not Rated,Not Rated,Not Rated,Not Rated,2017 BMW 6 Series C RWD,Not Rated
2,1,Not Rated,Not Rated,0,BMW,X5 HYBRID,2021,Standard,Standard,Standard,...,Not Rated,Not Rated,1,Not Rated,Not Rated,Not Rated,Not Rated,Not Rated,2021 BMW X5 Hybrid SUV AWD,Not Rated
3,0,Not Rated,Not Rated,0,LEXUS,LS 500,2021,Standard,Standard,Standard,...,Not Rated,Not Rated,0,Not Rated,Not Rated,Not Rated,Not Rated,Not Rated,2021 Lexus LS 500 4 DR AWD,Not Rated
4,156,5,5,0,CHEVROLET,TRAVERSE,2015,Standard,Optional,Optional,...,5,5,3,4,Not Rated,5,5,5,2015 Chevrolet Traverse SUV AWD,5


## Output cleaned version

In [35]:
df_clean.to_pickle(output_path) 