This is a notebook that I have prepared to scrape safety data for selected cars from iihs.org. This is for a project that I am completing for my WGU masters program.

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import os
import re
regexalpha = re.compile('[^a-zA-Z]') #get letters only

In [4]:
def get_safety_items(page_in, carname):
    
    page = requests.get(page_in)
    
    soup = BeautifulSoup(page.content,'html.parser')
    
    #main object containing rates
    main = soup.find_all(id='main-copy')[0]
    
    #safety ratings
    front_driver = main.find_all(id='small-overlap-front-driver-side')[0].find_all(class_='rating-icon-block')[0].get_text()
    front_passenger = main.find_all(id='small-overlap-front-passenger-side')[0].find_all(class_='rating-icon-block')[0].get_text()
    moderate_overlap_front = main.find_all(id='moderate-overlap-front')[0].find_all(class_='rating-icon-block')[0].get_text()
    side = main.find_all(id='side')[0].find_all(class_='rating-icon-block')[0].get_text()
    roof_strength = main.find_all(id='roof-strength')[0].find_all(class_='rating-icon-block')[0].get_text()
    head_seats = main.find_all(id='head-restraints-and-seats')[0].find_all(class_='rating-icon-block')[0].get_text()
    child_seat_anchors = main.find_all(id='child-seat-anchors')[0].find_all(class_='rating-icon-block')[0].get_text()
    
    cardict= {'car_name': carname, "front_driver": front_driver, "front_passenger": front_passenger,'moderate_overlap_front':moderate_overlap_front,\
           'side':side, 'roof_strength':roof_strength, 'head_seats':head_seats, 'child_seat_anchors':child_seat_anchors}
    
    return pd.DataFrame([list(cardict.values())], columns = cardict.keys())
    

In [30]:
ford_edge_2019 = 'https://www.iihs.org/ratings/vehicle/ford/edge-4-door-suv/2019'
nissan_rogue_2019 = 'https://www.iihs.org/ratings/vehicle/Nissan/rogue-4-door-suv/2019'
subaru_forester = 'https://www.iihs.org/ratings/vehicle/Subaru/forester-4-door-suv/2019'
chevy_equinox = 'https://www.iihs.org/ratings/vehicle/Chevrolet/equinox-4-door-suv/2019'
mazda_cx_5_2019 = 'https://www.iihs.org/ratings/vehicle/Mazda/cx-5-4-door-suv/2019'
jeep_cherokee_2019 = 'https://www.iihs.org/ratings/vehicle/Mazda/cx-5-4-door-suv/2019'
ford_escape_2019 = 'https://www.iihs.org/ratings/vehicle/ford/escape-4-door-suv/2019'
toy_rav4_2019 = 'https://www.iihs.org/ratings/vehicle/Toyota/rav4-4-door-suv/2019'
hyundai_santafe_2019 = 'https://www.iihs.org/ratings/vehicle/Hyundai/santa-fe-4-door-suv/2019'
honda_crv_2019 = 'https://www.iihs.org/ratings/vehicle/Honda/cr-v-4-door-suv/2019'

carlist = [ford_edge_2019, nissan_rogue_2019, subaru_forester, chevy_equinox, \
          mazda_cx_5_2019, jeep_cherokee_2019, ford_escape_2019,  toy_rav4_2019, hyundai_santafe_2019, honda_crv_2019]

carnames = ['ford_edge_2019', 'nissan_rogue_2019', 'subaru_forester', 'chevy_equinox', \
          'mazda_cx_5_2019', 'jeep_cherokee_2019', 'ford_escape_2019',  'toy_rav4_2019', 'hyundai_santafe_2019', 'honda_crv_2019']

In [31]:
#set dataframe columns
columns =  get_safety_items(honda_crv_2019, 'honda_crv_2019').keys()

In [33]:
#gather safety ratings and write to a dataframe
car_safety_all = pd.DataFrame(columns = columns)

for i in range(len(carlist)):
    cardata = get_safety_items(carlist[i], carnames[i])
    car_safety_all = car_safety_all.append(cardata)
    
car_safety_all = car_safety_all.reset_index()[columns]
car_safety_all

Unnamed: 0,car_name,front_driver,front_passenger,moderate_overlap_front,side,roof_strength,head_seats,child_seat_anchors
0,ford_edge_2019,G,G,G,G,G,G,A
1,nissan_rogue_2019,G,A,G,G,G,G,A
2,subaru_forester,G,G,G,G,G,G,G
3,chevy_equinox,G,G,G,G,G,G,A
4,mazda_cx_5_2019,G,G,G,G,G,G,A
5,jeep_cherokee_2019,G,G,G,G,G,G,A
6,ford_escape_2019,A,P,G,G,G,G,M
7,toy_rav4_2019,G,G,G,G,G,G,G
8,hyundai_santafe_2019,G,G,G,G,G,G,A
9,honda_crv_2019,G,G,G,G,G,G,A


In [10]:
def conv_rating(rating):
    #convert letter ratings to numbers
    if rating == 'G':
        return 4
    if rating == 'A':
        return 3
    if rating == 'M':
        return 1 
    if rating == 'P':
        return 0   
    else:
        return rating

In [34]:
#convert letter ratings to numbers
for i in car_safety_all.columns[1:]:
    car_safety_all[i] = car_safety_all[i].apply(conv_rating)
    
car_safety_all.to_csv('car_safety.csv')
car_safety_all

Unnamed: 0,car_name,front_driver,front_passenger,moderate_overlap_front,side,roof_strength,head_seats,child_seat_anchors
0,ford_edge_2019,4,4,4,4,4,4,3
1,nissan_rogue_2019,4,3,4,4,4,4,3
2,subaru_forester,4,4,4,4,4,4,4
3,chevy_equinox,4,4,4,4,4,4,3
4,mazda_cx_5_2019,4,4,4,4,4,4,3
5,jeep_cherokee_2019,4,4,4,4,4,4,3
6,ford_escape_2019,3,0,4,4,4,4,1
7,toy_rav4_2019,4,4,4,4,4,4,4
8,hyundai_santafe_2019,4,4,4,4,4,4,3
9,honda_crv_2019,4,4,4,4,4,4,3
