In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# URL of the EV Database listing page that will be scraped.
# NOTE(review): everything after '#' is a URL fragment. Fragments are normally
# resolved client-side and not sent in the HTTP request, so the sort/filter/paging
# settings below presumably have no effect on the HTML that gets downloaded — confirm.
url = ('https://ev-database.org/#sort:path~type~order=.rank~number~desc|range-slider-range:prev~next=0~1200|range-slider-acceleration:prev~next=2~23|range-slider-topspeed:prev~next=110~450|range-slider-battery:prev~next=10~200|range-slider-towweight:prev~next=0~2500|range-slider-fastcharge:prev~next=0~1500|paging:currentPage=0|paging:number=9')

In [3]:
# Download the HTML page for the URL specified above.
# The timeout stops this cell from hanging indefinitely on an unresponsive server,
# and raise_for_status() surfaces HTTP errors (4xx/5xx) here instead of letting an
# error page be parsed later as if it were the car listing.
page = requests.get(url, timeout=30)
page.raise_for_status()

In [5]:
# Parse the downloaded page content with BeautifulSoup, using the "html.parser" parser
soup = BeautifulSoup(page.content, "html.parser")

The information for each individual car is found in a `div` with the class **list-item** in the main block of the ev-database HTML document.

In [7]:
# Collect every <div class="list-item"> element; each one holds the information for a single car
cars_list = soup.find_all("div", class_="list-item")

We'll walk through the extraction process for the first car in the list, the Tesla Model 3 Long Range Dual Motor, before automating it for every car.

In [87]:
# The car's name is the text of the <a class="title"> anchor inside its list item
carname1 = cars_list[0].find('a', class_='title').text  # name of the first car

In [88]:
carname1

'Tesla Model 3 Long Range Dual Motor'

In [86]:
# The <div class="subtitle"> of each car holds several pieces of text separated by '|'
# (the battery capacity is one of them); strip the text and split it on '|'
car1sub_title = cars_list[0].find("div", class_="subtitle").text.strip().split('|') # list of the subtitle parts
# Display the subtitle parts
car1sub_title

['Battery Electric Vehicle ', ' \n\t\t\t\t\t75 kWh *']

In [None]:
car1_bat_type = car1sub_title[0].strip() # first subtitle part, e.g. 'Battery Electric Vehicle'
car1_batt = car1sub_title[1].replace('*','').strip() # battery capacity with the '*' marker removed, e.g. '75 kWh'

In [11]:
# Each spec of the first car lives in its own <p class="left"> element; collect all of them
specs_list = cars_list[0].find_all('p', class_ = 'left')
specs_list

[<p class="left">
 <span class="tag">0 - 100</span>
 <span class="acceleration">4.4 sec</span>
 </p>,
 <p class="left">
 <span class="tag">Top Speed</span>
 <span class="topspeed">233 km/h</span>
 </p>,
 <p class="left">
 <span class="tag">Range</span>
 <span class="erange_real">485 km</span>
 </p>,
 <p class="left">
 <span class="tag">Efficiency</span>
 <span class="efficiency">155 Wh/km</span>
 </p>,
 <p class="left">
 <span class="tag">Fastcharge</span>
 <span class="fastcharge_speed_print">750 km/h</span>
 <span class="fastcharge_speed hidden">750</span>
 </p>]

In [12]:
# 0-100 acceleration of the first car, as a [label, value] list
zerotohund = specs_list[0].text.strip().split('\n')

In [13]:
# Top speed of the first car, as a [label, value] list
top_speed = specs_list[1].text.strip().split('\n')

In [14]:
# Range of the first car (distance travelled on a full charge), as a [label, value] list
car_range = specs_list[2].text.strip().split('\n')

In [15]:
# Energy efficiency rating of the first car (Wh/km), as a [label, value] list
efficiency = specs_list[3].text.strip().split('\n')

In [16]:
# Fast-charge speed of the first car (shown in km/h, i.e. a charging rate, not a time).
# This paragraph has an extra hidden span, so the list has a third item: the bare number.
fast_charge = specs_list[4].text.strip().split('\n')

In [17]:
# Build a {label: value} dict from every spec paragraph of the first car:
# each paragraph's text is split into lines, line 0 is the label and line 1 the value
specs_dict = {
    parts[0]: parts[1]
    for parts in (spec.text.strip().split('\n') for spec in specs_list)
}
    

In [18]:
specs_dict

{'0 - 100': '4.4 sec',
 'Top Speed': '233 km/h',
 'Range': '485 km',
 'Efficiency': '155 Wh/km',
 'Fastcharge': '750 km/h'}

In [19]:
# Extract the price elements of the first car; the title of each inner span states the
# country the price applies to and whether it is quoted before or after incentives
price_list = cars_list[0].find_all('span', class_ = 'price_buy')
price_list

[<span class="price_buy">
 <span class="country_de" title="Price in Germany before incentives">€62,465</span>
 <span class="flag-icon flag-icon-de"></span>
 </span>,
 <span class="price_buy">
 <span class="country_nl" title="Price in The Netherlands before incentives">€60,995</span>
 <span class="flag-icon flag-icon-nl"></span>
 </span>,
 <span class="price_buy">
 <span class="country_uk" title="Price in the United Kingdom after incentives">£57,490</span>
 <span class="flag-icon flag-icon-gb"></span>
 </span>]

In [20]:
# Label each extracted price of the first car by its position in the list
# (Germany, the Netherlands, the United Kingdom)
prices = ['price_in_germ_b4_incentives','price_in_neth_b4_incentives', 'price_in_uk_after_incentives']
prices_data = {
    prices[ind]: price.text.strip()
    for ind, price in enumerate(price_list)
}

In [21]:
prices_data

{'price_in_germ_b4_incentives': '€62,465',
 'price_in_neth_b4_incentives': '€60,995',
 'price_in_uk_after_incentives': '£57,490'}

In [22]:
# Towing capacity of the first car: text of the first <span class="towweight"> in its list item
towing_capacity = cars_list[0].find_all('span', class_ = 'towweight')[0].text

In [23]:
# Number of seats in the first car, read from the spans titled 'Number of seats'.
# NOTE(review): index 1 (the second match) is used, presumably because the first
# match does not carry the number itself — confirm against the page markup.
cars_list[0].find_all('span', title = 'Number of seats')[1].text

'5'

In [44]:
def get_car_prior_info(car):
    """Extract the name, electric type and battery capacity of a single car.

    args:
    car -> bs4 tag element, contains information about a single car

    result -> tuple of (car_name, car_batt, car_batt_capacity). car_batt is the
    first '|'-separated part of the subtitle (e.g. 'Battery Electric Vehicle') and
    car_batt_capacity the second part with any '*' removed (e.g. '74 kWh').
    Any piece that is missing from the markup is returned as None instead of
    raising, so one incomplete listing does not abort a whole scrape.
    """
    # Name of the vehicle: text of the <a class="title"> anchor
    title_tag = car.find('a', class_='title')
    car_name = title_tag.text.strip() if title_tag is not None else None

    # Subtitle text looks like 'Battery Electric Vehicle | 74 kWh *'
    subtitle_tag = car.find("div", class_="subtitle")
    subtitle_text = subtitle_tag.text.strip() if subtitle_tag is not None else ''
    car_sub_info = subtitle_text.split('|')

    # First part is the main subtitle heading; empty text becomes None
    car_batt = car_sub_info[0].strip() or None

    # Second part (when present) is the battery capacity
    car_batt_capacity = None
    if len(car_sub_info) > 1:
        car_batt_capacity = car_sub_info[1].replace('*', '').strip() or None

    return car_name, car_batt, car_batt_capacity

In [46]:
get_car_prior_info(cars_list[1])

('Kia EV6 GT', 'Battery Electric Vehicle', '74 kWh')

In [89]:
def get_car_specs(car):
    """Extract the performance specs of a single car as a {label: value} dict.

    args:
    car -> bs4 tag element, contains information about a single car

    result -> dict mapping each spec label (e.g. '0 - 100', 'Top Speed') to its
    displayed value (e.g. '4.4 sec'). Labels have any '*' marker and surrounding
    whitespace removed. Spec paragraphs without both a label and a value are skipped.
    """
    specs_dict = {}

    # Every spec sits in its own <p class="left">, with label and value on separate lines
    for spec in car.find_all('p', class_='left'):
        # Strip each line and drop blank ones so indentation in the markup is harmless
        parts = [line.strip() for line in spec.text.splitlines() if line.strip()]
        if len(parts) < 2:
            # No value to pair with the label; skip instead of raising IndexError
            continue
        # Strip after removing '*' so a label like 'Range *' becomes 'Range', not 'Range '
        label = parts[0].replace('*', '').strip()
        specs_dict[label] = parts[1]

    return specs_dict

In [90]:
get_car_specs(cars_list[3])

{'0 - 100': '6.1 sec',
 'Top Speed': '225 km/h',
 'Range': '380 km',
 'Efficiency': '151 Wh/km',
 'Fastcharge': '630 km/h'}

In [91]:
type(specs_list)

bs4.element.ResultSet

In [92]:
def get_car_prices(car: 'bs4.element.Tag'):
    """Extract the listed prices of a single car.

    args:
    car -> bs4 tag element, contains information about a single car

    result -> dict with the keys 'germ_price', 'neth_price' and 'uk_price'.
    Prices are assigned by position, so this assumes the price spans appear in the
    order Germany, Netherlands, United Kingdom (TODO confirm for every listing).
    All three keys are always present: a price with no matching span is None, and
    any spans beyond the third are ignored rather than raising IndexError.
    """
    price_names = ['germ_price', 'neth_price', 'uk_price']
    price_tags = car.find_all('span', class_='price_buy')

    # Start with every key set to None so callers can index all three safely
    prices_data = dict.fromkeys(price_names)

    # zip stops at the shorter sequence, so extra or missing spans cannot raise
    for name, tag in zip(price_names, price_tags):
        prices_data[name] = tag.text.strip()

    return prices_data

In [93]:
get_car_prices(cars_list[3])

{'germ_price': '€52,965', 'neth_price': '€52,995', 'uk_price': '£48,490'}

In [81]:
def get_car_extra_info(car):
    '''
    Extract the towing capacity and the number of seats of a single car.

    args:
    car -> bs4 tag element , contains information about a single car

    result -> tuple of (towing_capacity, num_of_seats), each the stripped text of
    the matching span, or None when the expected span is not present
    '''

    # Towing capacity: text of the first <span class="towweight">
    tow_tags = car.find_all('span', class_='towweight')
    towing_capacity = tow_tags[0].text.strip() if tow_tags else None

    # Number of seats: text of the SECOND span titled 'Number of seats'
    # (index 1 is kept from the original extraction logic)
    seat_tags = car.find_all('span', title='Number of seats')
    num_of_seats = seat_tags[1].text.strip() if len(seat_tags) > 1 else None

    return towing_capacity, num_of_seats

In [94]:
get_car_extra_info(cars_list[3])

('1000', '5')

In [82]:
def get_car_info(car: 'bs4.element.Tag'):
    """Collate all extracted information about a single car into one flat list.

    args:
    car -> bs4 tag element, contains information about a single car

    result -> list of 13 values in this order: name, electric type, battery
    capacity, 0-100 acceleration, top speed, range, efficiency, fastcharge,
    German price, Dutch price, UK price, towing capacity, number of seats.
    A spec or price that was not found for this car is None, so a single
    incomplete listing does not abort the whole scrape with a KeyError.
    """
    # Gather each group of fields with its dedicated helper
    car_name, car_batt, car_batt_cap = get_car_prior_info(car)
    car_specs = get_car_specs(car)
    car_prices = get_car_prices(car)
    towing_capacity, num_of_seats = get_car_extra_info(car)

    # Fixed label order keeps every row aligned with the DataFrame columns
    spec_labels = ['0 - 100', 'Top Speed', 'Range', 'Efficiency', 'Fastcharge']
    price_labels = ['germ_price', 'neth_price', 'uk_price']

    return [
        car_name, car_batt, car_batt_cap,
        *(car_specs.get(label) for label in spec_labels),
        *(car_prices.get(label) for label in price_labels),
        towing_capacity, num_of_seats,
    ]

In [83]:
type(cars_list[2])

bs4.element.Tag

In [66]:
# Run the extraction for every car on the page, collecting one list of fields per car
all_cars = [get_car_info(car) for car in cars_list]

In [96]:
#Displaying information for the first two cars 
all_cars[:2]

[['Tesla Model 3 Long Range Dual Motor',
  'Battery Electric Vehicle',
  '75 kWh',
  '4.4 sec',
  '233 km/h',
  '485 km',
  '155 Wh/km',
  '750 km/h',
  '€62,465',
  '€60,995',
  '£57,490',
  '1000',
  '5'],
 ['Kia EV6 GT',
  'Battery Electric Vehicle',
  '74 kWh',
  '3.5 sec',
  '260 km/h',
  '370 km',
  '200 Wh/km',
  '970 km/h',
  '€69,990',
  '€68,495',
  '£61,595',
  '1800',
  '5']]

In [69]:
# Column names for the cars DataFrame, in the same order as the values in each car's row
columns = [
    # identity and battery
    'car_name',
    'electric_type',
    'battery_capacity',
    # performance specs
    'zero_to_hundred',
    'top_speed',
    'range',
    'efficiency',
    'fastcharge',
    # prices per country
    'germ_price',
    'neth_price',
    'uk_price',
    # extras
    'tow_weight',
    'no_of_seats',
]

In [71]:
# Build the DataFrame: one row per car, labelled with the column names in `columns`
all_cars_df = pd.DataFrame(all_cars, columns=columns)

In [72]:
all_cars_df.head()

Unnamed: 0,car_name,electric_type,battery_capacity,zero_to_hundred,top_speed,range,efficiency,fastcharge,germ_price,neth_price,uk_price,tow_weight,no_of_seats
0,Tesla Model 3 Long Range Dual Motor,Battery Electric Vehicle,75 kWh,4.4 sec,233 km/h,485 km,155 Wh/km,750 km/h,"€62,465","€60,995","£57,490",1000,5
1,Kia EV6 GT,Battery Electric Vehicle,74 kWh,3.5 sec,260 km/h,370 km,200 Wh/km,970 km/h,"€69,990","€68,495","£61,595",1800,5
2,Tesla Model Y Long Range Dual Motor,Battery Electric Vehicle,75 kWh,5.0 sec,217 km/h,435 km,172 Wh/km,670 km/h,"€59,965","€66,995","£57,990",1600,5
3,Tesla Model 3,Battery Electric Vehicle,57.5 kWh,6.1 sec,225 km/h,380 km,151 Wh/km,630 km/h,"€52,965","€52,995","£48,490",1000,5
4,BMW i4 eDrive40,Battery Electric Vehicle,80.7 kWh,5.7 sec,190 km/h,470 km,172 Wh/km,730 km/h,"€59,200","€60,630","£53,480",1600,5


In [None]:
#Further Cleaning will be performed to remove units