In [3]:
###############
### Imports ###
###############

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re

In [None]:
#########################
### Gather Data Files ###
#########################


req = requests.get('https://vehiculeselectriques.gouv.qc.ca/english/rabais/ve-neuf/vehicules-neufs-admissibles.asp')
soup = BeautifulSoup(req.content, 'html.parser')
with open('Eligible vehicles.html') as Eliveh:
    new_soup = BeautifulSoup(Eliveh,'html.parser')
cars = soup.find_all("div", class_="infoVoiture")
fed_cars = new_soup.find('table')
fed_cars = fed_cars.findAll('tr', role='row')[1:]

In [None]:
###########################
### Load into Dataframe ###
###########################


fed_cars_df = pd.DataFrame()

for car in fed_cars:
    car_make = car.find('td', class_='sorting_1').text
    car_model = car.find('td', class_='sorting_3').text
    car_model_year = car.find('td', class_='sorting_2').text
    try:
        car_trim = car.find('td', headers='tbl8').text
        car_category = car.find('td', headers='tbl9').text
        car_rebate = car.find('td', headers='tbl12').text
        car_eligibility = car.find('td', headers = 'tbl16').text
    except:
        car_trim = ''
        car_category = ''
        car_rebate = ''
        car_eligibility = 'Empty'
    fed_cars_df = fed_cars_df.append({'Make':car_make,'Model':car_model, 'Trim':car_trim,'Rebate-Fed':car_rebate,'Year-Fed':car_model_year,'Category':car_category,'Eligibility':car_eligibility}, ignore_index=True)

cars_df = pd.DataFrame()
makes = ['Audi','BMW','BYD','Chevrolet','Chrysler','Ford','Honda','Hyundai','Jeep','Kia','Lexus','Lincoln','Mazda','Mini','Mitsubishi','Nissan','Polestar','Subaru','Tesla','Toyota','Volkswagen','Volvo']

for car in cars:
    car_category = car.findAll('p')[1].text[10:]
    if car_category in ['All-electric','Plug-in hybrid']:
        car_string = car.find('h2').text
        car_model_year = car.findAll('p')[0].text[15:-2]
        car_rebate = car.find('div', class_='rabais-achat').text[1:-1]
        for make in makes:
            if make in car_string:
                car_name = car_string[:len(make)]
                car_model = car_string[len(make)+1:]
                continue
        cars_df = cars_df.append({'Make':car_name,'Model':car_model,'Rebate-QC':car_rebate,'Year-QC':car_model_year,'Category':car_category}, ignore_index=True)

fed_cars_df.loc[fed_cars_df['Eligibility'] == 'Empty']

This is all the values that did not enter correctly from the web scraping

In [None]:
#####################################
### Manually Enter Missing Values ###
#####################################

veh_list = [12,27,30,32,41,42,63]

rebate_qc = ['$5000','$5000','$5000','$5000','$5000','$5000','$5000']
category = ['BEV','PHEV','PHEV','PHEV','PHEV','PHEV','PHEV']
trim = ['Base\n\t\t\ts','SE\n\t\t\tSEL\n\t\t\tTitanium','SE\n\t\t\tSEL\n\t\t\tTitanium','SE\n\t\t\tSEL\n\t\t\tTitanium','Preferred\n\t\t\tLuxury','Preferred\n\t\t\tLuxury','EX\n\t\t\tEX+\n\t\t\tSX']
eligibility = ['2020-01-16','2022-04-25','2022-04-25','2022-04-25','2022-04-25','2022-04-25','2022-04-25']

for ind, car in enumerate(veh_list):
    fed_cars_df.iloc[car,2] = trim[ind]
    fed_cars_df.iloc[car,3] = rebate_qc[ind]
    fed_cars_df.iloc[car,-2] = category[ind]
    fed_cars_df.iloc[car,-1] = eligibility[ind]

fed_cars_df.iloc[veh_list,:]

In [None]:
##################
###save to file###
##################

fed_cars_df.to_csv('fed_cars_df.csv')
cars_df.to_csv('cars_df.csv')

In [32]:
######################
### Load from file ###
######################

fed_cars_df = pd.read_csv('fed_cars_df.csv',index_col=0)
cars_df = pd.read_csv('cars_df.csv',index_col= 0)

In [None]:
def trimseparator(series,separator = '\n\t\t\t'):
    df = pd.DataFrame()
    str = series['Trim']
    res = str.split(separator)
    for ind in res:
        car_make = series['Make']
        car_model = series['Model']
        car_model_year = series['Year-Fed']
        car_category = series['Category']
        car_rebate = series['Rebate-Fed']
        car_eligibility = series['Eligibility']
        temp_dict={'Make':car_make,'Model':car_model, 'Trim':ind,'Rebate-Fed':car_rebate,'Year-Fed':car_model_year,'Category':car_category,'Eligibility':car_eligibility}
        df = df.append(temp_dict,ignore_index=True)
    return df

size = range(len(fed_cars_df))
drop_list = []
for ind in size:
    series = fed_cars_df.iloc[ind]
    if '\n\t\t\t' in series['Trim']:
        temp= trimseparator(series)
        fed_cars_df = fed_cars_df.append(temp, ignore_index=True)
for ind in size:
    series = fed_cars_df.iloc[ind]
    if '\n\t\t\t' in series['Trim']:
        drop_list.append(ind)
fed_cars_df.drop(drop_list,inplace=True)

drop_list = []
for ind in size:
    series = fed_cars_df.iloc[ind]
    if '\n' in series['Trim']:
        temp= trimseparator(series,'\n')
        fed_cars_df = fed_cars_df.append(temp, ignore_index=True)
for ind in size:
    series = fed_cars_df.iloc[ind]
    if '\n' in series['Trim']:
        drop_list.append(ind)
fed_cars_df.drop(drop_list,inplace=True)

def timeseparator(series,separator = '\n\t\t\t'):
    df = pd.DataFrame()
    str = series['Year-Fed']
    res = str.split(separator)
    for ind in res:
        car_make = series['Make']
        car_model = series['Model']
        car_trim = series['Trim']
        car_category = series['Category']
        car_rebate = series['Rebate-Fed']
        car_eligibility = series['Eligibility']
        temp_dict={'Make':car_make,'Model':car_model, 'Trim':car_trim,'Rebate-Fed':car_rebate,'Year-Fed':ind,'Category':car_category,'Eligibility':car_eligibility}
        df = df.append(temp_dict,ignore_index=True)
    return df

size = range(len(fed_cars_df))

for ind in size:
    series = fed_cars_df.iloc[ind]
    if '/' in series['Year-Fed']:
        if '/\n\t\t\t' in series['Year-Fed']:
            series['Year-Fed'] = series['Year-Fed'].replace('/\n\t\t\t', '\n\t\t\t')
        else:
            series['Year-Fed'] = series['Year-Fed'].replace('/', '\n\t\t\t')

drop_list = []
for ind in size:
    series = fed_cars_df.iloc[ind]
    if '\n\t\t\t' in series['Year-Fed']:
        temp= timeseparator(series)
        fed_cars_df = fed_cars_df.append(temp, ignore_index=True)
for ind in size:
    series = fed_cars_df.iloc[ind]
    if '\n\t\t\t' in series['Year-Fed']:
        drop_list.append(ind)
fed_cars_df.drop(drop_list,inplace=True)

drop_list = []
for ind in size:
    series = fed_cars_df.iloc[ind]
    if '\n' in series['Year-Fed']:
        temp= timeseparator(series,'\n')
        fed_cars_df = fed_cars_df.append(temp, ignore_index=True)
for ind in size:
    series = fed_cars_df.iloc[ind]
    if '\n' in series['Year-Fed']:
        drop_list.append(ind)
fed_cars_df.drop(drop_list,inplace=True)

The database is now separated by trim and year released.

In [33]:
size = range(len(cars_df))

for ind in size:
    series = cars_df.iloc[ind]
    if 'All-electric' in series['Category']:
        series['Category'] = 'BEV'
    else:
        series['Category'] = 'PHEV'

size = range(len(cars_df))

for ind in size:
    series = cars_df.iloc[ind]
    for word in [' PHEV', ' Plug-in Hybrid', ' Electric']:
        if word in series['Model']:
            series['Model'] = series['Model'].replace(word, '')


def model_formatter(series):
    models = ['i4', 'IONIQ 5', 'Model 3', '60 Recharge']
    for model in models:
        if model in series['Model']:
            return series['Model'].replace(model, model + ' – ')
    return series['Model']


cars_df.insert(2, 'Trim', np.zeros(51))
def model_splitter(series):
    if ' – ' in series['Model']:
        res = series['Model'].split(' – ')
        return res[1]
    else:
        return ''

def model_correction(series):
    if ' – ' in series['Model']:
        res = series['Model'].split(' – ')
        return res[0]
    else:
        return series['Model']

def trimseparator(series,separator):
    df = pd.DataFrame()
    str = series['Trim']
    res = re.split(separator, str)
    for ind in res:
        car_make = series['Make']
        car_model = series['Model']
        car_model_year = series['Year-QC']
        car_category = series['Category']
        car_rebate = series['Rebate-QC']
        temp_dict={'Make':car_make,'Model':car_model, 'Trim':ind,'Rebate-QC':car_rebate,'Year-QC':car_model_year,'Category':car_category}
        df = df.append(temp_dict,ignore_index=True)
    return df

cars_df['Model'] = cars_df.apply(model_formatter, axis=1)
cars_df['Trim'] = cars_df.apply(model_splitter, axis=1)
cars_df['Model'] = cars_df.apply(model_correction, axis=1)

# drop_list = []
# separators = ['or', ', ', ' and ']
# for ind in size:
#     series = cars_df.iloc[ind]
#     for separator in separators:
#         if separator in series['Trim']:
#             temp= trimseparator(series,separator)
#             cars_df = cars_df.append(temp, ignore_index=True)
# for ind in size:
#     series = cars_df.iloc[ind]
#     if '\n' in series['Trim']:
#         drop_list.append(ind)
# cars_df.drop(drop_list,inplace=True)

# def timeseparator(series,separator = ' - '):
#     df = pd.DataFrame()
#     str = series['Year-QC']
#     res = str.split(separator)
#     for ind in res:
#         car_make = series['Make']
#         car_model = series['Model']
#         car_category = series['Category']
#         car_rebate = series['Rebate-QC']
#         temp_dict={'Make':car_make,'Model':car_model,'Rebate-QC':car_rebate,'Year-QC':ind,'Category':car_category}
#         df = df.append(temp_dict,ignore_index=True)
#     return df

# drop_list = []
# for ind in size:
#     series = cars_df.iloc[ind]
#     if ' - ' in series['Year-QC']:
#         temp= timeseparator(series)
#         cars_df = cars_df.append(temp, ignore_index=True)
# for ind in size:
#     series = cars_df.iloc[ind]
#     if ' - ' in series['Year-QC']:
#         drop_list.append(ind)
# cars_df.drop(drop_list,inplace=True)



cars_df

Unnamed: 0,Make,Model,Trim,Rebate-QC,Year-QC,Category
0,Audi,Q4 e-tron,Komfort,$7 000,2022,BEV
1,BMW,X3,xDrive30e,$2 500,2020 - 2021 - 2022,PHEV
2,BMW,3 Series,330e or 330e xDrive,$2 500,2021 - 2022,PHEV
3,BMW,i3,,$7 000,2020 - 2021,BEV
4,BMW,i3 REx and i3s Rex,,$5 000,2020 - 2021,PHEV
5,BMW,i3s,,$7 000,2020 - 2021,BEV
6,BMW,i4,eDrive40,$7 000,2022,BEV
7,BYD,E6,,$7 000,2020,BEV
8,Chevrolet,Bolt EUV,,$7 000,2022 - 2023,BEV
9,Chevrolet,Bolt EV,,$7 000,2020 - 2021 - 2022 - 2023,BEV


In [20]:

def model_formatter(series):
    models = ['i4', 'IONIQ 5', 'Model 3', '60 Recharge']
    if ' - ' in series['Model']:
        return series['Model'].replace(' - ',' – ')
    for model in models:
        if model in series['Model']:
            return series['Model'].replace(model, model + ' – ')
    return series['Model']

cars_df.insert(2, 'Trim', np.zeros(51))
def model_splitter(series):
    if ' – ' in series['Model']:
        res = series['Model'].split(' – ')
        return res[1]
    else:
        return ''

def model_correction(series):
    if ' – ' in series['Model']:
        res = series['Model'].split(' – ')
        return res[0]
    else:
        return series['Model']

cars_df['Model'] = cars_df.apply(model_formatter, axis=1)
cars_df['Trim'] = cars_df.apply(model_splitter, axis=1)
cars_df['Model'] = cars_df.apply(model_correction, axis=1)


In [34]:
def trimseparator(series,separator):
    df = pd.DataFrame()
    str = series['Trim']
    res = re.split(separator, str)
    for ind in res:
        car_make = series['Make']
        car_model = series['Model']
        car_model_year = series['Year-QC']
        car_category = series['Category']
        car_rebate = series['Rebate-QC']
        temp_dict={'Make':car_make,'Model':car_model, 'Trim':ind,'Rebate-QC':car_rebate,'Year-QC':car_model_year,'Category':car_category}
        df = df.append(temp_dict,ignore_index=True)
    return df

for i in range(5):
    separators = [' or ', ', ', ' and ']
    for ind in size:
        series = cars_df.iloc[ind]
        for separator in separators:
            if separator in series['Trim']:
                temp= trimseparator(series,separator)
                cars_df = cars_df.append(temp, ignore_index=True)

In [35]:
cars_df.tail(45)

Unnamed: 0,Make,Model,Trim,Rebate-QC,Year-QC,Category
101,Hyundai,IONIQ 5,Essential,$7 000,2022,BEV
102,Hyundai,IONIQ 5,Preferred,$7 000,2022,BEV
103,Hyundai,IONIQ 5,Preferred Long Range or Preferred AWD Long Range,$7 000,2022,BEV
104,Jeep,Wrangler 4xe,Rubicon,$5 000,2021 - 2022,PHEV
105,Jeep,Wrangler 4xe,Sahara,$5 000,2021 - 2022,PHEV
106,Volkswagen,ID.4,Pro,$7 000,2021,BEV
107,Volkswagen,ID.4,AWD Pro,$7 000,2021,BEV
108,BMW,3 Series,330e,$2 500,2021 - 2022,PHEV
109,BMW,3 Series,330e xDrive,$2 500,2021 - 2022,PHEV
110,Ford,Escape,"SE, SEL",$2 500,2020 - 2021 - 2022,PHEV
