# Using VCA data to identify BEVs - creating a master type database

We're going to try to use the VCA database to see whether we can use it to categorise our 1.4 million test ves-mot joined table identified in notebook number 6. 

To test whether this is possible, we'll start by downloading and tidying the VCA type approvals available on their website.

In [1633]:
#import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from zipfile import ZipFile
from tqdm import tqdm
import re

# visualising
import seaborn as sns

# Big Query
from google.oauth2 import service_account
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage

In [1634]:
#First we'll import the latest 2020 vehicle emissions data
url_d20 = 'https://carfueldata.vehicle-certification-agency.gov.uk/additional/2020/data%20for%20guide%202020.zip'
df_d20 = pd.read_csv(url_d20, encoding='cp1252')

In [1635]:
df_d20.head()

Unnamed: 0,Manufacturer,Model,Description,Transmission,Engine Capacity,Fuel Type,Powertrain,Engine Power (Kw),Engine Power (PS),Testing Scheme,...,Total cost / 10000 miles,Noise Level dB(A),Emissions CO [mg/km],THC Emissions [mg/km],Emissions NOx [mg/km],THC + NOx Emissions [mg/km],Particulates [No.] [mg/km],RDE NOx Urban,RDE NOx Combined,Unnamed: 45
0,ABARTH,595,595 1.4 145 BHP,M5,1368,Petrol,Internal Combustion Engine (ICE),107.0,145.0,WLTP,...,"£1,390",73.5,760.0,52.0,27.0,,,,,
1,ABARTH,595,595 1.4 145 BHP,M5,1368,Petrol,Internal Combustion Engine (ICE),107.0,145.0,WLTP,...,"£1,468",73.5,760.0,52.0,27.0,,,,,
2,ABARTH,595,595 1.4 TJET 145 BHP,M5,1368,Petrol,Internal Combustion Engine (ICE),107.0,145.0,WLTP,...,"£1,390",73.5,760.0,52.0,27.0,,,,,
3,ABARTH,595,595 1.4 TJET 145 BHP,M5,1368,Petrol,Internal Combustion Engine (ICE),107.0,145.0,WLTP,...,"£1,468",73.5,760.0,52.0,27.0,,,,,
4,ABARTH,595,595 1.4 TJET 145 BHP Convertible,M5,1368,Petrol,Internal Combustion Engine (ICE),107.0,145.0,WLTP,...,"£1,390",73.5,760.0,52.0,27.0,,,,,


In [1636]:
df_d20.columns

Index(['Manufacturer', 'Model', 'Description', 'Transmission',
       'Engine Capacity', 'Fuel Type', 'Powertrain', 'Engine Power (Kw)',
       'Engine Power (PS)', 'Testing Scheme', 'Euro Standard',
       'Diesel VED Supplement', 'Electric energy consumption Miles/kWh',
       'wh/km', 'Maximum range (Km)', 'Maximum range (Miles)',
       'WLTP Imperial Low', 'WLTP Imperial Medium', 'WLTP Imperial High',
       'WLTP Imperial Extra High', 'WLTP Imperial Combined',
       'WLTP Imperial Combined (Weighted)', 'WLTP Metric Low',
       'WLTP Metric Medium', 'WLTP Metric High', 'WLTP Metric Extra High',
       'WLTP Metric Combined', 'WLTP Metric Combined (Weighted)', 'WLTP CO2',
       'WLTP CO2 Weighted', 'Equivalent All Electric Range Miles',
       'Equivalent All Electric Range KM', 'Electric Range City Miles',
       'Electric Range City Km', 'Annual fuel Cost 10000 Miles',
       'Annual Electricity cost / 10000 miles', 'Total cost / 10000 miles',
       'Noise Level dB(A)', 'Emis

In [1637]:
df_d20.groupby(by=df_d20['Fuel Type']).count()

Unnamed: 0_level_0,Manufacturer,Model,Description,Transmission,Engine Capacity,Powertrain,Engine Power (Kw),Engine Power (PS),Testing Scheme,Euro Standard,...,Total cost / 10000 miles,Noise Level dB(A),Emissions CO [mg/km],THC Emissions [mg/km],Emissions NOx [mg/km],THC + NOx Emissions [mg/km],Particulates [No.] [mg/km],RDE NOx Urban,RDE NOx Combined,Unnamed: 45
Fuel Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Diesel,1338,1338,1338,1338,1338,1338,1338,1338,1338,1338,...,1338,1338,1274,660,1338,1269,1274,537,665,0
Diesel Electric,172,172,172,172,172,172,172,172,172,172,...,172,172,172,118,172,172,172,56,56,0
Electricity,43,43,43,0,43,43,21,21,43,43,...,43,43,19,9,9,2,0,1,7,0
Electricity / Petrol,94,94,94,94,94,94,94,94,94,94,...,94,94,94,94,94,44,7,24,36,0
Petrol,2527,2527,2527,2527,2527,2527,2525,2527,2527,2527,...,2527,2527,2408,2403,2526,438,775,1220,1338,0
Petrol Electric,705,705,705,705,705,705,705,705,705,705,...,705,705,705,705,705,457,319,63,63,0


## Check whether previous VCA databases have same headers

In [1638]:
#url_d20 was imported above
vca_urls = ['https://carfueldata.vehicle-certification-agency.gov.uk/additional/2020/data%20for%20guide%202020.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/2019/data%20for%20guide%202019.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/sept2018/September%202018%20data%20download.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/aug2017/download-data-for-Aug-2017-Euro-6.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/aug2016/download-data-for-Aug-2016-Euro-6.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/aug2015/download-data-for-Aug-2015-Euro-6.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/aug2015/download-data-for-Aug-2015-Euro-5.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/aug2014/download-data-for-Aug-2014-Euro-6.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/aug2014/download-data-for-Aug-2014-Euro-5.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/aug2013/download-data-for-Aug-2013-Euro-6.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/aug2013/download-data-for-Aug-2013-Euro-6.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/aug2012/download-data-for-Aug-2012-Euro-6.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/aug2012/download-data-for-Aug-2012-Euro-5.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/aug2012/download-data-for-Aug-2012-Euro-4.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/aug2011/download-data-for-Aug-2011-Euro-6.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/aug2011/download-data-for-Aug-2011-Euro-5.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/aug2011/download-data-for-Aug-2011-Euro-4.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2010/download-data-for-May-2010-Euro-4.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2010/download-data-for-May-2010-Euro-5.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2009/download-data-for-May-2009-Euro-4.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2009/download-data-for-May-2009-Euro-5.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2008/Part_A_Euro_IV_may2008.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2007/Part_A_Euro_IV_may2007.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2007/Part_B_Euro_III_may2007.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2006/Part_A_Euro_IV_may2006.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2006/Part_B_Euro_III_may2006.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2005/Part_A_Euro_IV_may2005.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2005/Part_B_Euro_III_may2005.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2004/Part_A_Euro_IV_may2004.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2004/Part_B_Euro_III_may2004.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2003/Part_A_Euro_IV_may2003.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2003/Part_B_Euro_III_may2003.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2002/Part_A_Euro_IV_may2002.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2002/Part_B_Euro_III_may2002.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2002/Part_C_Euro_II_may2002.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/july2001/Part_A_Euro_IV_july2001.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/july2001/Part_B_Euro_III_july2001.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/july2001/Part_C_Euro_II_july2001.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/january2001/Part_A_Euro_IV_jan2001.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/january2001/Part_B_Euro_III_jan2001.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/january2001/Part_C_Euro_II_jan2001.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/july2000/DataPartA_july2000.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/july2000/DataPartB_july2000.zip',
'https://carfueldata.vehicle-certification-agency.gov.uk/additional/july2000/DatapartC_july2000.zip']

# url_list = [url_20, url_19, url_18, url_17, url_16, url_15a, url_15b, url_14a, url_14b, url_13a, url_13b, url_12a, url_12b, url_12c, url_11a, url_11b, url_11c]

In [1639]:
df_dict = {}
failed_indexes = []

for i in tqdm(range(len(vca_urls))):
    try:
        df_dict[i] = pd.read_csv(vca_urls[i], encoding='cp1252', skip_blank_lines=True)
    except:
        failed_indexes.append(i)
        print(f"URL at index {i} failed to import")

 34%|███████████████████████████▉                                                      | 15/44 [00:02<00:03,  9.14it/s]

URL at index 14 failed to import


 39%|███████████████████████████████▋                                                  | 17/44 [00:02<00:03,  7.11it/s]

URL at index 15 failed to import
URL at index 16 failed to import


 66%|██████████████████████████████████████████████████████                            | 29/44 [00:04<00:01,  8.14it/s]

URL at index 27 failed to import


 77%|███████████████████████████████████████████████████████████████▎                  | 34/44 [00:04<00:01,  8.18it/s]

URL at index 33 failed to import


100%|██████████████████████████████████████████████████████████████████████████████████| 44/44 [00:05<00:00,  7.82it/s]


In [1640]:
for i in failed_indexes:
    print(i)
    print(vca_urls[i])

14
https://carfueldata.vehicle-certification-agency.gov.uk/additional/aug2011/download-data-for-Aug-2011-Euro-6.zip
15
https://carfueldata.vehicle-certification-agency.gov.uk/additional/aug2011/download-data-for-Aug-2011-Euro-5.zip
16
https://carfueldata.vehicle-certification-agency.gov.uk/additional/aug2011/download-data-for-Aug-2011-Euro-4.zip
27
https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2005/Part_B_Euro_III_may2005.zip
33
https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2002/Part_B_Euro_III_may2002.zip


In [1641]:
# some of the links wouldn't work, so we have had to manually download, save the excel files as .ods - bit annoying.
df_dict[14] = pd.read_excel('euro_data/Euro 6 snapshot.ods', engine='odf')
df_dict[15] = pd.read_excel('euro_data/Euro 5 snapshot.ods', engine='odf')
df_dict[16] = pd.read_excel('euro_data/Euro 4 snapshot.ods', engine='odf')
df_dict[27] = pd.read_csv('euro_data/Part_B_Euro_III_may2005.csv')
df_dict[33] = pd.read_csv('euro_data/Part_B_Euro_III_may2002.csv')
df_dict[19] = pd.read_csv('euro_data/May-2009-Euro-4.csv', encoding='cp1252')
df_dict[20] = pd.read_csv('euro_data/May-2009-Euro-5.csv', encoding = 'cp1252')

In [1642]:
print(len(df_dict))

44


There's a problem with dataframe number 26, we need to clean up some whitespace. Someone put a title section in a CSV...

In [1643]:
# df_dict[26].head()

In [1644]:
drop_cols = df_dict[26].iloc[:3, 20:].columns.to_list()
df_dict[26].drop(columns=drop_cols, inplace=True)

In [1645]:
# df_dict[26]
new_cols = df_dict[26].iloc[2]
df_dict[26] = df_dict[26][6:].reset_index(drop=True)
df_dict[26].columns = new_cols

# df_dict[26].dropna(axis = 0, inplace=True)

We also have a problem in df_dict[43], which has some empty column. We'll just delete these. 

In [1646]:
df_dict[43].drop(columns=['Line_2', 'Line_3'], inplace=True)

Again in 35...

In [1647]:
df_dict[35].drop(columns=['Line_2'], inplace=True)

And 41 & 42...

In [1648]:
df_dict[41].drop(columns=['Line_2', 'Line_3'], inplace=True)

In [1649]:
df_dict[42].drop(columns=['Line_2', 'Line_3'], inplace=True)

df_dicts 7-13 have an additional transmission type column, which we'll account for by inserting an additional, blank column in all the other dfs.

This must be done at column index 4, since it will come after the first three columns. It'll be 

In [1650]:
for i in range(0, 7):
    df_dict[i].insert(4, 'transmission type', np.nan)

for i in range(14, 44):
    df_dict[i].insert(4, 'transmission type', np.nan)

The very first dataframe, df_dict[0], contains a 'powertrain' column, which none of the others do. Let's insert it into the others, at index position 7.

In [1651]:
for i in range(1, 44):
    df_dict[i].insert(7, 'powertrain', np.nan)

The first two (0 and 1) df_dicts have two columns at column index position 8 and 9: engine power(kw) and engine power(ps). Let's insert those columns into the subsequent databases. They also have additional columns at 9: testing scheme.

In [1652]:
for i in range(2, 44):
    df_dict[i].insert(8, 'engine power(kw)', np.nan)
    df_dict[i].insert(9, 'engine power(ps)', np.nan)
    df_dict[i].insert(10, 'testing scheme', np.nan)

Now we have a bit more of a complicated problem. Most of the data frames have a euro standard column. In 17-24 and 26-27 (inclusive) we have no euro standard column. In everything else, we have a euro standard column, but in the wrong place. 

In [1653]:
for i in range(17, 28):
    print(i)
    print(vca_urls[i])

17
https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2010/download-data-for-May-2010-Euro-4.zip
18
https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2010/download-data-for-May-2010-Euro-5.zip
19
https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2009/download-data-for-May-2009-Euro-4.zip
20
https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2009/download-data-for-May-2009-Euro-5.zip
21
https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2008/Part_A_Euro_IV_may2008.zip
22
https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2007/Part_A_Euro_IV_may2007.zip
23
https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2007/Part_B_Euro_III_may2007.zip
24
https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2006/Part_A_Euro_IV_may2006.zip
25
https://carfueldata.vehicle-certification-agency.gov.uk/additional/may2006/Part_B_Euro_III_may2006.zip
26
https:

So we know that...
* 17, 19, 21, 24, 26 are euro 4
* 18, 20, are euro 5
* 23, 25, 27 are euro 3

In [1654]:
euro3 = [23, 27]
euro4 = [18, 20, 22]
euro5 = [17, 19, 21, 24, 26]
# print(len(euro3), len(euro4), len(euro5))

Let's insert a column at column index 11 in those dataframes, before moving on to sort out the order problem.

In [1655]:
for i in euro3:
    df_dict[i].insert(11, 'euro_standard', 3)

for i in euro4:
    df_dict[i].insert(11, 'euro_standard', 4)

for i in euro5:
    df_dict[i].insert(11, 'euro_standard', 5)

Ok, that's got us a bit further. In the remainng dfs, we have "Euro Standard" columns, and "Euro standard" columns. 

Let's make a list of those that have title case, euros and then iterate through them (exception 0 and 1), and add a new column at position 11, which is a copy of the later Euro Standard column. 

In [1656]:
euros = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 14, 15, 16, 25, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]

# This bit of code was run after columns dictionary was created later... oops
# euros = []
# for i in range(len(df_dict)):
#     if 'Euro Standard' in columns_dictionary[i]:
#         euros.append(i)

# print(euros)

In [1657]:
for i in euros[2:]:
    df_dict[i].insert(11, 'euro_standard', df_dict[i]['Euro Standard'])
    df_dict[i].drop(columns='Euro Standard', inplace=True)

dfs 11, 12, and 13's euro standard column is labelled as "Euro standard". Let's sort those out now:

In [1658]:
for i in [11, 12, 13]:
    df_dict[i].insert(11, 'euro_standard', df_dict[i]['Euro standard'])
    df_dict[i].drop(columns=['Euro standard'], inplace=True)

Dfs 0 and 1 have a "Diesel VED supplement column", which we'll now add to the rest. 

In [1659]:
for i in range(2, 44):
    df_dict[i].insert(12, 'diesel_ved_supplement', np.nan)

Dfs 11 and onwards do not have an "Electric energy consumption Miles/kWh" column, or a wh/km column. We also have "maximum range" or "electric range", which appear to mean the same thing. We will call this parameter "electric range", although this hides the fact that the parameter's name was changed in 2015. We believe nothing in the underlying data changed. We'll call those columns something that incorporates both names. 

In [1660]:
for i in range(11, 44):
    df_dict[i].insert(13, 'electric_energy_miles_per_kwh', np.nan)
    df_dict[i].insert(14, 'wh/km', np.nan)
    df_dict[i].insert(15, 'max_range_km', np.nan)
    df_dict[i].insert(16, 'max_range_mi', np.nan)

DFs 0 and 1 have a bunch of WLTP columns...

In [1661]:
for i in range(2, 44):
    df_dict[i].insert(17, 'imperial_tow', np.nan)
    df_dict[i].insert(18, 'WLTP Imperial Medium', np.nan)
    df_dict[i].insert(19, 'WLTP Imperial High', np.nan)
    df_dict[i].insert(20, 'WLTP Imperial Extra High', np.nan)
    df_dict[i].insert(21, 'WLTP Imperial Combined', np.nan)
    df_dict[i].insert(22, 'wltp_imperial_combined_weighted', np.nan)
    df_dict[i].insert(23, 'WLTP Metric Low', np.nan)
    df_dict[i].insert(24, 'WLTP Metric Medium', np.nan)
    df_dict[i].insert(25, 'WLTP Metric High', np.nan)
    df_dict[i].insert(26, 'WLTP Metric Extra High', np.nan)
    df_dict[i].insert(27, 'WLTP Metric Combined', np.nan)
    df_dict[i].insert(28, 'WLTP Metric Combined (Weighted)', np.nan)

DF 0 has a WLTP CO2 column at index 29, which appears to align to what DF1 has at index 30, going by the values in the column.

In [1662]:
df_dict[1].insert(29, 'WLTP CO2', df_dict[1]['WLTP CO2 g/km'])
df_dict[1].drop(columns=['WLTP CO2 g/km'], inplace=True)

Now insert an extral WLTP CO2 column in the rest:

In [1663]:
for i in range(2, 44):
    df_dict[i].insert(29, 'wltp_co2_gkm', np.nan)

Insert a column for WLTP CO2 weighted at index 30 in df[1] which uses values from df_dict[1]['WLTP CO2 Weighted'] column. 

In [1664]:
df_dict[1].insert(30, 'WLTP_CO2_weighted', df_dict[1]['WLTP CO2 Weighted'])
df_dict[1].drop(columns=['WLTP CO2 Weighted'], inplace=True)

In [1665]:
for i in range(2, 44):
    df_dict[i].insert(30, 'WLTP CO2 Weighted', np.nan)

Now we have to insert some columns in df_dict[0, 1] for the discontinued NEDC measures of co2. df_dict[1] has this down as something other than the subsequent dfs, making things yet more complicated. 

So let's start by inserting NEDC column into df_dict[0], and df_dict[2] and above.

In [1666]:
df_dict[0].insert(31, 'NEDC CO2 g/km', np.nan)

In [1667]:
for i in range(2, 44):
    df_dict[i].insert(31, 'NEDC CO2 g/km', df_dict[i]['CO2 g/km'])
    df_dict[i].drop(columns=['CO2 g/km'], inplace=True)

In [1668]:
for i in range(0, 2):
    df_dict[i].insert(32, 'Metric Urban (Cold)', np.nan)
    df_dict[i].insert(33, 'Metric Extra-Urban', np.nan)
    df_dict[i].insert(34, 'Metric Combined', np.nan)
    df_dict[i].insert(35, 'Imperial Urban (Cold)', np.nan)
    df_dict[i].insert(36, 'Imperial Extra-Urban', np.nan)
    df_dict[i].insert(37, 'Imperial Combined', np.nan)

df_dict[0, 1, 11, 12, 13] need a fuel cost of 12000 miles column at position 38 (separate to 'Total cost / 12000 miles', which appears later in other columns). 
* df_dict[0] has one called 'Annual fuel Cost 10,000 miles', which is annoying. We'll keep that separate to the 12,000 miles column for now, so it needs np.nans
* df_dict[1] has one called 'Fuel Cost 12000 Miles' later, so we'll copy that one

In [1669]:
df_dict[0].insert(38, 'fuel_cost_12000_mi', np.nan)
df_dict[1].insert(38, 'fuel_cost_12000_mi', df_dict[1]['Fuel Cost 12000 Miles'])
df_dict[1].drop(columns=['Fuel Cost 12000 Miles'], inplace=True)


df_dict[11, 12, 13] have a 'Fuel Cost 12000 Miles' column later on, we'll move that to position 38 and drop the original.

In [1670]:
for i in range(11, 14):
    df_dict[i].insert(38, 'fuel_cost_12000_mi', df_dict[i]['Fuel Cost 12000 Miles'])
    df_dict[i].drop(columns=['Fuel Cost 12000 Miles'], inplace=True)

We now have "electricity cost" and "electricity cost / 12000 miles", in df_dict[2-10 inclusive]. Manual inspection reveals that the entries in this column are the same. Df_dict[0, 1] have "Annual electricity cost / 10,000 miles", which seems to be different. We'll preserve that difference and create two separate columns filled with np.nans for now, but it looks like with some simple maths (multply by 1.2) you can get from one to the other. 

So we need a new column at 39 in df_dict[0, 1], and df_dict [11-end]

In [1671]:
df_dict[0].insert(39, 'electricity_cost_12000_miles', np.nan)
df_dict[1].insert(39, 'electricity_cost_12000_miles', df_dict[1]['Electricity cost'])
df_dict[1].drop(columns=['Electricity cost'], inplace=True)

And now for df_dict[12-end]



In [1672]:
for i in range(11, 44):
    df_dict[i].insert(39, 'electricity_cost_12000_miles', np.nan)

df_dict[2-10] have a 'total cost of 12,000 miles' column. We'll insert one in df_dict[0, 1 & 11-end]

In [1673]:
df_dict[0].insert(40, 'total_cost_12000_miles', np.nan)
df_dict[1].insert(40, 'total_cost_12000_miles', df_dict[1]['Total cost / 12000 miles'])
df_dict[1].drop(columns=['Total cost / 12000 miles'], inplace=True)

In [1674]:
for i in range(11,44):
    df_dict[i].insert(40, 'total_cost_12000_miles', np.nan)

df_dict[0, 1] have:
* 'Equivalent All Electric Range Miles' at 41
* 'Equivalent All Electric Range KM' at 42
* 'Electric Range City Miles' at 43
* 'Electric Range City KM' at 44

We'll insert those into the rest, which don't have those columns.

In [1675]:
for i in range(2, 44):
    df_dict[i].insert(41, 'equivalent_all_electric_range_mi', np.nan)
    df_dict[i].insert(42, 'equivalent_all_electric_range_km', np.nan)
    df_dict[i].insert(43, 'electric_city_range_mi', np.nan)
    df_dict[i].insert(44, 'electric_city_range_km', np.nan)

df_dict[0] has 'Annual fuel cost 10000 Miles', 'Annual Electricity cost / 10000 miles', 'Total cost / 10000 miles', which no one else has. We'll add that into range(1, 44). They need to go into column index 45, 46, 47. 

In [1676]:
for i in range(1, 44):
    df_dict[i].insert(45, 'annual_fuel_cost_10k_mi', np.nan)
    df_dict[i].insert(46, 'annual_electricity_cost_10k_mi', np.nan)
    df_dict[i].insert(47, 'total_cost_10k_mi', np.nan)

Column 48 is noise level dB(A) - and is the same across all dfs! Hurrah! No change needed.

Column 49 is emissions CO across all dfs, but the units change. df_dict[0 - 14 inclusive] have mg/km, but earlier they have g/km. We'll sort this out later, and fill in np.nans for now. 

In [1677]:
df_dict[16].insert(49, 'emission_co_mgkm', np.nan)
# df_dict[16].drop(columns=['Emissions CO [g/km]'], inplace=True)

for i in range(17, 26):
    df_dict[i].insert(49, 'emission_co_mgkm', np.nan)
#     df_dict[i].drop(columns=['Emissions CO'], inplace=True)

for i in range(28, 44):
    df_dict[i].insert(49, 'emission_co_mgkm', np.nan)
#     df_dict[i].drop(columns=['Emissions CO'], inplace=True)
    
for i in [26, 27]:
    df_dict[i].insert(49, 'emission_co_mgkm', np.nan)
#     df_dict[i].drop(columns=['Emissions CO    (g/km)'], inplace=True)

In colum  50, in df_dict[16-43] we have a emissions co g/km column. We'll create that now in df_dict[0-15].

In [1678]:
for i in range(0, 16):
    df_dict[i].insert(50, 'emissions_co_gkm', np.nan)

In column 51, we have:
* THCs mg/km in df_dict[0-15 inclusive]
* HCs g/km in df-dict[16-27 inclusive], and
* HCs g/km in column 51 in df_dict[28-43]. 

After some research, it seems that HCs and THCs are the same, but were just labelled differently. We can combine these columns, therefore. So, we need to insert a new column at position 50 in df_dict[28-43] which copies the column at 51 and then drops it. 

In [1679]:
for i in range(28, 44):
    df_dict[i].insert(51, 'emissions_hc_gkm', df_dict[i]['Emissions HC'])
    df_dict[i].drop(columns=['Emissions HC'], inplace=True)

Now we need to insert a new column in position 51 in df_dicts[16-43] inclusive that is converted to mg/km. We'll do the conversion later, and fill with np.nans for now. 

In [1680]:
for i in range(16, 44):
    df_dict[i].insert(51, 'emissions_hc_mgkm', np.nan)

Now need to insert a placeholder hc g/km column in 0-15 inclusive.

In [1681]:
for i in range(0, 16):
    df_dict[i].insert(52, 'emissions_hc_gkm', np.nan)

In 53 and 55, we've got some mixed up NOx and NOx + HCs. 

In columns 53, we have:
* NOx mg/km in dfs 0-15 inclusive
* NOx g/km in dfs 16-27
* NOx + HCs g/km in dfs 28-43

In column 54, we have:
* THC + NOx mg/km in 0-15
* HC + NOx g/km in 16-27
* NOx g/km in 28-43

So let's create columns:
* 52 - NOx mg/km - needs adding to 16-43
* 53 - NOx g/km - needs adding to 0-15, and realigning in 28-43
* 54 - THC + NOx mg/km needs adding to 16-43
* 55 - HC+NOx g/km - needs adding to 0-15, and realigning in 28-43

In [1682]:
for i in range(16, 44):
    df_dict[i].insert(53, 'emissions_nox_mgkm', np.nan)

In [1683]:
for i in range(0, 16):
    df_dict[i].insert(54, 'emissions_nox_gkm', np.nan)
    
for i in range(28, 44):
    df_dict[i].insert(54, 'emissions_nox_gkm', df_dict[i].iloc[:, 54])
    df_dict[i].drop(columns=['Emissions NOx'], inplace=True)


Now to add a column in 16-43 that will have THC_NOx mg/km for 16-43.

In [1684]:
for i in range(16, 44):
    df_dict[i].insert(55, 'emissions_thc+nox_mgkm', np.nan)

In [1685]:
for i in range(0, 16):
    df_dict[i].insert(56, 'emissions_thc+hox_gkm', np.nan)

At column index 56, we have particulates. In 0-15 this is measured in mg, after which it is measured in g. We'll add a column in the df_dict[16-43] frames for mg, then one in 0-15 with g. 

In [1686]:
for i in range(16, 44):
    df_dict[i].insert(57, 'pm_mgkm', np.nan)

In [1687]:
for i in range(0, 16):
    df_dict[i].insert(58, 'emissions_pm_gkm', np.nan)

At 58, we've got a column in df_dict[0-2 inclusive] which is called 'RDE NOx', also at 59 we've got 'RDE NOx Combined', we'll add those into df_dict[3-43]

In [1688]:
for i in range(3, 44):
    df_dict[i].insert(59, 'RDE_nox_urban', np.nan)
    df_dict[i].insert(50, 'RDE_nox_combined', np.nan)

At column 61, we have 'Unnamed' something in df_dict[0, 3, 6, 17, 18, 27, 33, 34]

In [1689]:
df_dict[0].drop(columns=['Unnamed: 45'], inplace=True)
df_dict[3].drop(columns=['Unnamed: 27'], inplace=True)
df_dict[6].drop(columns=['Unnamed: 27'], inplace=True)
df_dict[17].drop(columns=['Unnamed: 20'], inplace=True)
df_dict[18].drop(columns=['Unnamed: 20'], inplace=True)
df_dict[27].drop(columns=['Unnamed: 20'], inplace=True)
df_dict[33].drop(columns=['Unnamed: 21'], inplace=True)
df_dict[34].drop(columns=['Unnamed: 21'], inplace=True)

A bunch more Unnamed columns need dropping:

In [1690]:
df_dict[3].drop(columns=['Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33'], inplace=True)

In [1691]:
df_dict[6].drop(columns=['Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36'], inplace=True)

In [1692]:
df_dict[8].drop(columns=['Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40'], inplace=True)

In [1693]:
df_dict[11].drop(columns=['Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30'], inplace=True)

In [1694]:
df_dict[12].drop(columns=['Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30'], inplace=True)

In [1695]:
df_dict[13].drop(columns=['Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30'], inplace=True)

In [1696]:
df_dict[27].drop(columns=['Unnamed: 21'], inplace=True)
df_dict[29].drop(columns=['Unnamed: 22'], inplace=True)
df_dict[30].drop(columns=['Unnamed: 22'], inplace=True)
df_dict[31].drop(columns=['Unnamed: 22'], inplace=True)

DFs 7, 8, 9, and 10 have a 'Booklet' column... seems to refer to the meta data accompanying the booklet. We'll drop this.

In [1697]:
for i in range(7, 11):
    df_dict[i].drop(columns=['Booklet'], inplace=True)

DFs 11, 12, and 13 have a 'Date of change' column that's completely empty

In [1698]:
for i in range(11, 14):
    df_dict[i].drop(columns=['Date of change'], inplace=True)

In [1699]:
for i in range(28, 32):
    df_dict[i].drop(columns=['Date of change'], inplace=True)

DFs 7 - 13 inclusive have a 'VED band' column, presumably from the old VED regime. We'll drop that. Also a bunch of other outdated tax stuff.

In [1700]:
for i in range(7, 11):
    df_dict[i].drop(columns=['VED Band'], inplace=True)

for i in range(11, 14):
    df_dict[i].drop(columns=['Tax band', 'Standard 12 months', 'Standard 6 Months', '1st year 12 months', '1st year 6 months'], inplace=True)

### Finally, we're done sorting out the columns
We now have 59 columns in each df. They should pretty much align now. 

### Investigate columns, and find ways to concatenate these tables into one

In [1701]:
columns_dictionary = {}

for i in range(len(df_dict)):
    columns_dictionary[i] = df_dict[i].columns.to_list()

In [1702]:
columns_dictionary[7]

['Manufacturer',
 'Model',
 'Description',
 'Transmissions Type',
 'Transmission',
 'Engine Capacity',
 'Fuel Type',
 'powertrain',
 'engine power(kw)',
 'engine power(ps)',
 'testing scheme',
 'euro_standard',
 'diesel_ved_supplement',
 'Electric energy consumption Miles/kWh',
 'wh/km',
 'Maximum range (Km)',
 'Maximum range (Miles)',
 'imperial_tow',
 'WLTP Imperial Medium',
 'WLTP Imperial High',
 'WLTP Imperial Extra High',
 'WLTP Imperial Combined',
 'wltp_imperial_combined_weighted',
 'WLTP Metric Low',
 'WLTP Metric Medium',
 'WLTP Metric High',
 'WLTP Metric Extra High',
 'WLTP Metric Combined',
 'WLTP Metric Combined (Weighted)',
 'wltp_co2_gkm',
 'WLTP CO2 Weighted',
 'NEDC CO2 g/km',
 'Metric Urban (Cold)',
 'Metric Extra-Urban',
 'Metric Combined',
 'Imperial Urban (Cold)',
 'Imperial Extra-Urban',
 'Imperial Combined',
 'Fuel Cost 12000 Miles',
 'Electricity cost',
 'Total cost / 12000 miles',
 'equivalent_all_electric_range_mi',
 'equivalent_all_electric_range_km',
 'elec

In [1703]:
# Here we tidy up and make the column names more uniform. We'll use this dictionary to name our columns in the future. 

for i in range(len(columns_dictionary)):
    columns_dictionary[i][0] = 'make'
    columns_dictionary[i][1] = 'model'
    columns_dictionary[i][2] = 'description'
    columns_dictionary[i][3] = 'transmission'
    columns_dictionary[i][4] = 'transmission_type'
    columns_dictionary[i][5] = 'engine_capacity'
    columns_dictionary[i][6] = 'fuel_type'
    columns_dictionary[i][7] = 'powertrain'
    columns_dictionary[i][8] = 'engine_power_kw'
    columns_dictionary[i][9] = 'engine_power_ps'
    columns_dictionary[i][10] = 'testing_scheme'
    columns_dictionary[i][11] = 'euro_standard'
    columns_dictionary[i][12] = 'diesel_ved_supplement'
    columns_dictionary[i][13] = 'electric_energy_miles_per_kwh'
    columns_dictionary[i][15] = 'max_range_electric_km'
    columns_dictionary[i][16] = 'max_range_electric_mi'
    columns_dictionary[i][17] = 'wltp_imperial_tow'
    columns_dictionary[i][18] = 'wltp_imperial_medium'
    columns_dictionary[i][19] = 'wltp_imperial_high'
    columns_dictionary[i][20] = 'wltp_imperial_extra_high'
    columns_dictionary[i][21] = 'wltp_imperial_combined'
    columns_dictionary[i][22] = 'wltp_imperial_combined_weighted'
    columns_dictionary[i][23] = 'wltp_metric_low'
    columns_dictionary[i][24] = 'wltp_metric_medium'
    columns_dictionary[i][25] = 'wltp_metric_high'
    columns_dictionary[i][26] = 'wltp_metric_extra_high'
    columns_dictionary[i][27] = 'wltp_metric_combined'
    columns_dictionary[i][28] = 'wltp_metric_combined_weighted'
    columns_dictionary[i][29] = 'wltp_co2_gkm'
    columns_dictionary[i][30] = 'wltp_co2_weighted'
    columns_dictionary[i][31] = 'nedc_co2_gkm'
    columns_dictionary[i][32] = 'metric_urban_cold'
    columns_dictionary[i][33] = 'metric_extra_urban'
    columns_dictionary[i][34] = 'metric_combined'
    columns_dictionary[i][35] = 'imperial_urban_cold'
    columns_dictionary[i][36] = 'imperial_extra_urban'
    columns_dictionary[i][37] = 'imperial_combined'
    columns_dictionary[i][38] = 'fuel_cost_12000_mi'
    columns_dictionary[i][39] = 'electricity_cost_12000_mi'
    columns_dictionary[i][40] = 'total_cost_12000_mi'
    columns_dictionary[i][41] = 'equivalent_all_electric_range_mi'
    columns_dictionary[i][42] = 'equivalent_all_electric_range_km'
    columns_dictionary[i][43] = 'electric_range_city_mi'
    columns_dictionary[i][44] = 'electric_range_city_km'
    columns_dictionary[i][45] = 'annual_fuel_cost_10k_mi'
    columns_dictionary[i][46] = 'annual_electricity_cost_10k_mi'
    columns_dictionary[i][47] = 'total_cost_10k_mi'
    columns_dictionary[i][48] = 'noise_level_db(a)'
    columns_dictionary[i][49] = 'emissions_co_mgkm'
    columns_dictionary[i][50] = 'emissions_co_gkm'
    columns_dictionary[i][51] = 'emissions_thc_mgkm'
    columns_dictionary[i][52] = 'emissions_thc_gkm'
    columns_dictionary[i][53] = 'emissions_nox_mgkm'
    columns_dictionary[i][54] = 'emissions_nox_gkm'
    columns_dictionary[i][55] = 'emissions_thc+nox_mgkm'
    columns_dictionary[i][56] = 'emissions_thc+nox_gkm'
    columns_dictionary[i][57] = 'emissions_pm_mgkm'
    columns_dictionary[i][58] = 'emissions_pm_gkm'
    columns_dictionary[i][59] = 'emissions_rde_nox_urban'
    columns_dictionary[i][60] = 'emissions_rde_nox_combined'
    
    print(i)
    print(columns_dictionary[i][50:])

0
['emissions_co_gkm', 'emissions_thc_mgkm', 'emissions_thc_gkm', 'emissions_nox_mgkm', 'emissions_nox_gkm', 'emissions_thc+nox_mgkm', 'emissions_thc+nox_gkm', 'emissions_pm_mgkm', 'emissions_pm_gkm', 'emissions_rde_nox_urban', 'emissions_rde_nox_combined']
1
['emissions_co_gkm', 'emissions_thc_mgkm', 'emissions_thc_gkm', 'emissions_nox_mgkm', 'emissions_nox_gkm', 'emissions_thc+nox_mgkm', 'emissions_thc+nox_gkm', 'emissions_pm_mgkm', 'emissions_pm_gkm', 'emissions_rde_nox_urban', 'emissions_rde_nox_combined']
2
['emissions_co_gkm', 'emissions_thc_mgkm', 'emissions_thc_gkm', 'emissions_nox_mgkm', 'emissions_nox_gkm', 'emissions_thc+nox_mgkm', 'emissions_thc+nox_gkm', 'emissions_pm_mgkm', 'emissions_pm_gkm', 'emissions_rde_nox_urban', 'emissions_rde_nox_combined']
3
['emissions_co_gkm', 'emissions_thc_mgkm', 'emissions_thc_gkm', 'emissions_nox_mgkm', 'emissions_nox_gkm', 'emissions_thc+nox_mgkm', 'emissions_thc+nox_gkm', 'emissions_pm_mgkm', 'emissions_pm_gkm', 'emissions_rde_nox_urban'

Let's now use columns dictionary to rename our columns in every dataframe.

In [1704]:
for i in range(len(columns_dictionary)):
    df_dict[i].columns = columns_dictionary[i]

Let's add a column to our dataframes which contains the month and year that the types were issued.

In [1705]:
dates = []
for url in vca_urls:
    split = url.split('/')
    dates.append(split[4])
    print(split[4])

2020
2019
sept2018
aug2017
aug2016
aug2015
aug2015
aug2014
aug2014
aug2013
aug2013
aug2012
aug2012
aug2012
aug2011
aug2011
aug2011
may2010
may2010
may2009
may2009
may2008
may2007
may2007
may2006
may2006
may2005
may2005
may2004
may2004
may2003
may2003
may2002
may2002
may2002
july2001
july2001
july2001
january2001
january2001
january2001
july2000
july2000
july2000


In [1706]:
df_dict[9]['nedc_co2_gkm']

0       99
1      112
2      118
3      121
4      121
      ... 
328    122
329    139
330    139
331    148
332    149
Name: nedc_co2_gkm, Length: 333, dtype: int64

In [1707]:
vca_urls[10]

'https://carfueldata.vehicle-certification-agency.gov.uk/additional/aug2013/download-data-for-Aug-2013-Euro-6.zip'

In [1708]:
for i in range(0, 44):
    df_dict[i]['year'] = dates[i]

In [1709]:
df_dict[0]

Unnamed: 0,make,model,description,transmission,transmission_type,engine_capacity,fuel_type,powertrain,engine_power_kw,engine_power_ps,...,emissions_thc_gkm,emissions_nox_mgkm,emissions_nox_gkm,emissions_thc+nox_mgkm,emissions_thc+nox_gkm,emissions_pm_mgkm,emissions_pm_gkm,emissions_rde_nox_urban,emissions_rde_nox_combined,year
0,ABARTH,595,595 1.4 145 BHP,M5,,1368,Petrol,Internal Combustion Engine (ICE),107.0,145.0,...,,27.0,,,,,,,,2020
1,ABARTH,595,595 1.4 145 BHP,M5,,1368,Petrol,Internal Combustion Engine (ICE),107.0,145.0,...,,27.0,,,,,,,,2020
2,ABARTH,595,595 1.4 TJET 145 BHP,M5,,1368,Petrol,Internal Combustion Engine (ICE),107.0,145.0,...,,27.0,,,,,,,,2020
3,ABARTH,595,595 1.4 TJET 145 BHP,M5,,1368,Petrol,Internal Combustion Engine (ICE),107.0,145.0,...,,27.0,,,,,,,,2020
4,ABARTH,595,595 1.4 TJET 145 BHP Convertible,M5,,1368,Petrol,Internal Combustion Engine (ICE),107.0,145.0,...,,27.0,,,,,,,,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4874,VOLVO,"XC90, MY21","B6 AWD R-Design 20"" Alloy",8A-AWD,,1969,Petrol Electric,Hybrid Electric Vehicle (HEV),220.0,300.0,...,,31.0,,52.0,,0.37,,,,2020
4875,VOLVO,"XC90, MY21","B6 AWD R-Design 22"" Alloy",8A-AWD,,1969,Petrol Electric,Hybrid Electric Vehicle (HEV),220.0,300.0,...,,31.0,,52.0,,0.37,,,,2020
4876,VOLVO,"XC90, MY21","B6 AWD R-Design 22"" Alloy",8A-AWD,,1969,Petrol Electric,Hybrid Electric Vehicle (HEV),220.0,300.0,...,,31.0,,52.0,,0.37,,,,2020
4877,VOLVO,"XC90, MY21","B6 AWD R-Design Pro 22"" Alloy",8A-AWD,,1969,Petrol Electric,Hybrid Electric Vehicle (HEV),220.0,300.0,...,,31.0,,52.0,,0.37,,,,2020


Now let's create a list of our dfs

In [1710]:
df_list = []
for i in range(0, 44):
    df_list.append(df_dict[i])

In [1711]:
master_df = pd.concat(df_list).reset_index(drop=True)

In [1712]:
master_df.shape

(80999, 62)

In [1713]:
master_df.columns

Index(['make', 'model', 'description', 'transmission', 'transmission_type',
       'engine_capacity', 'fuel_type', 'powertrain', 'engine_power_kw',
       'engine_power_ps', 'testing_scheme', 'euro_standard',
       'diesel_ved_supplement', 'electric_energy_miles_per_kwh', 'wh/km',
       'max_range_electric_km', 'max_range_electric_mi', 'wltp_imperial_tow',
       'wltp_imperial_medium', 'wltp_imperial_high',
       'wltp_imperial_extra_high', 'wltp_imperial_combined',
       'wltp_imperial_combined_weighted', 'wltp_metric_low',
       'wltp_metric_medium', 'wltp_metric_high', 'wltp_metric_extra_high',
       'wltp_metric_combined', 'wltp_metric_combined_weighted', 'wltp_co2_gkm',
       'wltp_co2_weighted', 'nedc_co2_gkm', 'metric_urban_cold',
       'metric_extra_urban', 'metric_combined', 'imperial_urban_cold',
       'imperial_extra_urban', 'imperial_combined', 'fuel_cost_12000_mi',
       'electricity_cost_12000_mi', 'total_cost_12000_mi',
       'equivalent_all_electric_range_mi

In [1714]:
master_df[master_df['fuel_type'] == 'Electricity']

Unnamed: 0,make,model,description,transmission,transmission_type,engine_capacity,fuel_type,powertrain,engine_power_kw,engine_power_ps,...,emissions_thc_gkm,emissions_nox_mgkm,emissions_nox_gkm,emissions_thc+nox_mgkm,emissions_thc+nox_gkm,emissions_pm_mgkm,emissions_pm_gkm,emissions_rde_nox_urban,emissions_rde_nox_combined,year
290,CITROEN,New C4,100kW Electric Vehicle with 50kWh battery,,,0,Electricity,Battery Electric Vehicle (BEV) / Pure Electric...,100.0,136.0,...,,,,,,,,,0.0,2020
291,CITROEN,New C4,100kW Electric Vehicle with 50kWh battery,,,0,Electricity,Battery Electric Vehicle (BEV) / Pure Electric...,100.0,136.0,...,,,,,,,,,0.0,2020
304,CITROEN,SpaceTourer,50KWh Electric Vehicle,,,0,Electricity,Battery Electric Vehicle (BEV) / Pure Electric...,100.0,136.0,...,,,,,,,,,0.0,2020
305,CITROEN,SpaceTourer,50KWh Electric Vehicle,,,0,Electricity,Battery Electric Vehicle (BEV) / Pure Electric...,100.0,136.0,...,,,,,,,,,0.0,2020
324,DS,DS 3 CROSSBACK,E-TENSE,,,0,Electricity,Battery Electric Vehicle (BEV) / Pure Electric...,,,...,,,,,,,,,0.0,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33411,VOLKSWAGEN,UP,e-UP,,,,Electricity,,,,...,,,,,,,,,,aug2014
37380,MITSUBISHI,i-MiEV,i-MiEV,,,,Electricity,,,,...,,,,,,,,,,aug2013
37381,NISSAN,Leaf,Leaf,,,,Electricity,,,,...,,,,,,,,,,aug2013
37713,MITSUBISHI,i-MiEV,i-MiEV,,,,Electricity,,,,...,,,,,,,,,,aug2013


### Sort out data type issues
We have lots of numeric datatypes as objects.

In [1715]:
# A handy function to remove any pesky letters that have been added into our data's numerical columns.

def strip_letters(string):
    if isinstance(string, str):
        string = re.sub(r'[a-zA-Z]', '', string)
    else:
        pass
    return string

In [1716]:
master_df['engine_capacity'] = master_df['engine_capacity'].apply(lambda x: strip_letters(x))

In [1717]:
master_df['engine_capacity'] = pd.to_numeric(master_df['engine_capacity'])

In [1718]:
master_df['engine_power_kw'] = pd.to_numeric(master_df['engine_power_kw'])

In [1719]:
master_df['engine_power_ps'] = pd.to_numeric(master_df['engine_power_ps'])

In [1720]:
master_df['euro_standard'].unique()

array(['Euro 6d-TEMP', 'Euro 6d', 'Euro 6c', 'Euro 6-WLTP', 'Euro 6-NEDC',
       'Euro 6b', '6', '6b', '6c', '6d(TEMP)', 6, 5, nan, 4, 3, 'III',
       'IV', '536', '543', 'II'], dtype=object)

In [1721]:
# A function to clean up euro standards, to group them as follows:
# Euro 6d-TEMP, 6d(TEMP) => 6d TEMP
# Euro 6d => 6d
# Euro 6c, 6c, => 6c
# Euro 6-WLTP => 6 WLTP
# Euro 6-NEDC => 6 NEDC
# Euro 6b, 6b => 6b
# '6' = > 6
# 'III', '536', '543' => 3
# 'IV' => 4
# 'II' => 2

def euro_cleaner(string):
    euro6dtemp = ['Euro 6d-TEMP', '6d(TEMP)']
    euro6d = ['Euro 6d']
    euro6c = ['Euro 6c', '6c']
    euro6wltp = ['Euro 6-WLTP']
    euro6nedc = ['Euro 6-NEDC']
    euro6b = ['Euro 6b', '6b']
    euro6 = ['6']
    euro4 = ['IV']
    euro3 = ['III']
    euro2 = ['II']
    if isinstance(string, str):
        if string in euro6dtemp:
            return '6d TEMP'
        elif string in euro6d:
            return '6d'
        elif string in euro6c:
            return '6c'
        elif string in euro6wltp:
            return '6 WLTP'
        elif string in euro6nedc:
            return '6 NEDC'
        elif string in euro6b:
            return '6b'
        elif string in euro6:
            return 6
        elif string in euro4:
            return 4
        elif string in euro3:
            return 3
        elif string in euro2:
            return 2
        else:
            return string

# Manually investigated 543 and 536. They are the result of columns being out of order... they are in fact Euro 3. 

In [1722]:
master_df['euro_standard'] = master_df['euro_standard'].apply(lambda x: euro_cleaner(x))

In [1723]:
master_df.columns

Index(['make', 'model', 'description', 'transmission', 'transmission_type',
       'engine_capacity', 'fuel_type', 'powertrain', 'engine_power_kw',
       'engine_power_ps', 'testing_scheme', 'euro_standard',
       'diesel_ved_supplement', 'electric_energy_miles_per_kwh', 'wh/km',
       'max_range_electric_km', 'max_range_electric_mi', 'wltp_imperial_tow',
       'wltp_imperial_medium', 'wltp_imperial_high',
       'wltp_imperial_extra_high', 'wltp_imperial_combined',
       'wltp_imperial_combined_weighted', 'wltp_metric_low',
       'wltp_metric_medium', 'wltp_metric_high', 'wltp_metric_extra_high',
       'wltp_metric_combined', 'wltp_metric_combined_weighted', 'wltp_co2_gkm',
       'wltp_co2_weighted', 'nedc_co2_gkm', 'metric_urban_cold',
       'metric_extra_urban', 'metric_combined', 'imperial_urban_cold',
       'imperial_extra_urban', 'imperial_combined', 'fuel_cost_12000_mi',
       'electricity_cost_12000_mi', 'total_cost_12000_mi',
       'equivalent_all_electric_range_mi

In [1724]:
master_df['electric_energy_miles_per_kwh'] = master_df['electric_energy_miles_per_kwh'].apply(lambda x: pd.to_numeric(x))

In [1725]:
master_df['wh/km'] = pd.to_numeric(master_df['wh/km'] )

In [1726]:
master_df['max_range_electric_km'] = pd.to_numeric(master_df['max_range_electric_km'] )

In [1727]:
master_df['max_range_electric_mi'] = pd.to_numeric(master_df['max_range_electric_mi'] )

In [1728]:
master_df['wltp_imperial_tow'] = pd.to_numeric(master_df['wltp_imperial_tow'])

In [1729]:
columns_to_num = ['wltp_imperial_tow',
       'wltp_imperial_medium', 'wltp_imperial_high',
       'wltp_imperial_extra_high', 'wltp_imperial_combined',
       'wltp_imperial_combined_weighted', 'wltp_metric_low',
       'wltp_metric_medium', 'wltp_metric_high', 'wltp_metric_extra_high',
       'wltp_metric_combined', 'wltp_metric_combined_weighted', 'wltp_co2_gkm',
       'wltp_co2_weighted', 'nedc_co2_gkm', 'metric_urban_cold',
       'metric_extra_urban', 'metric_combined', 'imperial_urban_cold',
       'imperial_extra_urban', 'imperial_combined', 'fuel_cost_12000_mi',
       'electricity_cost_12000_mi', 'total_cost_12000_mi',
       'equivalent_all_electric_range_mi', 'equivalent_all_electric_range_km',
       'electric_range_city_mi', 'electric_range_city_km',
       'annual_fuel_cost_10k_mi', 'annual_electricity_cost_10k_mi',
       'total_cost_10k_mi', 'noise_level_db(a)', 'emissions_co_mgkm',
       'emissions_co_gkm', 'emissions_thc_mgkm', 'emissions_thc_gkm',
       'emissions_nox_mgkm', 'emissions_nox_gkm', 'emissions_thc+nox_mgkm',
       'emissions_thc+nox_gkm', 'emissions_pm_mgkm', 'emissions_pm_gkm',
       'emissions_rde_nox_urban', 'emissions_rde_nox_combined']

In [1730]:
def remove_dots(string):
    if isinstance(string, str):
        string = string.replace('..', '.')
        string = string.replace('Diesel', '')
    else:
        pass
    return string


In [1731]:
master_df['metric_urban_cold'] = master_df['metric_urban_cold'].apply(lambda x: remove_dots(x))

In [1732]:
def cost_cleaner(string):
    if isinstance(string, str):
        string = string.replace('£', '')
        string = string.replace(',', '')
        string = string.replace(' ', '')
        string = string.replace('?', '')
        string = string.replace('#REF!', '')
        string = string.replace('NotAvailable', '')
    else:
        pass
    return string

In [1733]:
master_df['fuel_cost_12000_mi'] = master_df['fuel_cost_12000_mi'].apply(lambda x: cost_cleaner(x))

In [1734]:
master_df['electricity_cost_12000_mi'] = master_df['electricity_cost_12000_mi'].apply(lambda x: cost_cleaner(x))

In [1735]:
master_df['total_cost_12000_mi'] = master_df['total_cost_12000_mi'].apply(lambda x: cost_cleaner(x))

In [1736]:
master_df['annual_fuel_cost_10k_mi'] = master_df['annual_fuel_cost_10k_mi'].apply(lambda x: cost_cleaner(x))

In [1737]:
master_df['annual_electricity_cost_10k_mi'] = master_df['annual_electricity_cost_10k_mi'].apply(lambda x: cost_cleaner(x))

In [1738]:
master_df['total_cost_10k_mi'] = master_df['total_cost_10k_mi'].apply(lambda x: cost_cleaner(x))

In [1739]:
def noise_level_clean(string):
    if isinstance(string, str):
        string = string.replace('III', '')
        string = string.replace(' ', '')
    else:
        pass
    return string

In [1740]:
master_df['noise_level_db(a)'] = master_df['noise_level_db(a)'].apply(lambda x: noise_level_clean(x))

In [1741]:
def despace(string):
    if isinstance(string, str):
        string = string.replace(' ', '')
        string = string.replace('*', '')
        string = string.replace(',', '.')
    else:
        pass
    return string

In [1742]:
master_df['emissions_thc_mgkm'] = master_df['emissions_thc_mgkm'].apply(lambda x: despace(x))

In [1743]:
master_df['emissions_thc_gkm'] = master_df['emissions_thc_gkm'].apply(lambda x: despace(x))

In [1744]:
master_df['emissions_nox_mgkm'] = master_df['emissions_nox_mgkm'].apply(lambda x: despace(x))

In [1745]:
master_df['emissions_nox_gkm'] = master_df['emissions_nox_gkm'].apply(lambda x: despace(x))

In [1746]:
master_df['emissions_thc+nox_mgkm'] = master_df['emissions_thc+nox_mgkm'].apply(lambda x: despace(x))
master_df['emissions_thc+nox_gkm'] = master_df['emissions_thc+nox_mgkm'].apply(lambda x: despace(x))

In [1747]:
master_df['emissions_pm_mgkm'] = master_df['emissions_pm_mgkm'].apply(lambda x: despace(x))

In [1632]:
# for i in range(len(master_df['emissions_thc+nox_gkm'].unique())):
#     print('index', i)
# #     print(master_df['emissions_thc+nox_gkm'].unique()[i])

# master_df['emissions_thc+nox_gkm'].unique()[198]

0.216

In [1622]:
columns_to_num[38]

'emissions_thc+nox_mgkm'

In [1611]:
for column in tqdm(columns_to_num):
    master_df[column] = pd.to_numeric(master_df[column])

 86%|██████████████████████████████████████████████████████████████████████▊           | 38/44 [00:00<00:00, 81.33it/s]


ValueError: Unable to parse string "*" at position 57539

Now let's clean up the year column.

In [None]:
master_df['year'].unique()

In [None]:
year_dict = {
    '2020' : '2020-12-01', 
    '2019' : '2019-12-01', 
    'sept2018' : '2018-09-01', 
    'aug2017' : '2017-08-01', 
    'aug2016' : '2016-08-01', 
    'aug2015' : '2015-08-01',
    'aug2014' : '2014-08-01', 
    'aug2013' : '2013-08-01', 
    'aug2012' : '2012-08-01', 
    'aug2011' : '2011-08-01', 
    'may2010' : '2010-05-01', 
    'may2009' : '2009-05-01',
    'may2008' : '2008-05-01', 
    'may2007' : '2007-05-01', 
    'may2006' : '2006-05-01', 
    'may2005' : '2005-05-01', 
    'may2004' : '2004-05-01', 
    'may2003' : '2003-05-01',
    'may2002' : '2002-05-01', 
    'july2001' : '2001-07-01', 
    'january2001' : '2001-01-01',
    'july2000' : '2000-07-01'
}

In [None]:
def year_replacer(string):
    string = year_dict[string]
    return string

In [None]:
master_df['year'] = master_df['year'].apply(lambda x: year_replacer(x))

In [None]:
master_df['year'].unique()

In [None]:
master_df['year'] = pd.to_datetime(master_df['year']) #.dt.to_period('M')

In [None]:
master_df['year'].dtypes

In [None]:
master_df[(master_df['make'] == 'VAUXHALL')]['fuel_type'].unique()

In [None]:
master_df.columns

In [None]:
master_df['total_cost_12000_mi'].unique()

In [None]:
sns.pairplot(master_df[['engine_capacity', 'nedc_co2_gkm', 'fuel_cost_12000_mi']])

In [None]:
master_df.columns

In [None]:
plt.figure(figsize=(25, 10))
co2 = sns.lineplot(data=master_df, x='year', y='nedc_co2_gkm')
plt.title('CO2 efficiency of new car models since 2000')
plt.savefig('co2 efficiency.png')
# co2 = sns.lineplot(data=master_df, x='year', y='wltp_co2_gkm')

In [None]:
g = sns.FacetGrid(master_df, col='make', col_wrap=6, height=2)
g.map(sns.lineplot, 'year', 'nedc_co2_gkm')
plt.savefig('make_co2.png')

In [None]:
master_df.columns

In [None]:
master_df['noise_level_db(a)'].describe()

In [None]:
sns.lineplot(data=master_df, x='year', y='engine_capacity')

In [None]:
master_df[['emissions_co_mgkm',
       'emissions_thc_mgkm', 'emissions_thc_gkm', 'emissions_nox_mgkm',
       'emissions_nox_gkm', 'emissions_thc+nox_mgkm', 'emissions_thc+nox_gkm',
       'emissions_pm_mgkm', 'emissions_pm_gkm', 'year']].iloc[44245:44255]

In [None]:
def g_to_mg(value):
    if not np.isnan(value):
        value = value * 1000
    else:
        pass
    return value

In [None]:
master_df['emissions_co_mgkm'].iloc[44252:]

In [None]:
vca_urls[31]

In [None]:
x = 0.06
y = np.nan

In [None]:
g_to_mg(x)

In [None]:
g_to_mg(y)

### Save the data to bigquery

In [None]:
# Authenticate with google
project_id = 'rugged-baton-283921'
credentials = service_account.Credentials.from_service_account_file('rugged-baton-283921-5706f65c85fe.json')

# Instantiating the bigquery client
bqclient = bigquery.Client(credentials=credentials, project=project_id,)
bqstorageclient = bigquery_storage.BigQueryReadClient(credentials=credentials)