In [1]:
#Import dependencies
import pandas as pd
import pymongo
import json

## (E) Extract - Scrape Raw Data from sources

In [2]:
# Define url to scrape
tesla_wiki_url = 'https://en.wikipedia.org/wiki/History_of_Tesla,_Inc.#Timeline_of_production_and_sales'


In [3]:
# Use Panda's `read_html` to parse the url
tables = pd.read_html(tesla_wiki_url)
tables

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
 9   Deepak Ahuja (two-time CFO) Ze'ev Drori (secon...                                                                                                                                                                                                                                                                                                                                                                                 

In [4]:
#Select table regarding Total Tesla production and sales since 2012
tesla_prod_df = tables[4]
tesla_prod_df

Unnamed: 0,Quarter,Cumulativeproduction,Totalproduction,Model Ssales,Model Xsales,Model 3sales,Model Ysales[a],Totalsales[b],In transit[c],Source
0,Q3 2012,,350,250+,,,,250+,,[187]
1,Q4 2012,,"2,750+",2400,,,,2400,,[188]
2,Q1 2013,,"5,000+",4900,,,,4900,,[189]
3,Q2 2013,,,5150,,,,5150,,[190]
4,Q3 2013,,,"5,500+",,,,"5,500+",,[191]
5,Q4 2013,"~34,851",6587,6892,,,,6892,,[192]
6,Q1 2014,"~41,438",7535,6457,,,,6457,,[193]
7,Q2 2014,"~48,973",8763,7579,,,,7579,,[194]
8,Q3 2014,"~57,736","~7,075",7785,,,,7785,,[195]
9,Q4 2014,64811,11627,9834,,,,9834,,[196]


## (T) Transform - Clean Data

In [5]:
# Get relevant sales columns columns and create new df
tesla_sales_df = tesla_prod_df[['Quarter','Totalsales[b]']]
tesla_sales_df

Unnamed: 0,Quarter,Totalsales[b]
0,Q3 2012,250+
1,Q4 2012,2400
2,Q1 2013,4900
3,Q2 2013,5150
4,Q3 2013,"5,500+"
5,Q4 2013,6892
6,Q1 2014,6457
7,Q2 2014,7579
8,Q3 2014,7785
9,Q4 2014,9834


In [6]:
#Rename columns
tesla_sales_df = tesla_sales_df.rename(columns = {'Quarter':'Quarter_Yr','Totalsales[b]':'Total_Sales'})
# tesla_sales_df.columns = ['Quarter_Yr', 'Total_Sales']
tesla_sales_df

Unnamed: 0,Quarter_Yr,Total_Sales
0,Q3 2012,250+
1,Q4 2012,2400
2,Q1 2013,4900
3,Q2 2013,5150
4,Q3 2013,"5,500+"
5,Q4 2013,6892
6,Q1 2014,6457
7,Q2 2014,7579
8,Q3 2014,7785
9,Q4 2014,9834


In [7]:
#Check data types
tesla_sales_df['Total_Sales'].dtype

dtype('O')

In [8]:
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Define the 'electric_vehicles' database in Mongo
db = client.electric_vehicles

In [9]:
tesla_sales_df['Total_Sales'] = tesla_sales_df['Total_Sales'].replace(
    {'5,500+': '5500', '250+': '250'})
tesla_sales_df['Quarter_Yr'] = tesla_sales_df['Quarter_Yr'].replace(
    {'Q4 2016[d]': 'Q4 2016'})
tesla_sales_clean_df = tesla_sales_df
tesla_sales_clean_df.set_index('Quarter_Yr')


Unnamed: 0_level_0,Total_Sales
Quarter_Yr,Unnamed: 1_level_1
Q3 2012,250
Q4 2012,2400
Q1 2013,4900
Q2 2013,5150
Q3 2013,5500
Q4 2013,6892
Q1 2014,6457
Q2 2014,7579
Q3 2014,7785
Q4 2014,9834


In [10]:
# Import US EV Sales from Spreadsheet
# read file
excel_file = "10567_pev_sales_2-28-20.xlsx"
us_ev_sales_df = pd.read_excel(excel_file, 'PEV Sales Final 2019', header=2,index_col=1)
us_ev_sales_df

Unnamed: 0_level_0,Unnamed: 0,Type,2011,2012,2013,2014,2015,2016,2017,2018,2019,Total
Vehicle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Chevy Volt,,PHEV,7671.0,23461.0,23094.0,18805.0,15393.0,24739.0,20349.0,18306.0,4915.0,156733.0
Nissan Leaf,,EV,9674.0,9819.0,22610.0,30200.0,17269.0,14006.0,11230.0,14715.0,12365.0,141888.0
Smart ED,,EV,342.0,139.0,923.0,2594.0,1387.0,657.0,544.0,1219.0,680.0,8485.0
Mitsubishi I EV,,EV,76.0,588.0,1029.0,196.0,115.0,94.0,6.0,0.0,0.0,2104.0
BMW Active E,,EV,0.0,673.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,673.0
...,...,...,...,...,...,...,...,...,...,...,...,...
Last updated: January 2020,,,,,,,,,,,,
Acronyms:,,,,,,,,,,,,
EV: All-electric vehicle,,,,,,,,,,,,
PEV: Plug-in electric vehicle. These include both all-electric and plug-in hybrid electric vehicles.,,,,,,,,,,,,


In [16]:
#Remove first column
us_ev_sales_df = us_ev_sales_df.drop(us_ev_sales_df.columns[0], axis=1)

In [17]:
# Remove redundant rows
us_ev_sales_df.drop(us_ev_sales_df.index[56:], axis=0)

Unnamed: 0_level_0,Type,2011,2012,2013,2014,2015,2016,2017,2018,2019,Total
Vehicle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Chevy Volt,PHEV,7671.0,23461.0,23094.0,18805.0,15393.0,24739.0,20349.0,18306.0,4915.0,156733.0
Nissan Leaf,EV,9674.0,9819.0,22610.0,30200.0,17269.0,14006.0,11230.0,14715.0,12365.0,141888.0
Smart ED,EV,342.0,139.0,923.0,2594.0,1387.0,657.0,544.0,1219.0,680.0,8485.0
Mitsubishi I EV,EV,76.0,588.0,1029.0,196.0,115.0,94.0,6.0,0.0,0.0,2104.0
BMW Active E,EV,0.0,673.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,673.0
Prius PHEV,PHEV,0.0,12749.0,12088.0,13264.0,4191.0,2474.0,20936.0,27595.0,23630.0,116927.0
Ford Focus EV,EV,0.0,683.0,1738.0,1964.0,1582.0,901.0,1817.0,560.0,0.0,9245.0
Honda Fit EV,EV,0.0,93.0,569.0,407.0,2.0,0.0,0.0,0.0,0.0,1071.0
Tesla Model S,EV,0.0,2400.0,19400.0,16750.0,26200.0,30200.0,26500.0,25745.0,15090.0,162285.0
Toyota RAV4 EV,EV,0.0,192.0,1005.0,1184.0,18.0,0.0,0.0,0.0,0.0,2399.0


## (L) Loading - Upload Clean Data into Mongo Database

In [13]:
# Verify that the Amount column datatype has been made numeric
# df['Amount'].dtype

In [14]:
tesla_sales_dict = tesla_sales_clean_df.to_dict('records')
tesla_sales_dict

[{'Quarter_Yr': 'Q3 2012', 'Total_Sales': '250'},
 {'Quarter_Yr': 'Q4 2012', 'Total_Sales': '2400'},
 {'Quarter_Yr': 'Q1 2013', 'Total_Sales': '4900'},
 {'Quarter_Yr': 'Q2 2013', 'Total_Sales': '5150'},
 {'Quarter_Yr': 'Q3 2013', 'Total_Sales': '5500'},
 {'Quarter_Yr': 'Q4 2013', 'Total_Sales': '6892'},
 {'Quarter_Yr': 'Q1 2014', 'Total_Sales': '6457'},
 {'Quarter_Yr': 'Q2 2014', 'Total_Sales': '7579'},
 {'Quarter_Yr': 'Q3 2014', 'Total_Sales': '7785'},
 {'Quarter_Yr': 'Q4 2014', 'Total_Sales': '9834'},
 {'Quarter_Yr': 'Q1 2015', 'Total_Sales': '10045'},
 {'Quarter_Yr': 'Q2 2015', 'Total_Sales': '11532'},
 {'Quarter_Yr': 'Q3 2015', 'Total_Sales': '11603'},
 {'Quarter_Yr': 'Q4 2015', 'Total_Sales': '17478'},
 {'Quarter_Yr': 'Q1 2016', 'Total_Sales': '14820'},
 {'Quarter_Yr': 'Q2 2016', 'Total_Sales': '14402'},
 {'Quarter_Yr': 'Q3 2016', 'Total_Sales': '24821'},
 {'Quarter_Yr': 'Q4 2016', 'Total_Sales': '22254'},
 {'Quarter_Yr': 'Q1 2017', 'Total_Sales': '25051'},
 {'Quarter_Yr': 'Q2 201

In [15]:

#Clear collection of existing documents
db.electric_vehicles.delete_many({})
# Insert new documents in empty collection
db.electric_vehicles.insert_many(tesla_sales_dict)

<pymongo.results.InsertManyResult at 0x1edc7d16580>

In [23]:
#Reset index
us_ev_sales_df.reset_index()
#Todo need to change year column headers to string and not number for pymongo upload

Unnamed: 0,index,Vehicle,Type,2011,2012,2013,2014,2015,2016,2017,2018,2019,Total
0,0,Chevy Volt,PHEV,7671.0,23461.0,23094.0,18805.0,15393.0,24739.0,20349.0,18306.0,4915.0,156733.0
1,1,Nissan Leaf,EV,9674.0,9819.0,22610.0,30200.0,17269.0,14006.0,11230.0,14715.0,12365.0,141888.0
2,2,Smart ED,EV,342.0,139.0,923.0,2594.0,1387.0,657.0,544.0,1219.0,680.0,8485.0
3,3,Mitsubishi I EV,EV,76.0,588.0,1029.0,196.0,115.0,94.0,6.0,0.0,0.0,2104.0
4,4,BMW Active E,EV,0.0,673.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,673.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,61,Last updated: January 2020,,,,,,,,,,,
62,62,Acronyms:,,,,,,,,,,,
63,63,EV: All-electric vehicle,,,,,,,,,,,
64,64,PEV: Plug-in electric vehicle. These include b...,,,,,,,,,,,


In [30]:
us_ev_sales_df
us_ev_sales_dict = us_ev_sales_df.to_dict('records')
us_ev_sales_dict

[{'index': 0,
  'Vehicle': 'Chevy Volt',
  'Type': 'PHEV',
  2011: 7671.0,
  2012: 23461.0,
  2013: 23094.0,
  2014: 18805.0,
  2015: 15393.0,
  2016: 24739.0,
  2017: 20349.0,
  2018: 18306.0,
  2019: 4915.0,
  'Total': 156733.0},
 {'index': 1,
  'Vehicle': 'Nissan Leaf',
  'Type': 'EV',
  2011: 9674.0,
  2012: 9819.0,
  2013: 22610.0,
  2014: 30200.0,
  2015: 17269.0,
  2016: 14006.0,
  2017: 11230.0,
  2018: 14715.0,
  2019: 12365.0,
  'Total': 141888.0},
 {'index': 2,
  'Vehicle': 'Smart ED',
  'Type': 'EV',
  2011: 342.0,
  2012: 139.0,
  2013: 923.0,
  2014: 2594.0,
  2015: 1387.0,
  2016: 657.0,
  2017: 544.0,
  2018: 1219.0,
  2019: 680.0,
  'Total': 8485.0},
 {'index': 3,
  'Vehicle': 'Mitsubishi I EV',
  'Type': 'EV',
  2011: 76.0,
  2012: 588.0,
  2013: 1029.0,
  2014: 196.0,
  2015: 115.0,
  2016: 94.0,
  2017: 6.0,
  2018: 0.0,
  2019: 0.0,
  'Total': 2104.0},
 {'index': 4,
  'Vehicle': 'BMW Active E',
  'Type': 'EV',
  2011: 0.0,
  2012: 673.0,
  2013: 0.0,
  2014: 0.0,
 

In [31]:
# db.electric_vehicles.delete_many({})
# Insert new documents in empty collection
db.electric_vehicles.insert_many(us_ev_sales_dict)

InvalidDocument: documents must have only string keys, key was 2011