In [5]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [56]:
# Download the Tesla, Inc. Wikipedia article and parse it.

start_url = 'https://en.wikipedia.org/wiki/Tesla,_Inc.'

# Download the HTML from the start URL. The timeout keeps the cell from
# hanging indefinitely on a stalled connection.
downloaded_html = requests.get(start_url, timeout=30)

# Fail right here on an HTTP error (4xx/5xx) so that an error page is never
# parsed or saved as if it were the article.
downloaded_html.raise_for_status()

# Parse the HTML with BeautifulSoup and create the soup object.
soup = BeautifulSoup(downloaded_html.text, 'html5lib')

# Save a prettified local copy of the parsed page.
with open('downloaded_html', 'w', encoding='utf-8') as file:
    file.write(soup.prettify())

In [58]:
# Select the body of the first wikitable on the page.

# select_one returns the first element matching the selector, or None when
# nothing matches. Check for that explicitly so a changed page layout gives
# a clear message instead of a bare "list index out of range".
full_table = soup.select_one('table.wikitable tbody')
if full_table is None:
    raise IndexError("no element matching 'table.wikitable tbody' was found in the page")
print(full_table)


<tbody><tr style="text-align:center;">
<th>Quarter</th>
<th>Cumulative<br/>production</th>
<th>Total<br/>production</th>
<th>Model S<br/>sales
</th>
<th>Model X<br/>sales
</th>
<th>Model 3 + Model Y<br/>sales<sup class="reference" id="cite_ref-96"><a href="#cite_note-96">[b]</a></sup></th>
<th>Total<br/>sales<sup class="reference" id="cite_ref-97"><a href="#cite_note-97">[c]</a></sup></th>
<th>In transit<sup class="reference" id="cite_ref-98"><a href="#cite_note-98">[d]</a></sup></th>
<th>Source
</th></tr>
<tr style="text-align:center;">
<td>Q1 2013</td>
<td>?</td>
<td>5,000+</td>
<td>4,900</td>
<td style="background:#f1f5fa;">
</td>
<td style="background:#f1f5fa;"></td>
<td>4,900</td>
<td></td>
<td><sup class="reference" id="cite_ref-99"><a href="#cite_note-99">[95]</a></sup>
</td></tr>
<tr style="text-align:center;">
<td>Q2 2013</td>
<td>?</td>
<td>?</td>
<td>5,150</td>
<td style="background:#f1f5fa;">
</td>
<td style="background:#f1f5fa;"></td>
<td>5,150</td>
<td></td>
<td><sup clas

In [None]:
# First iteration: grab the heading cells as raw tags.

# Goal: a list holding every column heading of the table.
# At this stage the list still contains the complete <th> elements.

header_selector = 'tr th'
table_head = full_table.select(header_selector)
print(table_head)

In [None]:
# Second iteration: show only the text of each heading.

table_head = full_table.select('tr th')

# Print a divider, then the text content of every <th> tag (markup dropped).
print("-------------")
for header_cell in table_head:
    print(header_cell.get_text())

In [None]:
# Third iteration: collect the heading texts in a list.

# Text fragments inside a cell are stripped and joined with a single space.
table_columns = [
    header_cell.get_text(separator=" ", strip=True)
    for header_cell in table_head
]
print('-----------------')
print(table_columns)

In [None]:
# Fourth iteration: turn each heading into a single-token label.

# Same extraction as the third iteration, with every space swapped for an
# underscore. (Try it without the replace step to compare the output.)
table_columns = [
    header_cell.get_text(separator=" ", strip=True).replace(' ', '_')
    for header_cell in table_head
]
print('-----------------')
print(table_columns)

In [66]:
# Final Iteration

# Extract the table column headings.
# End result: a list with all table headings.

table_head = full_table.select('tr th')

# Footnote markers such as "[b]" end up in the labels as "_[b]", so they are
# stripped out below. The pattern is a raw string: in a plain string literal
# "\[" and "\w" are invalid escape sequences, which Python warns about.
regex = re.compile(r'_\[\w\]')

table_columns = []
for element in table_head:
    column_label = element.get_text(separator=" ", strip=True)
    # Replace spaces with underscores so each label is a single token.
    column_label = column_label.replace(' ', '_')
    # Drop the single-character footnote marker, if any.
    column_label = regex.sub('', column_label)
    table_columns.append(column_label)
print('-----------------')
print(table_columns)

-------------
-----------------
['Quarter', 'Cumulative_production', 'Total_production', 'Model_S_sales', 'Model_X_sales', 'Model_3_+_Model_Y_sales', 'Total_sales', 'In_transit', 'Source']


In [67]:
# Show the cleaned column labels, one per line.
for column_name in table_columns:
    print(column_name)

Quarter
Cumulative_production
Total_production
Model_S_sales
Model_X_sales
Model_3_+_Model_Y_sales
Total_sales
In_transit
Source


In [68]:
# Extract the table rows.
# Goal: a nested list with one inner list per table row.
# This first step only collects the raw <tr> elements.

row_selector = 'tr'
table_rows = full_table.select(row_selector)
print(table_rows)

[<tr style="text-align:center;">
<th>Quarter</th>
<th>Cumulative<br/>production</th>
<th>Total<br/>production</th>
<th>Model S<br/>sales
</th>
<th>Model X<br/>sales
</th>
<th>Model 3 + Model Y<br/>sales<sup class="reference" id="cite_ref-96"><a href="#cite_note-96">[b]</a></sup></th>
<th>Total<br/>sales<sup class="reference" id="cite_ref-97"><a href="#cite_note-97">[c]</a></sup></th>
<th>In transit<sup class="reference" id="cite_ref-98"><a href="#cite_note-98">[d]</a></sup></th>
<th>Source
</th></tr>, <tr style="text-align:center;">
<td>Q1 2013</td>
<td>?</td>
<td>5,000+</td>
<td>4,900</td>
<td style="background:#f1f5fa;">
</td>
<td style="background:#f1f5fa;"></td>
<td>4,900</td>
<td></td>
<td><sup class="reference" id="cite_ref-99"><a href="#cite_note-99">[95]</a></sup>
</td></tr>, <tr style="text-align:center;">
<td>Q2 2013</td>
<td>?</td>
<td>?</td>
<td>5,150</td>
<td style="background:#f1f5fa;">
</td>
<td style="background:#f1f5fa;"></td>
<td>5,150</td>
<td></td>
<td><sup class="r

In [70]:
# Iteration 2 over the row data

table_rows = full_table.select('tr')

# Skip the first <tr> (the heading row) and keep the raw text of every <td>
# in each remaining row.
table_data = [
    [cell.text for cell in row.select('td')]
    for row in table_rows[1:]
]

print(table_data)


[['Q1 2013', '?', '5,000+', '4,900', '\n', '', '4,900', '', '[95]\n'], ['Q2 2013', '?', '?', '5,150', '\n', '', '5,150', '', '[96]\n'], ['Q3 2013', '?', '?', '5,500+', '\n', '', '5,500+', '', '[97]\n'], ['Q4 2013', '~34,851', '6,587', '6,892', '\n', '', '6,892', '', '[98]\n'], ['Q1 2014', '~41,438', '7,535', '6,457', '\n', '', '6,457', '', '[99]\n'], ['Q2 2014', '~48,973', '8,763', '7,579', '\n', '', '7,579', '', '[100]\n'], ['Q3 2014', '~57,736', '~7,075', '7,785', '\n', '', '7,785', '', '[101]\n'], ['Q4 2014', '64,811', '11,627', '9,834', '\n', '', '9,834', '', '[102]\n'], ['Q1 2015', '76,438', '11,160', '10,045', '\n', '', '10,045', '', '[103]\n'], ['Q2 2015', '89,245', '12,807', '11,532', '\n', '', '11,532', '', '[104]\n'], ['Q3 2015', '102,336', '13,091', '11,597', '6\n', '', '11,603', '', '[105]\n'], ['Q4 2015', '116,373', '14,037', '17,272', '206\n', '', '17,478', '', '[106]\n'], ['Q1 2016', '131,883', '15,510', '12,420', '2,400\n', '', '14,820', '2,615', '[107]\n'], ['Q2 2016',

In [71]:
# Final Iteration

table_rows = full_table.select('tr')
table_data = []

# The first <tr> holds the headings, so start from the second one.
for row in table_rows[1:]:
    cells = row.select('td')
    # strip() removes leading/trailing whitespace such as the trailing "\n".
    table_data.append([cell.text.strip() for cell in cells])

print(table_data)


[['Q1 2013', '?', '5,000+', '4,900', '', '', '4,900', '', '[95]'], ['Q2 2013', '?', '?', '5,150', '', '', '5,150', '', '[96]'], ['Q3 2013', '?', '?', '5,500+', '', '', '5,500+', '', '[97]'], ['Q4 2013', '~34,851', '6,587', '6,892', '', '', '6,892', '', '[98]'], ['Q1 2014', '~41,438', '7,535', '6,457', '', '', '6,457', '', '[99]'], ['Q2 2014', '~48,973', '8,763', '7,579', '', '', '7,579', '', '[100]'], ['Q3 2014', '~57,736', '~7,075', '7,785', '', '', '7,785', '', '[101]'], ['Q4 2014', '64,811', '11,627', '9,834', '', '', '9,834', '', '[102]'], ['Q1 2015', '76,438', '11,160', '10,045', '', '', '10,045', '', '[103]'], ['Q2 2015', '89,245', '12,807', '11,532', '', '', '11,532', '', '[104]'], ['Q3 2015', '102,336', '13,091', '11,597', '6', '', '11,603', '', '[105]'], ['Q4 2015', '116,373', '14,037', '17,272', '206', '', '17,478', '', '[106]'], ['Q1 2016', '131,883', '15,510', '12,420', '2,400', '', '14,820', '2,615', '[107]'], ['Q2 2016', '150,228', '18,345', '9,764', '4,638', '', '14,402'

In [72]:
# Build the DataFrame from the scraped rows, labelled with the cleaned headings.
df = pd.DataFrame(data=table_data, columns=table_columns)
df

Unnamed: 0,Quarter,Cumulative_production,Total_production,Model_S_sales,Model_X_sales,Model_3_+_Model_Y_sales,Total_sales,In_transit,Source
0,Q1 2013,?,"5,000+",4900,,,4900,,[95]
1,Q2 2013,?,?,5150,,,5150,,[96]
2,Q3 2013,?,?,"5,500+",,,"5,500+",,[97]
3,Q4 2013,"~34,851",6587,6892,,,6892,,[98]
4,Q1 2014,"~41,438",7535,6457,,,6457,,[99]
5,Q2 2014,"~48,973",8763,7579,,,7579,,[100]
6,Q3 2014,"~57,736","~7,075",7785,,,7785,,[101]
7,Q4 2014,64811,11627,9834,,,9834,,[102]
8,Q1 2015,76438,11160,10045,,,10045,,[103]
9,Q2 2015,89245,12807,11532,,,11532,,[104]
