*Import Modules required for parsing webpage*

In [1]:
from bs4 import BeautifulSoup
import requests

*Convert webpage into a Python object to extract the elements that we want*

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'

# Fetch the page. `requests.get` returns a `Response` object. requests applies
# no timeout by default, so set one to keep this cell from hanging forever if
# the server never answers.
page = requests.get(url, timeout=30)

# Stop here with a clear HTTP error on a 4xx/5xx answer instead of silently
# parsing an error page in the cells that follow.
page.raise_for_status()



*Verify that the 'get' method returned a 'Response' object (already stored in the variable 'page'). A status code of 200 means the request succeeded.*

In [3]:
# Show the object returned by requests.get; the recorded run printed <Response [200]>.
print(page)

<Response [200]>


*Pass the HTML text of 'page' ('page.text') to Beautiful Soup, using an HTML parser, to get a Python object from which we can extract the elements that we want. Then store that object in a variable called 'soup'.*

In [4]:
# Name the parser explicitly. 'html' only names a markup type, so Beautiful
# Soup picks whichever HTML parser happens to be installed (lxml, html5lib, or
# the built-in one) and the resulting tree can differ between machines.
# 'html.parser' ships with Python, so the notebook parses the same everywhere.
soup = BeautifulSoup(page.text, 'html.parser')

*Find the second table in the list of elements that begin with the 'table' tag. Then store it in an object called 'table.'*

In [6]:
# Collect every <table> element in the parsed page, then keep the one at
# index 1, i.e. the second table.
all_tables = soup.find_all('table')
table = all_tables[1]

*Within the 'table' object, find all of the 'th' (table header) tags and return them as a list.*

In [8]:
# Gather the <th> tags found inside the selected table.
table_headers = table.find_all(name='th')

*Verify that we have a list of table column headings.*

In [9]:
# Show the <th> tags as extracted, still including markup and newlines.
print(table_headers)

[<th>Rank
</th>, <th>Name
</th>, <th>Industry
</th>, <th>Revenue <br/>(USD millions)
</th>, <th>Revenue growth
</th>, <th>Employees
</th>, <th>Headquarters
</th>]


*Loop through the list of headers, extracting just the text of each 'th' tag and stripping any unnecessary line breaks and surrounding spaces.*

In [10]:
# Read the <th> tags from `table` rather than from `table_headers` itself.
# Rebuilding the list from its own previous value turns the tags into plain
# strings, so running this cell a second time would raise AttributeError
# (a str has no `.text`). Taking the tags from `table` keeps the cell
# re-runnable and gives the same result on the first run.
# `.text.strip()` keeps only the label text, without surrounding whitespace.
table_headers = [header.text.strip() for header in table.find_all('th')]

*Verify that we have a clean list of the table's column headers.*

In [11]:
# Show the cleaned header labels; they are now plain strings.
print(table_headers)

['Rank', 'Name', 'Industry', 'Revenue (USD millions)', 'Revenue growth', 'Employees', 'Headquarters']


In [12]:
import pandas as pd

*Insert 'table_headers' into a Pandas dataframe object*

In [13]:
# Start from an empty DataFrame that carries only the column labels;
# the data rows are added in a later cell.
df = pd.DataFrame(columns=table_headers)

*Verify that 'table_headers' inserted correctly as column headers in dataframe.*

In [14]:
# Display the frame; at this point it has column labels but no rows.
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters


*Extract the table rows, each enclosed in 'tr' (table row) tags. Then store them in a variable called 'column_data'.*

In [15]:
# Despite the variable name, each item here is a <tr> tag, i.e. one table row.
column_data = table.find_all(name='tr')

*Loop through the list of rows, starting with the second one (the first row holds the headers), and extract the cell values of each row.*

In [17]:
# Collect the rows in a plain list and build the DataFrame once at the end.
# Growing the frame with `df.loc[len(df)] = ...` inside the loop is slow, and
# it made this cell non-idempotent: running it twice appended every row twice.
rows = []
for row in column_data[1:]:  # start at the second <tr>; the first is presumably the header row
    # Column values for each row are enclosed within 'td' tags.
    # Extract the text of each tag, removing unnecessary line breaks and spaces.
    individual_row_data = [data.text.strip() for data in row.find_all('td')]

    # A <tr> without any <td> (for example an extra header row) carries no data.
    if not individual_row_data:
        continue

    # Keep failing loudly on a malformed row, as assigning a wrong-sized row
    # to the frame did, instead of letting pandas pad it with missing values.
    if len(individual_row_data) != len(table_headers):
        raise ValueError(
            f"Expected {len(table_headers)} cells per row, "
            f"got {len(individual_row_data)}: {individual_row_data!r}"
        )

    rows.append(individual_row_data)

# One constructor call: same columns and string values, default 0..N-1 index.
df = pd.DataFrame(rows, columns=table_headers)

*Verify that we have the table in a Pandas DataFrame object.*

In [18]:
# Display the populated frame; pandas shortens long frames to the first and last rows.
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,611289,6.7%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,513983,9.4%,1540000,"Seattle, Washington"
2,3,ExxonMobil,Petroleum industry,413680,44.8%,62000,"Spring, Texas"
3,4,Apple,Electronics industry,394328,7.8%,164000,"Cupertino, California"
4,5,UnitedHealth Group,Healthcare,324162,12.7%,400000,"Minnetonka, Minnesota"
...,...,...,...,...,...,...,...
95,96,Best Buy,Retail,46298,10.6%,71100,"Richfield, Minnesota"
96,97,Bristol-Myers Squibb,Pharmaceutical industry,46159,0.5%,34300,"New York City, New York"
97,98,United Airlines,Airline,44955,82.5%,92795,"Chicago, Illinois"
98,99,Thermo Fisher Scientific,Laboratory instruments,44915,14.5%,130000,"Waltham, Massachusetts"


*Export table to a spreadsheet format for collaboration and updating.*

In [19]:
# Write the file relative to the working directory rather than into one
# specific user's home folder: an absolute personal path exists only on the
# author's machine, so the export failed everywhere else.
# `index=False` leaves the 0..N-1 row index out of the CSV.
df.to_csv('Companies.csv', index=False)