# WEB SCRAPING THE TOP 100 LARGEST COMPANIES IN THE USA (2023)

In [1]:
# Web Scraping Data of the largest companies in USA by revenue.
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Wikipedia page listing the largest US companies by revenue and parse it into a BeautifulSoup tree.
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'
# A timeout stops the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error status (4xx/5xx) instead of silently parsing an error page.
response.raise_for_status()
# Name the parser explicitly: the generic 'html' feature lets BeautifulSoup pick whichever
# parser happens to be installed, so the parsed tree could differ between environments.
soup = BeautifulSoup(response.text, 'html.parser')

In [3]:
# Select the table that holds the company attributes.
# The CSS selector matches the first <table> carrying both the 'wikitable' and 'sortable' classes,
# whereas class_='wikitable sortable' only matches that exact attribute string and stops working
# as soon as the page adds or reorders a class.
table_html = soup.select_one('table.wikitable.sortable')
if table_html is None:
    # Stop here with a clear message rather than failing later with an AttributeError on None.
    raise ValueError("No 'wikitable sortable' table found; the page layout may have changed.")



In [4]:
# Pull the column titles from the header row only (the first <tr> of the table), so that any
# <th> cells used as row headers further down the table cannot end up in the column list.
header_row = table_html.find('tr')
titles = header_row.find_all('th')
print(titles)

[<th>Rank
</th>, <th>Name
</th>, <th>Industry
</th>, <th>Revenue <br/>(USD millions)
</th>, <th>Revenue growth
</th>, <th>Employees
</th>, <th>Headquarters
</th>]


In [5]:
# Turn each header cell into a clean column name: take the cell's text and strip the
# surrounding whitespace and newlines.
table_titles = [header_cell.text.strip() for header_cell in titles]
print(table_titles)

['Rank', 'Name', 'Industry', 'Revenue (USD millions)', 'Revenue growth', 'Employees', 'Headquarters']


In [6]:
# Start from an empty pandas DataFrame whose columns are the scraped table titles.
df = pd.DataFrame(columns=table_titles)
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters


In [7]:
# Collect every <tr> element of the table; the first one is the header row, the rest hold the data.
# Nothing is displayed here: the row list is long raw HTML and is only consumed by the next cell.
data_information = table_html.find_all('tr')

In [8]:
# Build the DataFrame from the table rows in a single step.
# data_information[0] is the header row (its titles are already in table_titles), so start at index 1.
rows = []
for row in data_information[1:]:
    # Clean every <td> cell of the row: take its text and strip the surrounding whitespace.
    individual_row_data = [cell.text.strip() for cell in row.find_all('td')]
    # Rows without any <td> cell carry no data values; skip them.
    if not individual_row_data:
        continue
    # A row whose cell count differs from the header cannot be mapped onto the columns; fail loudly.
    if len(individual_row_data) != len(table_titles):
        raise ValueError(
            f"Expected {len(table_titles)} cells per row, got {len(individual_row_data)}: {individual_row_data}"
        )
    rows.append(individual_row_data)

# Create the frame once from the collected rows. Appending with df.loc[len(df)] re-allocates the
# frame for every row and adds duplicate rows whenever the cell is run a second time.
df = pd.DataFrame(rows, columns=table_titles)
    


In [9]:
# Use the company rank as the index.
# The guard makes the cell safe to re-run: once 'Rank' is the index it is no longer a column,
# and a second set_index('Rank') would raise a KeyError.
if df.index.name != 'Rank':
    df = df.set_index('Rank')
df

Unnamed: 0_level_0,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Walmart,Retail,611289,6.7%,2100000,"Bentonville, Arkansas"
2,Amazon,Retail and Cloud Computing,513983,9.4%,1540000,"Seattle, Washington"
3,Exxon Mobil,Petroleum industry,413680,44.8%,62000,"Spring, Texas"
4,Apple,Electronics industry,394328,7.8%,164000,"Cupertino, California"
5,UnitedHealth Group,Healthcare,324162,12.7%,400000,"Minnetonka, Minnesota"
...,...,...,...,...,...,...
96,Best Buy,Retail,46298,10.6%,71100,"Richfield, Minnesota"
97,Bristol-Myers Squibb,Pharmaceutical industry,46159,0.5%,34300,"New York City, New York"
98,United Airlines,Airline,44955,82.5%,92795,"Chicago, Illinois"
99,Thermo Fisher Scientific,Laboratory instruments,44915,14.5%,130000,"Waltham, Massachusetts"


In [10]:
# Export the data to a CSV file in the current user's Downloads folder.
# Path.home() replaces the hard-coded 'C:\Users\Gerardo' prefix, so the notebook also runs
# on other machines and accounts.
output_csv = Path.home() / 'Downloads' / 'Top100Companies.csv'
output_csv.parent.mkdir(parents=True, exist_ok=True)  # to_csv does not create missing folders
df.to_csv(output_csv)