# Data Scraping Using Python (BeautifulSoup and Requests)

In [1]:
# Import necessary libraries for web scraping and data handling
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# Fetch the page and parse it
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'
# A timeout keeps the cell from hanging indefinitely if the server never answers
page = requests.get(url, timeout=30)
# Raise requests.HTTPError on a 4xx/5xx reply instead of silently parsing an error page
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

In [3]:
# Grab the first <table> element in document order
t1 = soup.find(name='table')

In [4]:
# Look up the first <table> whose class attribute is 'wikitable sortable'.
# The filter is passed through the attrs dictionary here; the class_ keyword
# is the shorthand for the same lookup ('class' itself is reserved in Python).
t2 = soup.find('table', attrs={'class': 'wikitable sortable'})

In [5]:
# Collect every <table> tag in the parsed HTML, in document order
all_tables = soup.find_all('table')
# Keep the second one (index 1, because indexing starts at 0)
table = all_tables[1]

In [6]:
# Dump the selected table's HTML to check that the intended <table> was picked
print(table)

<table class="wikitable sortable">
<tbody><tr>
<th>Rank
</th>
<th>Name
</th>
<th>Industry
</th>
<th>Revenue <br/>(USD billions)
</th>
<th>Employees
</th>
<th>Headquarters
</th></tr>
<tr>
<td>1
</td>
<td><a href="/wiki/Cargill" title="Cargill">Cargill</a>
</td>
<td>Food industry
</td>
<td style="text-align:center;">177
</td>
<td style="text-align:center;">160,000
</td>
<td><a href="/wiki/Minnetonka,_Minnesota" title="Minnetonka, Minnesota">Minnetonka, Minnesota</a>
</td></tr>
<tr>
<td>2
</td>
<td><a class="mw-redirect" href="/wiki/Koch_Industries" title="Koch Industries">Koch Industries</a>
</td>
<td>Conglomerate
</td>
<td style="text-align:center;">125
</td>
<td style="text-align:center;">120,000
</td>
<td><a href="/wiki/Wichita,_Kansas" title="Wichita, Kansas">Wichita, Kansas</a>
</td></tr>
<tr>
<td>3
</td>
<td><a class="mw-redirect" href="/wiki/Publix_Super_Markets" title="Publix Super Markets">Publix Super Markets</a>
</td>
<td>Retail
</td>
<td style="text-align:center;">54.5
</td>


In [7]:
# Collect every header cell (<th>) found inside the selected table
world_titles = table.find_all(name='th')

In [8]:
# Inspect the raw <th> tags before their text is cleaned
world_titles

[<th>Rank
 </th>,
 <th>Name
 </th>,
 <th>Industry
 </th>,
 <th>Revenue <br/>(USD billions)
 </th>,
 <th>Employees
 </th>,
 <th>Headquarters
 </th>]

In [9]:
# Reduce each <th> tag to its text, trimmed of surrounding whitespace/newlines
world_table_titles = [th.get_text().strip() for th in world_titles]

print(world_table_titles)

['Rank', 'Name', 'Industry', 'Revenue (USD billions)', 'Employees', 'Headquarters']


In [10]:
# Start from a frame that has the scraped headers as columns and no rows
df = pd.DataFrame(data=None, columns=world_table_titles)

In [11]:
# Show the DataFrame: only the column headers at this point, no rows yet
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD billions),Employees,Headquarters


In [12]:
# Collect every table row (<tr>); the first one holds the headers
column_data = table.find_all(name='tr')

In [13]:
# Build the DataFrame from the table's body rows in a single pass.
# The rows are gathered in a plain list and handed to pandas once: growing the
# frame with df.loc[len(df)] re-allocates on every insert, and it also made
# this cell append duplicate rows each time it was re-run.
table_rows = []
for row in column_data[1:]:  # column_data[0] is the header row
    # Extract and clean the text from each data cell (<td>) in the current row
    individual_row_data = [data.text.strip() for data in row.find_all('td')]

    # A row without any <td> (e.g. a repeated header or spacer row) has no data
    if not individual_row_data:
        continue

    # Fail loudly instead of letting a short row be padded with missing values
    if len(individual_row_data) != len(world_table_titles):
        raise ValueError(
            f"Expected {len(world_table_titles)} cells per row, "
            f"got {len(individual_row_data)}: {individual_row_data}"
        )

    table_rows.append(individual_row_data)

# Recreate df from scratch so the cell yields the same frame on every run
df = pd.DataFrame(table_rows, columns=world_table_titles)


In [14]:
# Show the DataFrame now that the table rows have been loaded
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD billions),Employees,Headquarters
0,1,Cargill,Food industry,177.0,160000,"Minnetonka, Minnesota"
1,2,Koch Industries,Conglomerate,125.0,120000,"Wichita, Kansas"
2,3,Publix Super Markets,Retail,54.5,250000,"Lakeland, Florida"
3,4,"Mars, Incorporated",Food industry,47.0,140000,"McLean, Virginia"
4,5,H-E-B,Retail,43.6,145000,"San Antonio, Texas"
5,6,Reyes Holdings,Wholesaling,40.0,36000,"Rosemont, Illinois"
6,7,Enterprise Holdings,Car rental,35.0,90000,"Clayton, Missouri"
7,8,C&S Wholesale Grocers,Wholesaling,34.7,15000,"Keene, New Hampshire"
8,9,Love's,Petroleum industry and Retail,26.5,40000,"Oklahoma City, Oklahoma"
9,10,Southern Glazer's Wine and Spirits,Food industry,26.0,24000,"Miramar, Florida"


In [15]:
# Save the DataFrame to a CSV file without including the index column.
# The path is relative to the notebook's working directory so the cell runs on
# any machine; the previous absolute path only existed on one user's PC.
df.to_csv('Companies_US.csv', index=False)
