In [15]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Scrape the first "wikitable" on Wikipedia's "List of largest companies by
# revenue" page into the `companies` DataFrame, print a preview, and save the
# result to a CSV file.

# Number of data cells (<td>) kept from each table row; the same count of
# header cells is taken after the first header.
N_DATA_COLUMNS = 6

# Fetch the page
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of silently parsing an
# error page as if it were the article.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

# Extract the table of interest (find() returns only the first match)
table = soup.find('table', {'class': 'wikitable'})
if table is None:
    # Fail with a clear message rather than an AttributeError on the next line.
    raise ValueError(f"No table with class 'wikitable' found at {url}")

# Extract table headers: the first <th> is replaced by our own 'Rank' label,
# the following N_DATA_COLUMNS are kept, and the "[note 1]" footnote marker
# and embedded newlines are stripped from the header text.
headers = table.find_all('th')
header_list = ['Rank'] + [
    header.text.strip().replace("[note 1]", "").replace("\n", "")
    for header in headers[1:1 + N_DATA_COLUMNS]
]

# Extract the rows and manually add the "Rank" column.
# Rows are collected in a plain list and turned into a DataFrame once at the
# end; appending to a DataFrame row by row re-allocates on every insert.
rows = table.find_all('tr')[1:]  # Skip the header row
records = []
rank = 1
for row in rows:
    columns = row.find_all('td')
    if len(columns) >= N_DATA_COLUMNS:  # Ensure there are enough data cells
        row_data = [col.text.strip() for col in columns[:N_DATA_COLUMNS]]
        row_data.insert(0, rank)  # Rank is a running counter over kept rows
        records.append(row_data)
        rank += 1
    else:
        # Rows without enough <td> cells are reported and skipped; they do
        # not consume a rank.
        print(f"Skipping row with mismatched columns: {len(columns)} columns found, expected at least {N_DATA_COLUMNS}")

# Build the DataFrame in one step from the collected rows
companies = pd.DataFrame(records, columns=header_list)

# Display the first few rows of the DataFrame
print(companies.head())

# Optional: Save the DataFrame to a CSV file
companies.to_csv('largest_companies_by_revenue.csv', index=False)


Skipping row with mismatched columns: 0 columns found, expected at least 6
   Rank                             Name     Industry   Revenue    Profit  \
0     1                          Walmart       Retail  $648,125   $15,511   
1     2                           Amazon       Retail  $574,785   $30,425   
2     3  State Grid Corporation of China  Electricity  $545,948    $9,204   
3     4                     Saudi Aramco  Oil and gas  $494,890  $129,699   
4     5  China Petrochemical Corporation  Oil and gas  $429,700    $9,393   

   Employees   Headquarters  
0  2,100,000  United States  
1  1,525,000  United States  
2  1,361,423          China  
3     73,311   Saudi Arabia  
4    513,434          China  


In [16]:
# Show the first rows of the scraped table as the cell's rich output.
companies.head()

Unnamed: 0,Rank,Name,Industry,Revenue,Profit,Employees,Headquarters
0,1,Walmart,Retail,"$648,125","$15,511",2100000,United States
1,2,Amazon,Retail,"$574,785","$30,425",1525000,United States
2,3,State Grid Corporation of China,Electricity,"$545,948","$9,204",1361423,China
3,4,Saudi Aramco,Oil and gas,"$494,890","$129,699",73311,Saudi Arabia
4,5,China Petrochemical Corporation,Oil and gas,"$429,700","$9,393",513434,China


In [3]:
!pip install ace_tools

Defaulting to user installation because normal site-packages is not writeable
