In [116]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import matplotlib as plt

The goal here is to scrape the tables from the Wikipedia article "List of largest companies in the United States by revenue" and convert them into pandas DataFrames fit for an analytics workflow.

In [117]:
# Source page: Wikipedia's list of the largest U.S. companies by revenue.
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error status instead of parsing an error page further down.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [118]:
# Parse the downloaded HTML.
# NOTE(review): features='html' presumably lets Beautiful Soup choose among the
# installed HTML parsers, so the parser used may differ between machines - confirm.
soup = BeautifulSoup(markup=response.text, features='html')

# Collect every <table> element on the page; individual tables are picked out by position later.
tables = soup.find_all(name="table")


In [119]:
def process_table(table, index_col="Rank"):
    """Convert one parsed HTML table into a DataFrame of stripped cell strings.

    Parameters
    ----------
    table
        A parsed ``<table>`` element exposing ``find_all`` (e.g. a Beautiful
        Soup tag). Its first ``<tr>`` is treated as the header row.
    index_col
        Name of the header to use as the index. Defaults to ``"Rank"``;
        pass ``None`` to keep the default integer index.

    Returns
    -------
    pandas.DataFrame
        One row per table row that contains at least one ``<td>``. Every
        value is the cell's text with surrounding whitespace removed; no
        numeric conversion is done here.

    Raises
    ------
    KeyError
        If ``index_col`` is not one of the table's headers.
    """
    raw_rows = table.find_all("tr")

    # Take header labels from the first row only. Collecting every <th> in the
    # table would also pick up row-header cells and corrupt the column list.
    header_cells = raw_rows[0].find_all("th") if raw_rows else []
    headers = [cell.text.strip() for cell in header_cells]

    rows = []
    for row in raw_rows[1:]:
        # Rows without any <td> (extra header or separator rows) carry no data.
        if not row.find_all("td"):
            continue
        # Keep <th> cells too: some tables mark the first cell of a data row
        # as a row header, and dropping it would shift every value left.
        rows.append([cell.text.strip() for cell in row.find_all(["th", "td"])])

    frame = pd.DataFrame(data=rows, columns=headers)
    if index_col is None:
        return frame
    return frame.set_index(index_col)

In [120]:
# Build one frame per table, taking the first three tables on the page in order.
# NOTE(review): assumes the page keeps the order public / private / most profitable - confirm if the article changes.
df_public, df_private, df_profit = (process_table(tables[position]) for position in range(3))

In [121]:
# Strip thousands separators / percent signs, then cast to numeric dtypes.
# Going through astype(str) first keeps this cell re-runnable: on a second run the
# columns are already numeric and the .str accessor would otherwise raise.
df_public['Revenue (USD millions)'] = (
    df_public['Revenue (USD millions)'].astype(str).str.replace(",", "", regex=False).astype("float")
)
df_public['Revenue growth'] = (
    df_public['Revenue growth'].astype(str).str.replace("%", "", regex=False).astype("float")
)
df_public['Employees'] = (
    df_public['Employees'].astype(str).str.replace(",", "", regex=False).astype("int")
)

In [122]:
# Confirm the three converted columns now report numeric dtypes.
df_public.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 1 to 100
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Name                    100 non-null    object 
 1   Industry                100 non-null    object 
 2   Revenue (USD millions)  100 non-null    float64
 3   Revenue growth          100 non-null    float64
 4   Employees               100 non-null    int32  
 5   Headquarters            100 non-null    object 
dtypes: float64(2), int32(1), object(3)
memory usage: 5.1+ KB


In [123]:
# Process the numerical columns currently represented as strings.
# Going through astype(str) first keeps this cell re-runnable: on a second run
# 'Employees' is already an integer column and the .str accessor would otherwise raise.
# Commas are stripped from the revenue column as well, so a value of 1,000 or more still parses.
df_private["Revenue (USD billions)"] = (
    df_private["Revenue (USD billions)"].astype(str).str.replace(",", "", regex=False).astype("float")
)
df_private['Employees'] = (
    df_private['Employees'].astype(str).str.replace(",", "", regex=False).astype("int")
)

In [124]:
# Show the private-company table to eyeball the converted values.
df_private

Unnamed: 0_level_0,Name,Industry,Revenue (USD billions),Employees,Headquarters
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Cargill,Food industry,177.0,160000,"Minnetonka, Minnesota"
2,Koch Industries,Conglomerate,125.0,120000,"Wichita, Kansas"
3,Publix Super Markets,Retail,54.5,250000,"Winter Haven, Florida"
4,"Mars, Incorporated",Food industry,47.0,140000,"McLean, Virginia"
5,H-E-B,Retail,43.6,145000,"San Antonio, Texas"
6,Reyes Holdings,Wholesaling,40.0,36000,"Rosemont, Illinois"
7,Enterprise Holdings,Car rental,35.0,90000,"Clayton, Missouri"
8,C&S Wholesale Grocers,Wholesaling,34.7,15000,"Keene, New Hampshire"
9,Love's,Petroleum industry and Retail,26.5,40000,"Oklahoma City, Oklahoma"
10,Southern Glazer's Wine and Spirits,Food industry,26.0,24000,"Miramar, Florida"


In [127]:
# Inspect dtypes before conversion: the profit column is still stored as strings (object).
df_profit.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 1 to 10
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Name                   10 non-null     object
 1   Industry               10 non-null     object
 2   Profits(USD millions)  10 non-null     object
dtypes: object(3)
memory usage: 320.0+ bytes


In [128]:
# Strip thousands separators and cast profits to float.
# Going through astype(str) first keeps this cell re-runnable: on a second run the
# column is already float and the .str accessor would otherwise raise.
df_profit["Profits(USD millions)"] = (
    df_profit["Profits(USD millions)"].astype(str).str.replace(",", "", regex=False).astype("float")
)

In [129]:
# Re-check dtypes: the profit column should now be float64.
df_profit.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 1 to 10
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Name                   10 non-null     object 
 1   Industry               10 non-null     object 
 2   Profits(USD millions)  10 non-null     float64
dtypes: float64(1), object(2)
memory usage: 320.0+ bytes


# Hypotheses 

1. Public companies will be much larger than private companies on every measured dimension (revenue and employee count).
2. Silicon Valley will dominate in terms of profits.
3. A power-law distribution is at work in all numerical variables.