In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from fake_useragent import UserAgent
from time import sleep
import re

In [3]:
ua = UserAgent()
header = {'User-Agent':str(ua.chrome)}

url = "https://www.value.today/headquarters/united-states-america-usa?page=0"

source = requests.get(url, headers=header)
soup = BeautifulSoup(source.content, "html.parser")

In [4]:
info_blocks = soup.find_all(class_="node node--type-listed-companies node--view-mode-teaser ds-2col-stacked clearfix")

In [165]:
collected_names = []
collected_businesses = []
collected_values = []
collected_values_metrics = []
collected_revenues = []
collected_revenues_metrics = []
collected_op_incomes = []
collected_op_incomes_metrics = []
collected_net_incomes = []
collected_net_incomes_metrics = []
collected_assets = []
collected_assets_metrics = []
collected_liabilities = []
collected_liabilities_metrics = []

for item in info_blocks:

    name = item.find("h2").text.strip()
    collected_names.append(name)

    business = item.select("div.field--name-field-company-category-primary > div > div.field--item > a")[0].text.strip()
    collected_businesses.append(business)

    value = item.select("div.field--name-field-market-value-jan012021 > div.field--item")[0]["content"]
    collected_values.append(value)

    value_string = item.select("div.field--name-field-market-value-jan012021 > div.field--item")[0].text.strip()
    if "Billion" in value_string:
        collected_values_metrics.append("Billion")
    elif "Million" in value_string:
        collected_values_metrics.append("Million")

    revenue_string = item.select("div.field--name-field-annual-revenue > div.field--item")[0].text.strip()
    revenue = re.findall("\d+[.,]\d+", revenue_string)
    collected_revenues.append(revenue[0])

    if "Billion" in revenue_string:
        collected_revenues_metrics.append("Billion")
    elif "Million" in revenue_string:
        collected_revenues_metrics.append("Million")
        
    op_income_item = item.select("div.field--name-field-annual-operating-income > div.field--item")
    if len(op_income_item) == 0:
        collected_op_incomes.append(None)
    else:
        op_income_string = op_income_item[0].text.strip()
        op_income = re.findall("\d+[.,]\d+", op_income_string)
        collected_op_incomes.append(op_income[0])

    if len(op_income_item) == 0:
        collected_op_incomes_metrics.append(None)
    elif "Billion" in op_income_string:
        collected_op_incomes_metrics.append("Billion")
    elif "Million" in op_income_string:
        collected_op_incomes_metrics.append("Million")

    net_income_string = item.select("div.field--name-field-annual-revenue > div.field--item")[0].text.strip()
    net_income = re.findall("\d+[.,]\d+", net_income_string)
    collected_net_incomes.append(net_income[0])

    if "Billion" in net_income_string:
        collected_net_incomes_metrics.append("Billion")
    elif "Million" in net_income_string:
        collected_net_incomes_metrics.append("Million")

    assets_string = item.select("div.field--name-field-total-assets > div.field--item")[0].text.strip()
    assets = re.findall("\d+[.,]\d+", assets_string)
    collected_assets.append(assets[0])

    if "Billion" in assets_string:
        collected_assets_metrics.append("Billion")
    elif "Million" in assets_string:
        collected_assets_metrics.append("Million")

    liabilities_string = item.select("div.field--name-field-total-liabilities- > div.field--item")[0].text.strip()
    liabilities = re.findall("\d+[.,]\d+", liabilities_string)
    collected_liabilities.append(liabilities[0])

    if "Billion" in liabilities_string:
        collected_liabilities_metrics.append("Billion")
    elif "Million" in liabilities_string:
        collected_liabilities_metrics.append("Million")

df = pd.DataFrame({"Company Name": collected_names, "Company Business": collected_businesses, "Market Value": collected_values, "Value Metric": collected_values_metrics, "Annual Revenue": collected_revenues, "Revenue Metric": collected_revenues_metrics, "Operating Income": collected_op_incomes, "Op Income Metric": collected_op_incomes_metrics, "Net Income": collected_net_incomes, "Net Income Metric": collected_net_incomes_metrics, "Assets": collected_assets, "Assets Metric": collected_assets_metrics, "Liabilities": collected_liabilities, "Liabilities Metric": collected_liabilities_metrics})

In [166]:
numeric_columns = ["Market Value", "Annual Revenue", "Operating Income", "Net Income", "Assets", "Liabilities"]

df[numeric_columns] = df[numeric_columns].replace(",", "", regex=True)
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)

In [189]:
df

Unnamed: 0,Company Name,Company Business,Market Value,Value Metric,Annual Revenue,Revenue Metric,Operating Income,Op Income Metric,Net Income,Net Income Metric,Assets,Assets Metric,Liabilities,Liabilities Metric
0,APPLE,Technology,2256.0,Billion,274.51,Billion,66.28,Billion,274.51,Billion,317344.0,Million,245062.0,Million
1,MICROSOFT CORPORATION,Technology,1682.0,Billion,143.0,Billion,52.95,Billion,143.0,Billion,301311.0,Million,183007.0,Million
2,ALPHABET,Technology,1185.0,Billion,182.52,Billion,41.22,Billion,182.52,Billion,299.243,Billion,86.32,Billion
3,AMAZON.COM,eCommerce,1634.0,Billion,386.06,Billion,22.89,Billion,386.06,Billion,110908.0,Million,184586.0,Million
4,FACEBOOK,Communication Services,778.04,Billion,85.96,Billion,32.67,Billion,85.96,Billion,139691.0,Million,29244.0,Million
5,BERKSHIRE HATHAWAY,Financial Services,543.68,Billion,327.22,Billion,,,327.22,Billion,829.946,Billion,410.73,Billion
6,J P MORGAN CHASE & CO,Financial Services,387.33,Billion,122.9,Billion,,,122.9,Billion,3386.0,Billion,3106.0,Billion
7,VISA,Financial Services,482.17,Billion,21.84,Billion,14.09,Billion,21.84,Billion,77884.0,Million,42360.0,Million
8,JOHNSON & JOHNSON,Healthcare,414.31,Billion,82.58,Billion,19.73,Billion,82.58,Billion,158380.0,Million,95402.0,Million
9,WALMART,Retail,407.84,Billion,523.96,Billion,22.54,Billion,523.96,Billion,237382.0,Million,162072.0,Million


In [227]:
final_df = df[["Company Name", "Company Business"]]

In [228]:
final_df

Unnamed: 0,Company Name,Company Business
0,APPLE,Technology
1,MICROSOFT CORPORATION,Technology
2,ALPHABET,Technology
3,AMAZON.COM,eCommerce
4,FACEBOOK,Communication Services
5,BERKSHIRE HATHAWAY,Financial Services
6,J P MORGAN CHASE & CO,Financial Services
7,VISA,Financial Services
8,JOHNSON & JOHNSON,Healthcare
9,WALMART,Retail


In [232]:
def change_num_values(number_column: str, metric_column: str) -> None:
    final_df[number_column] = 0 
    for count, row in enumerate(df[metric_column]):
        if row == "Million":
            final_df[number_column][count] = df[number_column][count] * 1000000
        if row == "Billion":
            final_df[number_column][count] = df[number_column][count] * 1000000000

In [233]:
change_num_values("Market Value", "Value Metric")
change_num_values("Annual Revenue", "Revenue Metric")
change_num_values("Operating Income", "Op Income Metric")
change_num_values("Net Income", "Net Income Metric")
change_num_values("Assets", "Assets Metric")
change_num_values("Liabilities", "Liabilities Metric")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df[number_column] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df[number_column][count] = df[number_column][count] * 1000000000
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  change_num_values("Market Value", "Value Metric")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.