# Forbes' 2025 Global 2000

#### 1. Web Scraping

Site: https://www.forbes.com/lists/global2000/      
We will be scraping the table records listing the top 2000 public companies in the world.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
base_url = "https://www.forbes.com/lists/global2000/"
records = []

# Collect HTML data from this page.
# The timeout stops the cell from hanging indefinitely on a stalled connection.
response = requests.get(base_url, timeout=30)

# Fail loudly on a 4xx/5xx reply instead of parsing an error page into zero records
response.raise_for_status()

# Parse content
content = response.content
parsed_content = BeautifulSoup(content, 'html.parser')

# Each row is wrapped in <a class="table-row ...">
for row in parsed_content.find_all("a", class_="table-row"):
    cols = row.find_all("div", class_="row-cell-value")
    # Skip rows with fewer than 8 value cells; such rows cannot fill every column
    if len(cols) >= 8:
        # Keep the visible text of every cell, with surrounding whitespace removed
        records.append([col.get_text(strip=True) for col in cols])


In [None]:
# Turn the list of scraped rows into a DataFrame with one named column per field
column_headers = [
    "RANK",
    "NAME",
    "HEADQUARTERS",
    "INDUSTRY",
    "SALES",
    "PROFIT",
    "ASSETS",
    "MARKET VALUE",
]
df = pd.DataFrame(data=records, columns=column_headers)

In [3]:
# Preview the first rows of the scraped table to confirm the columns line up
df.head()

Unnamed: 0,RANK,NAME,HEADQUARTERS,INDUSTRY,SALES,PROFIT,ASSETS,MARKET VALUE
0,1,JPMorganChase,United States,Banking,$285.11 B,$59.36 B,"$4,357.86 B",$677.8 B
1,2,Berkshire Hathaway,United States,Insurance,$371.43 B,$89 B,"$1,153.88 B","$1,145.46 B"
2,3,ICBC,China,Banking,$221.96 B,$50.84 B,"$6,688.6 B",$251.33 B
3,4,Saudi Arabian Oil Company (Saudi Aramco),Saudi Arabia,Oil & Gas Operations,$480.15 B,$104.97 B,$645.03 B,"$1,663.38 B"
4,5,Amazon,United States,Retail and Wholesale,$637.96 B,$59.25 B,$624.89 B,"$2,005.64 B"


In [39]:
# Count the missing values in each column
df.isna().sum()

RANK            0
NAME            0
HEADQUARTERS    0
INDUSTRY        0
SALES           0
PROFIT          0
ASSETS          0
MARKET VALUE    0
dtype: int64

In [13]:
# Let's first save this scraped dataset before proceeding to the Data Cleaning stage.
# index=False keeps the DataFrame's row index out of the file.
df.to_csv("Original_Scraped_data.csv", index=False)

### 2. Data Cleaning

In [31]:
# The first thing that catches my eye is the numeric values: they are stored as
# text with a "$" sign and a unit letter such as B (billions).
# The goal of this section is to retain just the numbers.

# Reload the saved scrape so cleaning starts from the CSV file rather than from a fresh request
df = pd.read_csv("Original_Scraped_data.csv")

In [38]:
# Inspect the dtype and non-null count of every column before cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   RANK          2000 non-null   object
 1   NAME          2000 non-null   object
 2   HEADQUARTERS  2000 non-null   object
 3   INDUSTRY      2000 non-null   object
 4   SALES         2000 non-null   object
 5   PROFIT        2000 non-null   object
 6   ASSETS        2000 non-null   object
 7   MARKET VALUE  2000 non-null   object
dtypes: object(8)
memory usage: 125.1+ KB


In [None]:
# First pass over the money columns using a regex with .str.replace
def numeric_col_cleaning(col_name):
    """Strip "$" and "B" characters from one column of the global ``df``, in place.

    The character class removes only the dollar sign and the letter B; commas
    and any other unit letters stay in the strings. Leading and trailing
    whitespace is trimmed afterwards.

    NOTE(review): a later cell defines a second function with this same name
    that takes a single value rather than a column name.

    Args:
        col_name: label of the string column in ``df`` to clean.
    """
    without_symbols = df[col_name].str.replace(r'[$B]', '', regex=True)
    df.loc[:, col_name] = without_symbols

    # Strip any whitespace left behind by the removed characters
    trimmed = df[col_name].str.strip()
    df[col_name] = trimmed


for money_col in ('SALES', 'PROFIT', 'ASSETS', 'MARKET VALUE'):
    numeric_col_cleaning(money_col)






In [None]:
# We'll will be using Regex for pattern matching replace all the $ signs, B (billion)/M (million)/K (thousands) signs
# with empty strings and remove any whitespaces if present.
# We will be keeping all the values in billions

import re

def numeric_col_cleaning(value: str) -> float:
    match value.upper():
        case value if 'K' in value:
            return float((re.sub(r'[\$,K]', '', value)).strip()) / 1000000
        case value if 'M' in value:
            return float((re.sub(r'[\$,M]', '', value)).strip()) / 1000
        case value if 'B' in value:
            return float((re.sub(r'[\$,B]', '', value)).strip())
        case _:
            return float((re.sub(r'[\$,]', '', value)).strip())

In [None]:
# Run the per-value converter over each money column and write the results
# back into the same column of df
for money_col in ('SALES', 'PROFIT', 'ASSETS', 'MARKET VALUE'):
    df.loc[:, money_col] = df[money_col].apply(numeric_col_cleaning)

In [None]:
# Cast the four money columns to a numeric dtype, one column at a time
money_cols = ['SALES', 'PROFIT', 'ASSETS', 'MARKET VALUE']
df[money_cols] = df[money_cols].apply(pd.to_numeric)

In [45]:
# Preview the first rows again: the money columns should now hold plain numbers
df.head()

Unnamed: 0,RANK,NAME,HEADQUARTERS,INDUSTRY,SALES,PROFIT,ASSETS,MARKET VALUE
0,1,JPMorganChase,United States,Banking,285.11,59.36,4357.86,677.8
1,2,Berkshire Hathaway,United States,Insurance,371.43,89.0,1153.88,1145.46
2,3,ICBC,China,Banking,221.96,50.84,6688.6,251.33
3,4,Saudi Arabian Oil Company (Saudi Aramco),Saudi Arabia,Oil & Gas Operations,480.15,104.97,645.03,1663.38
4,5,Amazon,United States,Retail and Wholesale,637.96,59.25,624.89,2005.64


In [None]:
# Checking if any other columns need to be cleaned or formatted.
# Displaying the frame shows both the first and the last rows (the middle is elided).
df

Unnamed: 0,RANK,NAME,HEADQUARTERS,INDUSTRY,SALES,PROFIT,ASSETS,MARKET VALUE
0,1,JPMorganChase,United States,Banking,285.110,59.3600,4357.86,677.800
1,2,Berkshire Hathaway,United States,Insurance,371.430,89.0000,1153.88,1145.460
2,3,ICBC,China,Banking,221.960,50.8400,6688.60,251.330
3,4,Saudi Arabian Oil Company (Saudi Aramco),Saudi Arabia,Oil & Gas Operations,480.150,104.9700,645.03,1663.380
4,5,Amazon,United States,Retail and Wholesale,637.960,59.2500,624.89,2005.640
...,...,...,...,...,...,...,...,...
1995,1996,Dino Polska,Poland,Food Markets,7.360,0.3781,3.16,13.840
1996,1997,China Aviation Oil,Singapore,Trading Companies,15.520,0.0784,1.99,0.559
1997,1998,Jet2,United Kingdom,Business Services & Supplies,8.790,0.6288,7.95,3.930
1998,1998,Keiyo Bank,Japan,Banking,0.474,0.0927,42.56,0.679


In [68]:
# Save the cleaned table to its own file, separate from the raw scrape CSV;
# index=False keeps the row index out of the file
df.to_csv("Cleaned_data.csv", index=False)

### 3. Creating columns for analysis

In [None]:
# Let's take a look at the Headquarters column first

In [None]:
# Lookup from headquarters country to its market tier:
# "Developed", "Emerging" or "Frontier".
# The countries are grouped by tier below and flattened into a single
# country -> tier dict, keeping the order in which they are listed.
market_classification = {
    country: tier
    for tier, countries in (
        (
            "Developed",
            (
                "Australia", "Austria", "Belgium", "Bermuda", "Canada",
                "Cayman Islands", "Denmark", "Finland", "France", "Germany",
                "Hong Kong", "Ireland", "Israel", "Italy", "Japan",
                "Luxembourg", "Netherlands", "Norway", "Portugal", "Singapore",
                "Spain", "Sweden", "Switzerland", "United Kingdom", "United States",
            ),
        ),
        (
            "Emerging",
            (
                "Argentina", "Brazil", "Chile", "China", "Colombia",
                "Czech Republic", "Egypt", "Greece", "Hungary", "India",
                "Indonesia", "Kuwait", "Malaysia", "Mexico", "Peru",
                "Philippines", "Poland", "Qatar", "Saudi Arabia", "South Africa",
                "South Korea", "Taiwan", "Thailand", "Turkey", "United Arab Emirates",
            ),
        ),
        (
            "Frontier",
            (
                "Bahrain", "Cyprus", "Jordan", "Kazakhstan", "Lebanon",
                "Morocco", "Oman", "Panama", "Romania", "Slovenia", "Vietnam",
            ),
        ),
    )
    for country in countries
}
