# Forbes' 2025 Global 2000

### 1. Web Scraping

Site: https://www.forbes.com/lists/global2000/      
We will be scraping the table records listing the top 2000 public companies in the world.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Scrape the Global 2000 table rows from the Forbes list page into `records`
# (one list of cell strings per table row).
base_url = "https://www.forbes.com/lists/global2000/"
records = []

# Collect HTML data from this page.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops on an HTTP error (e.g. 403/404) instead of quietly
# parsing an error page into zero records.
response = requests.get(base_url, timeout=30)
response.raise_for_status()

# Parse content
content = response.content
parsed_content = BeautifulSoup(content, 'html.parser')

# Each row is wrapped in <a class="table-row ...">
for row in parsed_content.find_all("a", class_="table-row"):
    cols = row.find_all("div", class_="row-cell-value")
    if len(cols) >= 8:  # skip rows that do not carry every expected column
        records.append([col.get_text(strip=True) for col in cols])


In [None]:
# Turn the scraped rows into a DataFrame, labelling the eight table columns
column_headers = [
    "RANK",
    "NAME",
    "HEADQUARTERS",
    "INDUSTRY",
    "SALES",
    "PROFIT",
    "ASSETS",
    "MARKET VALUE",
]
df = pd.DataFrame(data=records, columns=column_headers)

In [3]:
# Preview the first rows to sanity-check that the scrape landed in the right columns
df.head()

Unnamed: 0,RANK,NAME,HEADQUARTERS,INDUSTRY,SALES,PROFIT,ASSETS,MARKET VALUE
0,1,JPMorganChase,United States,Banking,$285.11 B,$59.36 B,"$4,357.86 B",$677.8 B
1,2,Berkshire Hathaway,United States,Insurance,$371.43 B,$89 B,"$1,153.88 B","$1,145.46 B"
2,3,ICBC,China,Banking,$221.96 B,$50.84 B,"$6,688.6 B",$251.33 B
3,4,Saudi Arabian Oil Company (Saudi Aramco),Saudi Arabia,Oil & Gas Operations,$480.15 B,$104.97 B,$645.03 B,"$1,663.38 B"
4,5,Amazon,United States,Retail and Wholesale,$637.96 B,$59.25 B,$624.89 B,"$2,005.64 B"


In [4]:
# Count missing values per column
df.isna().sum()

RANK            0
NAME            0
HEADQUARTERS    0
INDUSTRY        0
SALES           0
PROFIT          0
ASSETS          0
MARKET VALUE    0
dtype: int64

In [13]:
# Persist the raw scrape to disk before the Data Cleaning stage, so the
# untouched data can be reloaded later.
df.to_csv(
    "Original_Scraped_data.csv",
    index=False,  # do not write the row index as an extra column
)

### 2. Data Cleaning

In [2]:
# The first thing that catches my eye is the formatting of the numeric values.
# Let's retain just the numbers, since we know all of them are in Billions (B) and the currency is Dollars ($).

# Reload the raw data from the CSV saved in the scraping section.
df = pd.read_csv("Original_Scraped_data.csv")

In [None]:
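# # (Disabled draft.) We'll be using Regex for pattern matching and use .replace along with it
# # NOTE: the pattern below strips only '$' and 'B'; thousands separators (e.g. "4,357.86")
# # remain in the string, so pd.to_numeric(errors="coerce") would turn those values into NaN.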
# def numeric_col_cleaning(col_name):
#     # Remove $ and B
#     df.loc[:, col_name] = df[col_name].str.replace(r'[$B]', '', regex=True) 

#     # convert to numeric
#     df.loc[:, col_name] = (
#         pd.to_numeric(df[col_name], errors="coerce").astype(float)
# )
    
# numeric_col_cleaning('SALES')
# numeric_col_cleaning('PROFIT')
# numeric_col_cleaning('ASSETS')
# numeric_col_cleaning('MARKET VALUE')

