In [1]:
import requests
import pandas as pd

from bs4 import BeautifulSoup

import numpy as np

from io import StringIO

In [2]:
# Fetch the rendered HTML of the billionaires article through the MediaWiki
# `parse` API; `prop=text` asks only for the parsed page text.
url = "https://en.wikipedia.org/w/api.php"
params = {
    "action": "parse",
    "page": "The World's Billionaires",
    "format": "json",
    "prop": "text",
}

# requests applies no timeout by default, so an unresponsive server would hang
# the kernel indefinitely; bound the wait explicitly.
response = requests.get(url, params=params, timeout=30)
# Surface HTTP errors here rather than as a confusing failure in
# response.json() or data['parse'] in the following cells.
response.raise_for_status()

response.status_code

200

In [3]:
# Decode the JSON body of the API response into Python objects.
data = response.json()

In [4]:
type(data['parse']['text'])

dict

In [5]:
type(data['parse']['text']['*'])

str

In [6]:
# The parsed page is a str stored under the '*' key of data['parse']['text']
# (see the type checks in the two cells above).
html_content = data['parse']['text']['*']

In [7]:
# Build a navigable tree from the article HTML using Python's built-in parser.
soup = BeautifulSoup(html_content, 'html.parser')


In [8]:
# Every <table> and every <h3> element found in the article.
# NOTE(review): neither name is read again in the cells shown here; the
# segmentation loop in the next cell runs its own find_all.
tables = soup.find_all("table")
headings = soup.find_all("h3")

In [9]:
# Walk headings and tables in document order and tag every table with the year
# taken from the most recent <h3> heading that mentions one.
year_labels = [str(year) for year in range(1987, 2026)]  # built once, not per heading
dfs = []
current_year = "unknown"  # label for tables that appear before any year heading

for tag in soup.find_all(['h3', 'table']):
    if tag.name == 'h3':
        text = tag.get_text()
        # Lowest recognised year contained in the heading text; a heading
        # without a year leaves current_year unchanged.
        current_year = next(
            (label for label in year_labels if label in text),
            current_year,
        )
    elif tag.name == 'table':
        try:
            df = pd.read_html(StringIO(str(tag)))[0]
        except ValueError as e:
            # read_html raises ValueError when it finds no parsable table;
            # skip those, but let unrelated errors (e.g. a missing parser
            # dependency) propagate instead of being silently printed.
            print("Skipped a table:", e)
        else:
            df['year'] = current_year
            dfs.append(df)



# combine all tables
final_df = pd.concat(dfs, ignore_index=True)

display(final_df.head())

            

Unnamed: 0,0,1,year,Icon,Description,No.,Name,Net worth (USD),Age,Nationality,...,Year,Number of billionaires,Group's combined net worth,vteForbes magazine,vteForbes magazine.1,vteForbes magazine.2,vteBillionaires,vteBillionaires.1,vteExtreme wealth,vteExtreme wealth.1
0,"List of the world's billionaires, ranked in or...","List of the world's billionaires, ranked in or...",unknown,,,,,,,,...,,,,,,,,,,
1,The net worth of the world's billionaires incr...,The net worth of the world's billionaires incr...,unknown,,,,,,,,...,,,,,,,,,,
2,Publication details,Publication details,unknown,,,,,,,,...,,,,,,,,,,
3,Publisher,Whale Media InvestmentsForbes family,unknown,,,,,,,,...,,,,,,,,,,
4,Publication,Forbes,unknown,,,,,,,,...,,,,,,,,,,


In [10]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 483 entries, 0 to 482
Data columns (total 26 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   0                            31 non-null     object 
 1   1                            31 non-null     object 
 2   year                         483 non-null    object 
 3   Icon                         0 non-null      float64
 4   Description                  3 non-null      object 
 5   No.                          262 non-null    float64
 6   Name                         392 non-null    object 
 7   Net worth (USD)              392 non-null    object 
 8   Age                          302 non-null    object 
 9   Nationality                  392 non-null    object 
 10  Primary source(s) of wealth  40 non-null     object 
 11  Source(s) of wealth          352 non-null    object 
 12  No.[49]                      100 non-null    float64
 13  No.[61]             

In [11]:
# Keep only the columns used in the rest of the billionaires section.
# .copy() makes this an independent frame, so later cells can modify it
# without pandas treating the write as an assignment to a slice of final_df.
df = final_df[["year","Nationality","Name","Net worth (USD)"]].copy()

In [12]:
# Drop rows in which every selected column is missing.
# NOTE(review): in the recorded run this removes nothing (483 rows before and
# after), since `year` is non-null on every row.
df = df.dropna(how = 'all')

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 483 entries, 0 to 482
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   year             483 non-null    object
 1   Nationality      392 non-null    object
 2   Name             392 non-null    object
 3   Net worth (USD)  392 non-null    object
dtypes: object(4)
memory usage: 15.2+ KB


In [14]:
# Tables seen before the first year heading carry the "unknown" placeholder:
# turn it into NaN and drop those rows. Returning a new frame (rather than
# assigning into df and calling dropna(inplace=True)) avoids chained-assignment
# warnings and keeps the cell safe to re-run.
df = df.assign(year=df["year"].replace("unknown", np.nan)).dropna(subset=["year"])

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 464 entries, 19 to 482
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   year             464 non-null    object
 1   Nationality      392 non-null    object
 2   Name             392 non-null    object
 3   Net worth (USD)  392 non-null    object
dtypes: object(4)
memory usage: 18.1+ KB


In [16]:
df.head()

Unnamed: 0,year,Nationality,Name,Net worth (USD)
19,2025,South Africa Canada United States,Elon Musk,$342 billion
20,2025,United States,Mark Zuckerberg,$216 billion
21,2025,United States,Jeff Bezos,$215 billion
22,2025,United States,Larry Ellison,$192 billion
23,2025,France,Bernard Arnault & family,$178 billion


In [17]:
df[df.isna().any(axis= 1)]

Unnamed: 0,year,Nationality,Name,Net worth (USD)
411,1987,,,
412,1987,,,
413,1987,,,
414,1987,,,
415,1987,,,
...,...,...,...,...
478,1987,,,
479,1987,,,
480,1987,,,
481,1987,,,


In [18]:
# Drop rows with a missing value in any remaining column (in the recorded run
# these are rows 411-482, labelled 1987, which have no Nationality, Name or
# Net worth).
# `df` is rebound to the population table later in the notebook, so the cleaned
# billionaires frame is also kept under its own name to stay reachable.
billionaires_df = df.dropna()
df = billionaires_df

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 19 to 410
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   year             392 non-null    object
 1   Nationality      392 non-null    object
 2   Name             392 non-null    object
 3   Net worth (USD)  392 non-null    object
dtypes: object(4)
memory usage: 15.3+ KB


In [20]:
df.head()

Unnamed: 0,year,Nationality,Name,Net worth (USD)
19,2025,South Africa Canada United States,Elon Musk,$342 billion
20,2025,United States,Mark Zuckerberg,$216 billion
21,2025,United States,Jeff Bezos,$215 billion
22,2025,United States,Larry Ellison,$192 billion
23,2025,France,Bernard Arnault & family,$178 billion


In [21]:
df.year.unique()

array(['2025', '2024', '2023', '2022', '2021', '2020', '2019', '2018',
       '2017', '2016', '2015', '2014', '2013', '2012', '2011', '2010',
       '2009', '2008', '2007', '2006', '2005', '2004', '2003', '2002',
       '2001', '2000', '1999', '1998', '1997', '1996', '1995', '1994',
       '1993', '1992', '1991', '1990', '1989', '1988', '1987'],
      dtype=object)

## Population data pull

In [22]:
from io import StringIO

In [23]:
# Fetch the rendered HTML of the population list through the MediaWiki `parse`
# API. `prop=text` asks only for the parsed page text, which is the only part
# of the response the following cells read.
url = "https://en.wikipedia.org/w/api.php"
params = {
    "action": "parse",
    "page": "List of countries and dependencies by population",
    "format": "json",
    "prop": "text",
}

# requests applies no timeout by default; bound the wait so the kernel cannot
# hang, and fail fast on an HTTP error instead of in response.json() later.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()

response.status_code

200

In [24]:
# Decode the JSON body of the population request; this rebinds `data`, which
# previously held the billionaires response.
data = response.json()

In [25]:
data.keys()

dict_keys(['parse'])

In [26]:
type(data["parse"])

dict

In [27]:
data["parse"].keys()



In [28]:
type(data['parse']['text'])

dict

In [29]:
# Parsed page text of the population article, stored under the '*' key.
html = data['parse']['text']['*']

In [30]:
# Parse the population page; this rebinds `soup`, which previously held the
# billionaires article.
soup = BeautifulSoup(html, 'html.parser')

In [31]:
# Despite the singular name, this holds every <table> element on the page:
# find_all returns all matches, not just the first.
table = soup.find_all('table')

In [32]:
# Parse the first table of the population page directly from its HTML.
# `table` is the collection of every <table> element, so str(table) yields a
# Python-list repr (brackets and commas around the markup) rather than HTML;
# handing read_html the page HTML itself returns the same first table without
# relying on the parser tolerating that stray text.
# skiprows=[1] drops the row right after the header
# (presumably the world-total row - confirm against the page).
df = pd.read_html(StringIO(html), skiprows=[1])[0]

In [33]:
type(df)

pandas.core.frame.DataFrame

In [34]:
df.head()

Unnamed: 0,Location,Population,% of world,Date,Source (official or from the United Nations),Notes
0,India,1413324000,17.3%,1 Mar 2025,Official projection[4],[b]
1,China,1408280000,17.2%,31 Dec 2024,Official estimate[5],[c]
2,United States,340110988,4.2%,1 Jul 2024,Official estimate[6],[d]
3,Indonesia,282477584,3.5%,30 Jun 2024,National annual projection[7],
4,Pakistan,241499431,3.0%,1 Mar 2023,2023 census result[8],[e]


In [35]:
# Keep Location, Population and Date. Selecting by name instead of
# iloc[:, 0:4] followed by drop() means the cell does not depend on column
# order and can be re-run without drop() raising KeyError for '% of world'.
df = df[["Location", "Population", "Date"]]

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239 entries, 0 to 238
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Location    239 non-null    object
 1   Population  239 non-null    int64 
 2   Date        239 non-null    object
dtypes: int64(1), object(2)
memory usage: 5.7+ KB


In [37]:
df.Location.unique()

array(['India', 'China', 'United States', 'Indonesia', 'Pakistan',
       'Nigeria', 'Brazil', 'Bangladesh', 'Russia', 'Mexico', 'Japan',
       'Philippines', 'Ethiopia', 'Democratic Republic of the Congo',
       'Egypt', 'Vietnam', 'Iran', 'Turkey', 'Germany', 'France',
       'United Kingdom', 'Thailand', 'South Africa', 'Tanzania', 'Italy',
       'Colombia', 'Kenya', 'Myanmar', 'South Korea', 'Sudan', 'Spain',
       'Algeria', 'Argentina', 'Uganda', 'Iraq', 'Afghanistan', 'Canada',
       'Uzbekistan', 'Poland', 'Morocco', 'Angola', 'Malaysia', 'Peru',
       'Mozambique', 'Ghana', 'Ukraine', 'Yemen', 'Saudi Arabia',
       'Madagascar', 'Ivory Coast', 'Nepal', 'Cameroon', 'Venezuela',
       'Australia', 'Niger', 'North Korea', 'Syria', 'Burkina Faso',
       'Taiwan', 'Mali', 'Sri Lanka', 'Kazakhstan', 'Malawi', 'Chile',
       'Zambia', 'Romania', 'Somalia', 'Chad', 'Senegal', 'Netherlands',
       'Guatemala', 'Cambodia', 'Ecuador', 'Zimbabwe', 'South Sudan',
       'Guinea'

In [38]:
#df.to_csv("countries_population.csv",encoding= "utf-8", header= True, index= False)