In [88]:
# Import libraries as needed
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re

In [89]:

url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
response = requests.get(url)
html_content = response.text

In [92]:
soup = BeautifulSoup(html_content, "html.parser")
table = soup.find_all("table")
print("Total number of tables are {}".format(len(table)))


Total number of tables are 7


In [8]:
data_table = soup.find("table", {"class": "wikitable"})
print(type(data_table))

<class 'bs4.element.Tag'>


In [20]:
colspan_cells = data_table.find_all("th", attrs={"colspan": True})

source_names = []
for cell in colspan_cells:
    link = cell.find("a")
    if link:
        source_name = link.text.strip()
        source_names.append(source_name)

print(source_names)

['IMF', 'World Bank', 'United Nations']


In [82]:
# Exclude header rows
data_rows = data_table.find_all("tr")[3:]  

# Extract first source name
first_source_name = soup.find("th", {"colspan": "2"}).find("a").text.strip()

pattern = re.compile(r"\s*\[.*?\]")

# Extract data for the first source
source_data = []
for row in data_rows:
    columns = row.find_all("td")
    if len(columns) > 1:
        country = columns[0].text.strip()
        estimate = re.sub(pattern, "", columns[2].text.strip()) # Extract numeric values only
        year = re.sub(r"\D", "", columns[3].text.strip())  # Extract numeric values only
        source_data.append([country, estimate, year])

# Create dataframe
df = pd.DataFrame(source_data, columns=["Country/Territory", "Estimate", "Year"])

print("First Source Name:", first_source_name)
print(df)


First Source Name: IMF
    Country/Territory    Estimate   Year
0       United States  26,854,599   2023
1               China  19,373,586  12023
2               Japan   4,409,738   2023
3             Germany   4,308,854   2023
4               India   3,736,882   2023
..                ...         ...    ...
208          Anguilla           —       
209          Kiribati         248   2023
210             Nauru         151   2023
211        Montserrat           —       
212            Tuvalu          65   2023

[213 rows x 3 columns]


In [83]:
# Exclude header rows
data_rows = data_table.find_all("tr")[3:]  

# Extract second source name
second_source_name = soup.find("th", {"colspan": "2"}).find_next_sibling("th").find("a").text.strip()

# Extract data for the second source
second_source_data = []
for row in data_rows:
    columns = row.find_all("td")
    if len(columns) > 1:
        country = columns[0].text.strip()
        estimate = re.sub(pattern, "", columns[4].text.strip()) # Extract numeric values only
        year = re.sub(r"\D", "", columns[5].text.strip())  # Extract numeric values only
        second_source_data.append([country, estimate, year])

# Create dataframe for the second source
second_df = pd.DataFrame(second_source_data, columns=["Country/Territory", "Estimate", "Year"])

print("Second Source Name:", second_source_name)
print(second_df)


Second Source Name: World Bank
    Country/Territory    Estimate   Year
0       United States  25,462,700   2022
1               China  17,963,171  32022
2               Japan   4,231,141   2022
3             Germany   4,072,192   2022
4               India   3,385,090   2022
..                ...         ...    ...
208          Anguilla         303   2021
209          Kiribati         223   2022
210             Nauru         151   2022
211        Montserrat          72   2021
212            Tuvalu          60   2022

[213 rows x 3 columns]


In [94]:
data_rows = data_table.find_all("tr")[2:]  # Exclude header rows

# Extract third source name
third_source_name = soup.find('th', colspan='2').find_next_sibling('th').find_next_sibling('th').find('a').text.strip()
third_source_name

'United Nations'

In [93]:
# Extract data for the second source
third_source_data = []
for row in data_rows:
    columns = row.find_all('td')
    if len(columns) > 1:
        country = columns[0].text.strip()
        estimate_3 = re.sub(r'[^0-9,]', '', columns[6].text.strip())  # Extract numeric values only
        year_3 = re.sub(r'[^0-9]', '', columns[7].text.strip())  # Extract numeric values only
        third_source_data.append([country, estimate, year])

# Create dataframe for the third source
third_df = pd.DataFrame(third_source_data, columns=['Country/Territory', 'Estimate', 'Year'])

print('Third Source Name:', third_source_name)
print(third_df)

Third Source Name: United Nations
  Country/Territory Estimate  Year
0     United States       60  2022
1             China       60  2022
2             Japan       60  2022
3           Germany       60  2022


In [96]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, "html.parser")
table = soup.find("table", {"class": "wikitable"})
source_names = []
data_rows = []

for row in table.find_all("tr"):
    columns = row.find_all("td")

    # Extract source names
    if len(columns) > 1:
        source_name = columns[0].text.strip()
        source_names.append(source_name)

    # Extract data rows
    if len(columns) > 2:
        data_row = [column.text.strip() for column in columns[1:]]
        data_rows.append(data_row)

# Iterate through the sources and create separate dataframes
dfs = []
for i, source_name in enumerate(source_names):
    header_row = data_rows[0]
    data_rows = data_rows[1:]

    # Handle varying number of columns
    num_columns = len(header_row)
    data_rows = [row[:num_columns] for row in data_rows]

    df_source = pd.DataFrame(data_rows, columns=header_row)
    dfs.append(df_source)

# Access the dataframes for each source
for i, df in enumerate(dfs):
    print(f"Dataframe for source {i+1}:")
    print(df)
    print()


Dataframe for source 1:
            — 105,568,776       2023 101,002,997       2022  96,698,005  \
0    Americas  26,854,599       2023  25,462,700       2022  23,315,081   
1        Asia  19,373,586  [n 1]2023  17,963,171  [n 3]2022  17,734,131   
2        Asia   4,409,738       2023   4,231,141       2022   4,940,878   
3      Europe   4,308,854       2023   4,072,192       2022   4,259,935   
4        Asia   3,736,882       2023   3,385,090       2022   3,201,471   
..        ...         ...        ...         ...        ...         ...   
208  Americas           —          —         303       2021        None   
209   Oceania         248       2023         223       2022         227   
210   Oceania         151       2023         151       2022         155   
211  Americas           —          —          72       2021        None   
212   Oceania          65       2023          60       2022          60   

          2021  
0         2021  
1    [n 1]2021  
2         2021  
3      

      Africa  3,916   2023  3,515   2022
0     Europe  3,669   2023  3,352   2022
1   Americas  3,633   2023      —  3,126
2     Africa  3,520   2023  3,970   2022
3   Americas  3,470   2023  3,621   2022
4     Africa  3,234   2023  3,073   2022
5   Americas  3,162   2023  2,824   2022
6   Americas      —      —  3,273   2021
7     Africa  2,736   2023  2,383   2022
8   Americas      —      —  2,700   2021
9       Asia  2,683   2023      —  2,381
10    Africa  2,666   2023      —  2,255
11    Africa  2,584   2023  2,553   2022
12    Africa  2,468   2023  2,315   2022
13    Africa  2,277   2023  2,273   2022
14  Americas  2,262   2023  2,065   2022
15      Asia  1,988   2023  3,163   2022
16    Africa  1,950   2023  1,588   2022
17    Africa  1,887   2023  1,634   2022
18  Americas  1,864   2023  1,758   2022
19    Europe  1,807   2023      —  1,702
20    Africa      —      —  2,080   2021
21   Oceania  1,701   2023  1,596   2022
22  Americas      —      —  1,539   2021
23    Africa  1,

In [98]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, "html.parser")
table = soup.find("table", {"class": "wikitable"})
source_names = []
data_rows = []
current_source = None

for row in table.find_all("tr"):
    columns = row.find_all("td")

    # Extract source names
    if len(columns) > 1:
        source_name = columns[0].text.strip()
        source_names.append(source_name)
        current_source = source_name

    # Extract data rows
    if len(columns) > 2:
        data_row = [column.text.strip() for column in columns[1:]]
        data_row.insert(0, current_source)  # Add the current source to the data row
        data_rows.append(data_row)

# Create a DataFrame with the combined data
header_row = ['Source'] + data_rows[0][1:]  # Use the first data row as the header row
data_rows = data_rows[1:]  # Remove the header row from the data rows
df_combined = pd.DataFrame(data_rows, columns=header_row)

# Print the DataFrame in tabular format
print(df_combined.to_string(index=False))


                          Source        — 105,568,776       2023 101,002,997       2022 96,698,005       2021
                   United States Americas  26,854,599       2023  25,462,700       2022 23,315,081       2021
                           China     Asia  19,373,586  [n 1]2023  17,963,171  [n 3]2022 17,734,131  [n 1]2021
                           Japan     Asia   4,409,738       2023   4,231,141       2022  4,940,878       2021
                         Germany   Europe   4,308,854       2023   4,072,192       2022  4,259,935       2021
                           India     Asia   3,736,882       2023   3,385,090       2022  3,201,471       2021
                  United Kingdom   Europe   3,158,938       2023   3,070,668       2022  3,131,378       2021
                          France   Europe   2,923,489       2023   2,782,905       2022  2,957,880       2021
                           Italy   Europe   2,169,745       2023   2,010,432       2022  2,107,703       2021
          