In [39]:
%pip install lxml

import pandas as pd

tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)')


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [40]:
tables # Extract the first table which contains the GDP data

[                                                   0
 0  Largest economies in the world by GDP (nominal...,
                                                    0  \
 0  > $20 trillion $10–20 trillion $5–10 trillion ...   
 
                                                    1  \
 0  $750 billion – $1 trillion $500–750 billion $2...   
 
                                                    2  
 0  $50–100 billion $25–50 billion $5–25 billion <...  ,
     Country/Territory IMF[1][12]            World Bank[13]             \
     Country/Territory   Forecast       Year       Estimate       Year   
 0               World  113795678       2025      105435540       2023   
 1       United States   30507217       2025       27360935       2023   
 2               China   19231705  [n 1]2025       17794782  [n 3]2023   
 3             Germany    4744804       2025        4456081       2023   
 4               India    4187017       2025        3549919       2023   
 ..                ...      

In [41]:
len(tables)  # Check how many tables were extracted

7

In [42]:
failure = tables[0]  # Extract the first table which contains the GDP data
failure.head()  # Display the first few rows of the table to verify its structure

Unnamed: 0,0
0,Largest economies in the world by GDP (nominal...


In [43]:
failure1 = tables[1]  # Extract the second table which contains the GDP data
failure1.head()
  # Drop the first level of the MultiIndex columns

Unnamed: 0,0,1,2
0,> $20 trillion $10–20 trillion $5–10 trillion ...,$750 billion – $1 trillion $500–750 billion $2...,$50–100 billion $25–50 billion $5–25 billion <...


In [44]:
failure2 = tables[2]  # Extract the third table which contains the GDP data
failure2.head()  # Display the first few rows of the third table to verify its structure

Unnamed: 0_level_0,Country/Territory,IMF[1][12],IMF[1][12],World Bank[13],World Bank[13],United Nations[14],United Nations[14]
Unnamed: 0_level_1,Country/Territory,Forecast,Year,Estimate,Year,Estimate,Year
0,World,113795678,2025,105435540,2023,100834796,2022
1,United States,30507217,2025,27360935,2023,25744100,2022
2,China,19231705,[n 1]2025,17794782,[n 3]2023,17963170,[n 1]2022
3,Germany,4744804,2025,4456081,2023,4076923,2022
4,India,4187017,2025,3549919,2023,3465541,2022


In [45]:
failure3 = tables[3]  # Extract the fourth table which contains the GDP data
failure3.head()  # Display the first few rows of the fourth table to verify its structure

Unnamed: 0,vteLists of countries by financial rankings,vteLists of countries by financial rankings.1
0,Trade,Account balance Exports by product merchandise...
1,Investment,FDI received past FDI abroad GFI
2,Funds,Forex reserves Gold reserves Sovereign wealth ...
3,Budget and debt,Government budget PPP % of GDP per capita Cred...
4,Income and taxes,Wage average Disposable minimum Wage growth GN...


In [46]:
failure2.columns

MultiIndex([( 'Country/Territory', 'Country/Territory'),
            (        'IMF[1][12]',          'Forecast'),
            (        'IMF[1][12]',              'Year'),
            (    'World Bank[13]',          'Estimate'),
            (    'World Bank[13]',              'Year'),
            ('United Nations[14]',          'Estimate'),
            ('United Nations[14]',              'Year')],
           )

In [47]:
# Extract only the 4-digit year from the string, then convert to datetime
close_time = pd.to_datetime(
	failure2[('IMF[1][12]', 'Year')].astype(str).str.extract(r'(\d{4})')[0],
	format='%Y'
)
# Assign the extracted datetime to the 'close_time' column
failure2['close_time'] = close_time
# Display the first few rows of the modified table to verify the new 'close_time' column
close_time.dt.year.value_counts
# Extract the GDP data for the year 2021

<bound method IndexOpsMixin.value_counts of 0      2025.0
1      2025.0
2      2025.0
3      2025.0
4      2025.0
        ...  
205    2025.0
206    2025.0
207    2025.0
208    2025.0
209    2025.0
Name: 0, Length: 210, dtype: float64>

In [48]:
close_time.dt.year.value_counts()
# Filter the table for the year 2021

0
2025.0    190
2023.0      1
Name: count, dtype: int64

In [57]:
from lxml import objectify

# Sanitize column names to be valid XML tags
import re

failure2_clean = failure2.copy()
def sanitize_xml_tag(tag):
    # Replace invalid characters with underscore and remove brackets
    tag = re.sub(r'[^a-zA-Z0-9_.-]', '_', str(tag))
    tag = re.sub(r'[\[\]]', '', tag)
    # Ensure tag does not start with a digit
    if re.match(r'^\d', tag):
        tag = '_' + tag
    return tag

failure2_clean.columns = [
    sanitize_xml_tag('_'.join([str(c) for c in col])) if isinstance(col, tuple)
    else sanitize_xml_tag(col)
    for col in failure2_clean.columns
]

# Save the DataFrame to an XML file
xml_path = 'gdp_data.xml'
failure2_clean.to_xml(xml_path, index=False)

with open(xml_path) as f:
    parsed = objectify.parse(f)
root = parsed.getroot()

# Example: Extract all rows as dicts
data_gdp = []
for row in root.iterchildren():
    row_data = {}
    for child in row.iterchildren():
        row_data[child.tag] = child.pyval
    data_gdp.append(row_data)

# Convert the extracted data into a DataFrame
performance_df = pd.DataFrame(data_gdp)
performance_df.head()

Unnamed: 0,Country_Territory_Country_Territory,IMF_1__12__Forecast,IMF_1__12__Year,World_Bank_13__Estimate,World_Bank_13__Year,United_Nations_14__Estimate,United_Nations_14__Year,close_time_
0,World,113795678,2025,105435540,2023,100834796,2022,2025-01-01 00:00:00
1,United States,30507217,2025,27360935,2023,25744100,2022,2025-01-01 00:00:00
2,China,19231705,[n 1]2025,17794782,[n 3]2023,17963170,[n 1]2022,2025-01-01 00:00:00
3,Germany,4744804,2025,4456081,2023,4076923,2022,2025-01-01 00:00:00
4,India,4187017,2025,3549919,2023,3465541,2022,2025-01-01 00:00:00


In [None]:
skip_rows = ["close_time", "IMF[1][12]"]

# Clear data_gdp if you want to start fresh, otherwise you will append to existing data
data_gdp = []

for row in root.iterchildren():
    el_data = {}
    for child in row.iterchildren():
        if child.tag in skip_rows:
            continue  # Skip the 'close_time' and 'IMF[1][12]' columns
        el_data[child.tag] = child.pyval
    data_gdp.append(el_data)

performance2 = pd.DataFrame(data_gdp)  # Convert the list of dictionaries to a DataFrame
performance2.head()  # Display the first few rows of the DataFrame to verify its structure

AttributeError: no such child: INDICATOR