# World GDP web scraping with BeautifulSoup & Pandas

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [2]:
# Wikipedia article whose tables are scraped for nominal GDP figures.
url = (
    "https://en.wikipedia.org/wiki/"
    "List_of_countries_by_GDP_(nominal)"
)

In [3]:
# Download the page and parse it into a BeautifulSoup tree.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() turns an HTTP error response (4xx/5xx) into an
# exception here instead of letting an error page be parsed as if it were the
# article.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'lxml')

In [4]:
# Pick the third <table> element on the page (index 2); despite its name,
# `rows` holds that whole table, and later cells search inside it.
tables = soup.find_all('table')
rows = tables[2]

In [5]:
# Flatten every <td> of the table into one list of whitespace-stripped texts.
datas = rows.find_all('td')
gdp = [cell.text.strip() for cell in datas]
print(gdp[154:161])

# Data correction: add a 'Nan' placeholder after every '—'.
# In the sample printed above, a row with missing figures has a single '—'
# where other rows have two entries (value, year), so the extra placeholder
# keeps each table row at 7 entries. The padded list is built fresh instead of
# inserting into `gdp` while iterating over it.
padded = []
for cell in gdp:
    padded.append(cell)
    if cell == '—':
        padded.append('Nan')
gdp = padded
print(gdp[154:161])

# Data correction: replace the remaining '—' markers themselves with 'Nan'.
gdp = ['Nan' if cell == '—' else cell for cell in gdp]
print(gdp[154:161])
print(gdp[434:441])
print(len(gdp))

['Taiwan', '802,958', '[n 4]2024', '—', '—', 'Belgium', '655,192']
['Taiwan', '802,958', '[n 4]2024', '—', 'Nan', '—', 'Nan']
['Taiwan', '802,958', '[n 4]2024', 'Nan', 'Nan', 'Nan', 'Nan']
['Cuba', 'Nan', 'Nan', 'Nan', 'Nan', '147,193', '2022']
1470


In [57]:
# Regroup the flat list of cell texts into consecutive chunks of 7, one chunk
# per table row (1470 values -> 210 rows of 7).
table_rows = []
for start in range(0, len(gdp), 7):
    table_rows.append(gdp[start:start + 7])
gdp = table_rows
print(gdp[0])

['World', '109,529,216', '2024', '105,435,540', '2023', '100,834,796', '2022']


In [58]:
# Column labels for the 7 values of each row, in table order: the country
# name followed by a (value, year) pair for each of IMF, World Bank and UN.
description = [
    'Country/Territory',
    'Forecast-IMF',
    'Year-IMF',
    'Estimate-WorldBank',
    'Year-WorldBank',
    'Estimate-UN',
    'Year-UN',
]
print(description)

['Country/Territory', 'Forecast-IMF', 'Year-IMF', 'Estimate-WorldBank', 'Year-WorldBank', 'Estimate-UN', 'Year-UN']


In [59]:
# Build the DataFrame: one record per 7-value row in `gdp`, with the labels in
# `description` as column names. The bare `df` displays it as the cell output.
df = pd.DataFrame(data=gdp, columns=description)
df

Unnamed: 0,Country/Territory,Forecast-IMF,Year-IMF,Estimate-WorldBank,Year-WorldBank,Estimate-UN,Year-UN
0,World,109529216,2024,105435540,2023,100834796,2022
1,United States,28781083,2024,27360935,2023,25744100,2022
2,China,18532633,[n 1]2024,17794782,[n 3]2023,17963170,[n 1]2022
3,Germany,4591100,2024,4456081,2023,4076923,2022
4,Japan,4110452,2024,4212945,2023,4232173,2022
...,...,...,...,...,...,...,...
205,Kiribati,311,2024,279,2023,223,2022
206,Palau,308,2024,263,2023,225,2022
207,Marshall Islands,305,2024,284,2023,279,2022
208,Nauru,161,2024,154,2023,147,2022


In [65]:
# Correction on the three year columns.
# Some year cells carry a footnote marker in front of the year (e.g.
# '[n 1]2024'), so only the last four characters are kept. Slicing leaves
# values of four characters or fewer (plain years, the 'Nan' placeholder)
# unchanged, which is why no explicit length check is needed.
year_columns = ['Year-IMF', 'Year-WorldBank', 'Year-UN']
for column in year_columns:
    df[column] = df[column].str[-4:]
print(df[year_columns])

    Year-IMF Year-WorldBank Year-UN
0       2024           2023    2022
1       2024           2023    2022
2       2024           2023    2022
3       2024           2023    2022
4       2024           2023    2022
..       ...            ...     ...
205     2024           2023    2022
206     2024           2023    2022
207     2024           2023    2022
208     2024           2023    2022
209     2024           2023    2022

[210 rows x 3 columns]


In [61]:
# Bare expression: shows the current contents of `df` as the cell's output.
df

Unnamed: 0,Country/Territory,Forecast-IMF,Year-IMF,Estimate-WorldBank,Year-WorldBank,Estimate-UN,Year-UN
0,World,109529216,2024,105435540,2023,100834796,2022
1,United States,28781083,2024,27360935,2023,25744100,2022
2,China,18532633,2024,17794782,2023,17963170,2022
3,Germany,4591100,2024,4456081,2023,4076923,2022
4,Japan,4110452,2024,4212945,2023,4232173,2022
...,...,...,...,...,...,...,...
205,Kiribati,311,2024,279,2023,223,2022
206,Palau,308,2024,263,2023,225,2022
207,Marshall Islands,305,2024,284,2023,279,2022
208,Nauru,161,2024,154,2023,147,2022


In [62]:
# Save the table to World_GDP.csv; index=False leaves the row index out of
# the file.
df.to_csv(path_or_buf='World_GDP.csv', index=False)