## Web Scraping of Country GDP From Wikipedia.

## Import libraries

In [5]:
from bs4 import BeautifulSoup
import requests
from csv import writer
import pandas as pd

### Check if connection is successful

In [2]:
url = "https://en.m.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
page = requests.get(url)
page

<Response [200]>

### Extracting page content using html Parser

In [3]:
soup = BeautifulSoup(page.content, "html.parser")
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of countries by GDP (nominal) - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"8735acc7-b03e-423d-83ac-163aba3c4067","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_countries_by_GDP_(nominal)","wgTitle":"List of countries by GDP (nominal)","wgCurRevisionId":1107211592,"wgRevisionId":1107211592,"wgArticleId":380845,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"List_of

### Extracting the GDP table using the table tag with the  wikitable... class

In [4]:
table = soup.find("table", class_ = "wikitable")
print(table.prettify())

<table border="1" class="wikitable sortable static-row-numbers plainrowheaders srn-white-background" style="text-align:right;">
 <caption>
  GDP (US$ million) by country
 </caption>
 <tbody>
  <tr class="static-row-header" style="text-align:center;vertical-align:bottom;">
   <th rowspan="2">
    Country/Territory
   </th>
   <th rowspan="2">
    <a href="/wiki/United_Nations_geoscheme" title="United Nations geoscheme">
     UN Region
    </a>
   </th>
   <th colspan="2">
    <a href="/wiki/International_Monetary_Fund" title="International Monetary Fund">
     IMF
    </a>
    <sup class="reference" id="cite_ref-GDP_IMF_1-2">
     <a href="#cite_note-GDP_IMF-1">
      [1]
     </a>
    </sup>
    <sup class="reference" id="cite_ref-14">
     <a href="#cite_note-14">
      [13]
     </a>
    </sup>
   </th>
   <th colspan="2">
    <a href="/wiki/United_Nations" title="United Nations">
     United Nations
    </a>
    <sup class="reference" id="cite_ref-UN_15-0">
     <a href="#cite_note-

### Extracting and Inspecting each table row

In [5]:
# Extracting all the table row using the <tr> tag.
rows = table.find_all("tr")


# Extracting all the table row data  using the <td> and <th> tags.
for row in rows[2:]:
    cells = row.find_all(["td", "th"])
    
    cells_text = [cell.get_text(strip = True) for cell in cells]
    print(cells_text)

['World', '—', '93,863,851', '2021', '87,461,674', '2020', '96,100,091', '2021']
['United States', 'Americas', '25,346,805', '2022', '20,893,746', '2020', '22,996,100', '2021']
['China', 'Asia', '19,911,593', '[n 2]2022', '14,722,801', '[n 3]2020', '17,734,063', '2021']
['Japan', 'Asia', '4,912,147', '2022', '5,057,759', '2020', '4,937,422', '2021']
['Germany', 'Europe', '4,256,540', '2022', '3,846,414', '2020', '4,223,116', '2021']
['India', 'Asia', '3,534,743', '2022', '2,664,749', '2020', '3,173,398', '2021']
['United Kingdom', 'Europe', '3,376,003', '2022', '2,764,198', '2020', '3,186,860', '2021']
['France', 'Europe', '2,936,702', '2022', '2,630,318', '2020', '2,937,473', '2021']
['Canada', 'Americas', '2,221,218', '2022', '1,644,037', '2020', '1,990,762', '2021']
['Italy', 'Europe', '2,058,330', '2022', '1,888,709', '2020', '2,099,880', '2021']
['Brazil', 'Americas', '1,833,274', '2022', '1,444,733', '2020', '1,608,981', '2021']
['Russia', 'Europe', '1,829,050', '2022', '1,483,49

### Converting to a .csv file

In [6]:
with open("countries_GDP.csv", "w", encoding = "utf8", newline = '') as f:
    thewriter = writer(f)
    header = ["Country/Territory", "Subregion", "Region", "IMF_Estimate", "IMF_Year", "UN_Estimate", "UN_Year", "WB_Estimate", "WB_Year"]
    thewriter.writerow(header)
    
    for row in rows[2:]:
        cells = row.find_all(["td", "th"])
        
        cells_text = [cell.get_text(strip = True) for cell in cells]
        thewriter.writerow(cells_text)

### Confirm by importing data

In [11]:
data = pd.read_csv("countries_GDP.csv")
data.head(10)

Unnamed: 0,Country/Territory,Subregion,Region,IMF_Estimate,IMF_Year,UN_Estimate,UN_Year,WB_Estimate,WB_Year
0,World,—,93863851,2021,87461674,2020,96100091,2021,
1,United States,Americas,25346805,2022,20893746,2020,22996100,2021,
2,China,Asia,19911593,[n 2]2022,14722801,[n 3]2020,17734063,2021,
3,Japan,Asia,4912147,2022,5057759,2020,4937422,2021,
4,Germany,Europe,4256540,2022,3846414,2020,4223116,2021,
5,India,Asia,3534743,2022,2664749,2020,3173398,2021,
6,United Kingdom,Europe,3376003,2022,2764198,2020,3186860,2021,
7,France,Europe,2936702,2022,2630318,2020,2937473,2021,
8,Canada,Americas,2221218,2022,1644037,2020,1990762,2021,
9,Italy,Europe,2058330,2022,1888709,2020,2099880,2021,


In [9]:
data.tail(10)

Unnamed: 0,Country/Territory,Subregion,Region,IMF_Estimate,IMF_Year,UN_Estimate,UN_Year,WB_Estimate,WB_Year
207,São Tomé and Príncipe,Africa,526,2022,476,2020,547.0,2021.0,
208,Micronesia,Oceania,427,2022,403,2020,404.0,2021.0,
209,Cook Islands,Oceania,—,283,2020,—,,,
210,Marshall Islands,Oceania,267,2022,244,2020,249.0,2021.0,
211,Anguilla,Americas,—,258,2020,—,,,
212,Palau,Oceania,244,2022,264,2020,258.0,2020.0,
213,Kiribati,Oceania,216,2022,181,2020,181.0,2020.0,
214,Nauru,Oceania,134,2022,135,2020,133.0,2021.0,
215,Montserrat,Americas,—,68,2020,—,,,
216,Tuvalu,Oceania,66,2022,55,2020,63.0,2021.0,
