# Extracting Sovereign State information from Wikipedia

## Creating a dataframe

In [158]:
import pandas as pd
# Create an empty dataframe with the desired columns

columns = ['Country', 'Link', 'Description','Status', 'Capital','Total area' ,'Water area %','Population Density','Population - 2022 estimate','Official languages', 'Ethnic groups', 'Religion', 'Demonym', 'Government','GDP (PPP) - total','GDP (PPP) - per capita', 'GDP (nominal) - total','GDP (nominal) - per capita' ,'HDI','Gini' ,'Currency', 'Time zone','Date format', 'Driving side', 'Calling code', 'ISO code', 'Internet TLD']
df = pd.DataFrame(columns=columns)

## Extracting the sovereign state list with corresponding wikipedia links

In [165]:
import requests
from bs4 import BeautifulSoup
import re


# Make a GET request to the Wikipedia page
url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states'
response = requests.get(url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Find the table containing the list of sovereign states
table = soup.find(class_ = 'sortable wikitable')

# Extract the names of the countries and their corresponding href links
countries = {}
for row in table.find_all('tr')[3:]:
    cells = row.find_all('td')

    names = cells[0].text
    name = re.split(r"\s*[→–]\s*", names)[0]
    name = name.split(",")[0]
    name = name.split("[")[0]
    name = name.replace("ZZZ","").replace("\xa0","").replace("\n","")
    
    link = cells[0].find('a')
    if link is not None:
            link = link['href']
            countries[name] = link


countries = {k: v for k, v in countries.items() if not v.startswith('#')}
del countries['↑ UN member states and General Assembly observer states ↑']
print(countries)


{'Abkhazia': '/wiki/Abkhazia', 'Afghanistan': '/wiki/Afghanistan', 'Albania': '/wiki/Albania', 'Algeria': '/wiki/Algeria', 'Andorra': '/wiki/Andorra', 'Angola': '/wiki/Angola', 'Antigua and Barbuda': '/wiki/Antigua_and_Barbuda', 'Argentina': '/wiki/Argentina', 'Armenia': '/wiki/Armenia', 'Artsakh': '/wiki/Republic_of_Artsakh', 'Australia': '/wiki/Australia', 'Austria': '/wiki/Austria', 'Azerbaijan': '/wiki/Azerbaijan', 'Bahamas': '/wiki/The_Bahamas', 'Bahrain': '/wiki/Bahrain', 'Bangladesh': '/wiki/Bangladesh', 'Barbados': '/wiki/Barbados', 'Belarus': '/wiki/Belarus', 'Belgium': '/wiki/Belgium', 'Belize': '/wiki/Belize', 'Benin': '/wiki/Benin', 'Bhutan': '/wiki/Bhutan', 'Bolivia': '/wiki/Bolivia', 'Bosnia and Herzegovina': '/wiki/Bosnia_and_Herzegovina', 'Botswana': '/wiki/Botswana', 'Brazil': '/wiki/Brazil', 'Brunei': '/wiki/Brunei', 'Bulgaria': '/wiki/Bulgaria', 'Burkina Faso': '/wiki/Burkina_Faso', 'Burundi': '/wiki/Burundi', 'Cambodia': '/wiki/Cambodia', 'Cameroon': '/wiki/Cameroon

## Exctracting information for each country and saving it to the dataframe

In [162]:
# Loop over the dictionary keys and values
for key, value in countries.items():
#import itertools

#for key, value in itertools.islice(countries.items(), 2):
    # Make a GET request to the website

    country_request = requests.get(('https://en.wikipedia.org' + value))

    website_soup = BeautifulSoup(country_request.content, 'html.parser')
    
    # Find the "mw-parser-output" class
    mw_content_container = website_soup.find('div', class_='mw-content-container')
    first_h2 = mw_content_container.find('h2')
    paragraphs = []
    sibling = first_h2.previous_sibling

    while sibling is not None and sibling.name != 'h2':
        if sibling.name == 'p':
            paragraphs.append(sibling)
        sibling = sibling.previous_sibling
        
    description = ''
    # Extract the text content of each paragraph
    for paragraph in reversed(paragraphs):
        text = paragraph.text.strip()
        description += text
       
     

    # Initialize variables for all the columns you want to extract
    status = ''
    capital = ''
    languages = ''
    ethnic_groups = ''
    religion = ''
    demonym = ''
    government = ''
    hdi = ''
    currency = ''
    time_zone = ''
    driving_side = ''
    calling_code = ''
    iso_code = ''
    internet_tld = ''
    total_area = ''
    water = ''
    population_density =''
    population = ''
    GDPppp = ''
    GDPppp_pc = ''
    GDPnominal = ''
    GDPnominal_pc = ''
    date_format = ''
    gini = ''


    
    # Find the "mw-parser-output" class
    infobox_table = website_soup.find('table', class_='infobox ib-country vcard')

    # Find all table rows (tr) in the infobox table
    rows = infobox_table.find_all('tr')


    # Loop over each row in the info table
    for row in infobox_table.find_all('tr'):
        header = row.find('th', class_='infobox-label')
        data = row.find('td', class_='infobox-data')

        # Check the header and extract data for the corresponding column
        if header and 'Status' in header.text:
            status = data.text.strip()
        elif header and 'Date format' in header.text:
            date_format = data.text.strip()
        elif header and 'Gini' in header.text:
            gini = data.text.strip() 
        elif header and 'Water' in header.text:
            water = data.text.strip()
            previous_row = row.find_previous_sibling('tr')
            previous_row_cells = previous_row.find_all(['th', 'td'])
            total_area = previous_row_cells[1].text.strip()
        elif header and 'Density' in header.text:
            population_density = data.text.strip()
            previous_row = row.find_previous_sibling('tr')
            previous_row_cells = previous_row.find_all(['th', 'td'])
            population = previous_row_cells[1].text.strip()   
        elif header and 'GDP' in header.text and 'PPP' in header.text:
            next_row = row.find_next_sibling('tr')
            next_row_cells = next_row.find_all(['th', 'td'])
            GDPppp = next_row_cells[1].text.strip()
            nnext_row = next_row.find_next_sibling('tr')
            nnext_row_cells = nnext_row.find_all(['th', 'td'])
            GDPppp_pc = nnext_row_cells[1].text.strip()
        elif header and 'GDP' in header.text and 'nominal' in header.text:
            next_row = row.find_next_sibling('tr')
            next_row_cells = next_row.find_all(['th', 'td'])
            GDPnominal = next_row_cells[1].text.strip()
            nnext_row = next_row.find_next_sibling('tr')
            nnext_row_cells = nnext_row.find_all(['th', 'td'])
            GDPnominal_pc = nnext_row_cells[1].text.strip()
        elif header and 'Capital' in header.text:
            capital = data.text.strip()
        elif header and 'language' in header.text:
            languages = data.text.strip()
        elif header and 'Ethnic' in header.text:
            ethnic_groups = data.text.strip()
        elif header and 'Religion' in header.text:
            religion = data.text.strip()
        elif header and 'Demonym' in header.text:
            demonym = data.text.strip()
        elif header and 'Government' in header.text:
            government = data.text.strip()
        elif header and 'HDI' in header.text:
            hdi = data.text.strip()
        elif header and 'Currency' in header.text:
            currency = data.text.strip()
        elif header and 'Time' in header.text:
            time_zone = data.text.strip()
        elif header and 'Driving' in header.text:
            driving_side = data.text.strip()
        elif header and 'Calling' in header.text:
            calling_code = data.text.strip()
        elif header and 'ISO' in header.text:
            iso_code = data.text.strip()
        elif header and 'Internet' in header.text:
            internet_tld = data.text.strip()



        # Append the extracted information to the dataframe
    df = df.append({
            'Country': key,
            'Link': 'https://en.wikipedia.org' + value,
            'Status': status,
            'Capital': capital,
            'Official languages': languages,
            'Ethnic groups': ethnic_groups,
            'Religion': religion,
            'Demonym': demonym,
            'Government': government,
            'HDI': hdi,
            'Currency': currency,
            'Time zone': time_zone,
            'Driving side': driving_side,
            'Calling code': calling_code,
            'ISO 3166 code': iso_code,
            'Internet TLD': internet_tld,
            'Total area' : total_area,
            'Water area %' : water,
            'Population Density' : population_density,
            'Population - 2022 estimate' : population,
            'GDP (PPP) - total' : GDPppp,
            'GDP (PPP) - per capita' : GDPppp_pc,
            'GDP (nominal) - total' : GDPnominal,
            'GDP (nominal) - per capita' : GDPnominal_pc,
            'Date format' : date_format, 
            'Gini' : gini,
            'Country': key,
            'Link': 'https://en.wikipedia.org' + value,
            'Description': description

        }, ignore_index=True)

  

## The Dataframe

In [166]:
df

Unnamed: 0,Country,Link,Description,Status,Capital,Total area,Water area %,Population Density,Population - 2022 estimate,Official languages,...,HDI,Gini,Currency,Time zone,Date format,Driving side,Calling code,ISO code,Internet TLD,ISO 3166 code
0,Abkhazia,https://en.wikipedia.org/wiki/Abkhazia,"Abkhazia[n 1] (/æbˈkɑːziə/ (listen)),[5] offic...",Recognised by 5 out of 193 member states of th...,Sukhumi43°00′N 40°59′E﻿ / ﻿43.000°N 40.983°E﻿ ...,,,28.3/km2 (73.3/sq mi),240705,AbkhazRussianGeorgianMingrelianSvanArmenian,...,,,Abkhazian apsarRussian rublec (RUB),UTC+3 (MSK),,right,+7 840 / 940 and +995 44[3][4],,,
1,Afghanistan,https://en.wikipedia.org/wiki/Afghanistan,"Afghanistan,[c] officially the Islamic Emirate...",UN member state under an unrecognized government,Kabul34°31′N 69°11′E﻿ / ﻿34.517°N 69.183°E﻿ / ...,"652,867[16] km2 (252,073 sq mi) (40th)",negligible,48.08/km2 (124.5/sq mi) (174th),"38,346,720[17] (37th)",PashtoDari,...,0.478[19][20]low · 180th,,Afghani (افغانی) (AFN),UTC+4:30Lunar Calendar[21] (Afghanistan Time),,right,+93,,.afافغانستان.,AF
2,Abkhazia,https://en.wikipedia.org/wiki/Abkhazia,"Abkhazia[n 1] (/æbˈkɑːziə/ (listen)),[5] offic...",Recognised by 5 out of 193 member states of th...,Sukhumi43°00′N 40°59′E﻿ / ﻿43.000°N 40.983°E﻿ ...,,,28.3/km2 (73.3/sq mi),240705,AbkhazRussianGeorgianMingrelianSvanArmenian,...,,,Abkhazian apsarRussian rublec (RUB),UTC+3 (MSK),,right,+7 840 / 940 and +995 44[3][4],,,
3,Afghanistan,https://en.wikipedia.org/wiki/Afghanistan,"Afghanistan,[c] officially the Islamic Emirate...",UN member state under an unrecognized government,Kabul34°31′N 69°11′E﻿ / ﻿34.517°N 69.183°E﻿ / ...,"652,867[16] km2 (252,073 sq mi) (40th)",negligible,48.08/km2 (124.5/sq mi) (174th),"38,346,720[17] (37th)",PashtoDari,...,0.478[19][20]low · 180th,,Afghani (افغانی) (AFN),UTC+4:30Lunar Calendar[21] (Afghanistan Time),,right,+93,,.afافغانستان.,AF
4,Albania,https://en.wikipedia.org/wiki/Albania,"Albania (/ælˈbeɪniə, ɔːl-/ (listen) a(w)l-BAY-...",,Tirana41°19′N 19°49′E﻿ / ﻿41.317°N 19.817°E﻿ /...,"28,748 km2 (11,100 sq mi) (140th)",4.7,97/km2 (251.2/sq mi),"2,821,977[3]",GreekAromanianMacedonian,...,0.796[6]high · 67th,34.3[5]medium,Lek (ALL),UTC+1 (CET),dd.mm.yyyy,right,+355,,.al,AL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,Vietnam,https://en.wikipedia.org/wiki/Vietnam,Coordinates: 16°N 108°E﻿ / ﻿16°N 108°E﻿ / 16; ...,,Hanoi21°2′N 105°51′E﻿ / ﻿21.033°N 105.850°E﻿ /...,"331,212 km2 (127,882 sq mi) (66th)",6.38,295.0/km2 (764.0/sq mi) (29th),"96,208,984[2]",Vietnamese (de facto)[n 1],...,0.703[8]high · 115th,35.7[7]medium,đồng (₫) (VND),UTC+07:00 (Vietnam Standard Time),dd/mm/yyyy,right,+84,,.vn,VN
200,Yemen,https://en.wikipedia.org/wiki/Yemen,"Yemen (/ˈjɛmən/ (listen); Arabic: ٱلْيَمَن, ro...",Yemeni Civil WarPresidential Leadership Counci...,Aden[n 2],"555,000[3] km2 (214,000 sq mi) (49th)",negligible,44.7/km2 (115.8/sq mi) (160th),"30,984,689[4] (48th)",Arabic[2],...,0.455[7]low · 183th,36.7[6]medium,Yemeni rial (YER),UTC+3 (AST),,right[8],+967,,".ye, اليمن.",YE
201,Zambia,https://en.wikipedia.org/wiki/Zambia,Coordinates: 15°S 30°E﻿ / ﻿15°S 30°E﻿ / -15; 3...,,Lusaka15°25′S 28°17′E﻿ / ﻿15.417°S 28.283°E﻿ /...,"752,617 km2 (290,587 sq mi)[2] (38th)",1,26.1/km2 (67.6/sq mi),"19,610,769[3] (63rd)",List\n 28.5% Bemba\n 13.8% Nyanja\n 11.4% T...,...,0.565[6]medium · 154th,57.1[5]high,Zambian kwacha (ZMW),UTC+2 (CAT),dd/mm/yyyy,left,+260,,.zm,ZM
202,Zimbabwe,https://en.wikipedia.org/wiki/Zimbabwe,Coordinates: 19°S 30°E﻿ / ﻿19°S 30°E﻿ / -19; 3...,,Harare17°49′45″S 31°03′08″E﻿ / ﻿17.82917°S 31....,"390,757 km2 (150,872 sq mi) (60th)",1,39/km2 (101.0/sq mi),"15,178,979[9]","16 languages:[4] ChewaChibarweEnglishKalanga""K...",...,0.593[12]medium · 146th,50.3[11]high,Zimbabwean dollarU.S. dollar ($) (USD)[13]Sout...,UTC+2 (CAT[14]),dd/mm/yyyy,left,+263,,.zw,ZW


## Saving the dataframe

In [163]:
df.to_csv('countries_data.csv', index=False)

In [164]:
df.to_excel('countries.xlsx', index=False)