# Introduction

In this notebook, we use web scraping principles to compile a data frame featuring information on countries worldwide, including some entities that may not be traditionally classified as countries. Our data is sourced from <a href='https://www.scrapethissite.com/pages/simple/'>www.scrapethissite.com</a>
 

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
url = 'https://www.scrapethissite.com/pages/simple/'
html_page = requests.get(url)

soup = BeautifulSoup(html_page.content, 'html.parser')
soup = BeautifulSoup(soup.prettify(), 'html.parser')
print(soup)

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<title>
   Countries of the World: A Simple Example | Scrape This Site | A public sandbox for learning web scraping
  </title>
<link href="/static/images/scraper-icon.png" rel="icon" type="image/png"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="A single page that lists information about all the countries in the world. Good for those just get started with web scraping." name="description"/>
<link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" rel="stylesheet"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,700" rel="stylesheet" type="text/css"/>
<link href="/static/css/styles.css" rel="stylesheet" type="text/css"/>
<meta content="noindex" name="robots"/>


In [3]:
#Creating a data frame and it's headers
headers = ['country', 'capital', 'population']
df = pd.DataFrame(columns=headers)
df

Unnamed: 0,country,capital,population


In [4]:
countries_content = soup.find_all(class_ = 'country-name')
print(countries_content)

[<h3 class="country-name">
<i class="flag-icon flag-icon-ad">
</i>
        Andorra
       </h3>, <h3 class="country-name">
<i class="flag-icon flag-icon-ae">
</i>
        United Arab Emirates
       </h3>, <h3 class="country-name">
<i class="flag-icon flag-icon-af">
</i>
        Afghanistan
       </h3>, <h3 class="country-name">
<i class="flag-icon flag-icon-ag">
</i>
        Antigua and Barbuda
       </h3>, <h3 class="country-name">
<i class="flag-icon flag-icon-ai">
</i>
        Anguilla
       </h3>, <h3 class="country-name">
<i class="flag-icon flag-icon-al">
</i>
        Albania
       </h3>, <h3 class="country-name">
<i class="flag-icon flag-icon-am">
</i>
        Armenia
       </h3>, <h3 class="country-name">
<i class="flag-icon flag-icon-ao">
</i>
        Angola
       </h3>, <h3 class="country-name">
<i class="flag-icon flag-icon-aq">
</i>
        Antarctica
       </h3>, <h3 class="country-name">
<i class="flag-icon flag-icon-ar">
</i>
        Argentina
       </h3>, <h3 c

In [5]:
#extract country names
country = [name.text.strip() for name in countries_content]
print(country)

['Andorra', 'United Arab Emirates', 'Afghanistan', 'Antigua and Barbuda', 'Anguilla', 'Albania', 'Armenia', 'Angola', 'Antarctica', 'Argentina', 'American Samoa', 'Austria', 'Australia', 'Aruba', 'Åland', 'Azerbaijan', 'Bosnia and Herzegovina', 'Barbados', 'Bangladesh', 'Belgium', 'Burkina Faso', 'Bulgaria', 'Bahrain', 'Burundi', 'Benin', 'Saint Barthélemy', 'Bermuda', 'Brunei', 'Bolivia', 'Bonaire', 'Brazil', 'Bahamas', 'Bhutan', 'Bouvet Island', 'Botswana', 'Belarus', 'Belize', 'Canada', 'Cocos [Keeling] Islands', 'Democratic Republic of the Congo', 'Central African Republic', 'Republic of the Congo', 'Switzerland', 'Ivory Coast', 'Cook Islands', 'Chile', 'Cameroon', 'China', 'Colombia', 'Costa Rica', 'Cuba', 'Cape Verde', 'Curacao', 'Christmas Island', 'Cyprus', 'Czech Republic', 'Germany', 'Djibouti', 'Denmark', 'Dominica', 'Dominican Republic', 'Algeria', 'Ecuador', 'Estonia', 'Egypt', 'Western Sahara', 'Eritrea', 'Spain', 'Ethiopia', 'Finland', 'Fiji', 'Falkland Islands', 'Micron

In [6]:
#extract capital cities 
capital_row = soup.find_all(class_ = 'country-capital')
capital_city = [name.text.strip() for name in capital_row]
print(capital_city)

['Andorra la Vella', 'Abu Dhabi', 'Kabul', "St. John's", 'The Valley', 'Tirana', 'Yerevan', 'Luanda', 'None', 'Buenos Aires', 'Pago Pago', 'Vienna', 'Canberra', 'Oranjestad', 'Mariehamn', 'Baku', 'Sarajevo', 'Bridgetown', 'Dhaka', 'Brussels', 'Ouagadougou', 'Sofia', 'Manama', 'Bujumbura', 'Porto-Novo', 'Gustavia', 'Hamilton', 'Bandar Seri Begawan', 'Sucre', 'Kralendijk', 'Brasília', 'Nassau', 'Thimphu', 'None', 'Gaborone', 'Minsk', 'Belmopan', 'Ottawa', 'West Island', 'Kinshasa', 'Bangui', 'Brazzaville', 'Bern', 'Yamoussoukro', 'Avarua', 'Santiago', 'Yaoundé', 'Beijing', 'Bogotá', 'San José', 'Havana', 'Praia', 'Willemstad', 'Flying Fish Cove', 'Nicosia', 'Prague', 'Berlin', 'Djibouti', 'Copenhagen', 'Roseau', 'Santo Domingo', 'Algiers', 'Quito', 'Tallinn', 'Cairo', 'Laâyoune / El Aaiún', 'Asmara', 'Madrid', 'Addis Ababa', 'Helsinki', 'Suva', 'Stanley', 'Palikir', 'Tórshavn', 'Paris', 'Libreville', 'London', "St. George's", 'Tbilisi', 'Cayenne', 'St Peter Port', 'Accra', 'Gibraltar', '

In [7]:
#extract country population
population_row = soup.find_all(class_ = 'country-population')
population = [number.text.strip() for number in population_row]
print(population)

['84000', '4975593', '29121286', '86754', '13254', '2986952', '2968000', '13068161', '0', '41343201', '57881', '8205000', '21515754', '71566', '26711', '8303512', '4590000', '285653', '156118464', '10403000', '16241811', '7148785', '738004', '9863117', '9056010', '8450', '65365', '395027', '9947418', '18012', '201103330', '301790', '699847', '0', '2029307', '9685000', '314522', '33679000', '628', '70916439', '4844927', '3039126', '7581000', '21058798', '21388', '16746491', '19294149', '1330044000', '47790000', '4516220', '11423000', '508659', '141766', '1500', '1102677', '10476000', '81802257', '740528', '5484000', '72813', '9823821', '34586184', '14790608', '1291170', '80471869', '273008', '5792984', '46505963', '88013491', '5244000', '875983', '2638', '107708', '48228', '64768389', '1545255', '62348447', '107818', '4630000', '195506', '65228', '24339838', '27884', '56375', '1593256', '10324025', '443000', '1014999', '11000000', '30', '13550440', '159358', '1565126', '748486', '689868

In [8]:
#populate the data frame
df['country'] = country
df['capital'] = capital_city
df['population'] = population
df

Unnamed: 0,country,capital,population
0,Andorra,Andorra la Vella,84000
1,United Arab Emirates,Abu Dhabi,4975593
2,Afghanistan,Kabul,29121286
3,Antigua and Barbuda,St. John's,86754
4,Anguilla,The Valley,13254
...,...,...,...
245,Yemen,Sanaa,23495361
246,Mayotte,Mamoudzou,159042
247,South Africa,Pretoria,49000000
248,Zambia,Lusaka,13460305


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   country     250 non-null    object
 1   capital     250 non-null    object
 2   population  250 non-null    object
dtypes: object(3)
memory usage: 6.0+ KB


In [10]:
#change the population data type from object to integer
df['population'] = df['population'].astype(int)
print(df.dtypes)

country       object
capital       object
population     int32
dtype: object


In [11]:
#sort by population
#Some of these are not actual countries but territories.
df = df.sort_values('population')
df

Unnamed: 0,country,capital,population
231,U.S. Minor Outlying Islands,,0
8,Antarctica,,0
33,Bouvet Island,,0
95,Heard Island and McDonald Islands,,0
89,South Georgia and the South Sandwich Islands,Grytviken,30
...,...,...,...
30,Brazil,Brasília,201103330
100,Indonesia,Jakarta,242968342
232,United States,Washington,310232863
104,India,New Delhi,1173108018


In [12]:
#create a csv file from the data frame
df.to_csv(r'C:\Users\Client\OneDrive\Web Scrapping\World Countries\WorldCountries.csv', index=False)