In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# save page in html
url = "https://www.worldometers.info/population/countries-in-the-eu-by-population/"

data = requests.get(url)
    
with open("C:/MKB_datalab/MKB_repository/Masterclasses/01-webscraping/data/webpages/world.html", "w+") as f:
    f.write(str(data.text.encode('utf8')))


In [3]:
def parse_html_file(path):
    """ 
    Parse content of a html file
    
    Args:
        path (str): path to the html file
        
    Return:
        parser: representation of the document as a nested data structure.
    """
    
    with open(path, 'r') as f:
        contents = f.read()

        parser = BeautifulSoup(contents, 'lxml')
    
    return parser

In [4]:
# parse html file
parser = parse_html_file("C:/MKB_datalab/MKB_repository/Masterclasses/01-webscraping/data/webpages/world.html")

In [5]:
print(parser.prettify())

<html>
 <body>
  <p>
   b'\n
   <!DOCTYPE html>
   <!--[if IE 8]> <html lang="en" class="ie8"> <![endif]-->
   <!--[if IE 9]> <html lang="en" class="ie9"> <![endif]-->
   <!--[if !IE]><!-->
   <!--<![endif]-->
  </p>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   Countries in the EU by Population (2022) - Worldometer
  </title>
  <meta content="List of countries the European Union ranked by population, from the most populous. Growth rate, median age, fertility rate, area, density, population density, urbanization, urban population, share of world population." name="description"/>
  <link href="/favicon/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
  <link href="/favicon/apple-icon-57x57.png" rel="apple-touch-icon" sizes="57x57"/>
  <link href="/favicon/apple-icon-60x60.png" rel="apple-touch-icon" sizes="60x60"/>
  <link href="/favicon/apple-icon-72x72.png" r

In [6]:
# Obtain information from tag <table>
table = parser.find('table', id='example2')
table

<table cellspacing="0" class="table table-striped table-bordered" id="example2" width="100%"> <thead> <tr> <th>#</th> <th>Country (or dependency)</th> <th>Population<br/> (2020)</th> <th>Yearly<br/> Change</th> <th>Net<br/> Change</th> <th>Density<br/> (P/Km\xc2\xb2)</th> <th>Land Area<br/> (Km\xc2\xb2)</th> <th>Migrants<br/> (net)</th> <th>Fert.<br/> Rate</th> <th>Med.<br/> Age</th> <th>Urban<br/> Pop %</th> <th>World<br/> Share</th> </tr> </thead> <tbody> <tr> <td>1</td> <td style="font-weight: bold; font-size:15px; text-align:left"><a href="/world-population/germany-population/">Germany</a></td> <td style="font-weight: bold;">83,783,942</td> <td>0.32 %</td> <td>266,897</td> <td>240</td> <td>348,560</td> <td>543,822</td> <td>1.6</td> <td>46</td> <td>76 %</td> <td>1.07 %</td> </tr> <tr> <td>2</td> <td style="font-weight: bold; font-size:15px; text-align:left"><a href="/world-population/france-population/">France</a></td> <td style="font-weight: bold;">65,273,511</td> <td>0.22 %</td> <

In [7]:
print(table.prettify())

<table cellspacing="0" class="table table-striped table-bordered" id="example2" width="100%">
 <thead>
  <tr>
   <th>
    #
   </th>
   <th>
    Country (or dependency)
   </th>
   <th>
    Population
    <br/>
    (2020)
   </th>
   <th>
    Yearly
    <br/>
    Change
   </th>
   <th>
    Net
    <br/>
    Change
   </th>
   <th>
    Density
    <br/>
    (P/Km\xc2\xb2)
   </th>
   <th>
    Land Area
    <br/>
    (Km\xc2\xb2)
   </th>
   <th>
    Migrants
    <br/>
    (net)
   </th>
   <th>
    Fert.
    <br/>
    Rate
   </th>
   <th>
    Med.
    <br/>
    Age
   </th>
   <th>
    Urban
    <br/>
    Pop %
   </th>
   <th>
    World
    <br/>
    Share
   </th>
  </tr>
 </thead>
 <tbody>
  <tr>
   <td>
    1
   </td>
   <td style="font-weight: bold; font-size:15px; text-align:left">
    <a href="/world-population/germany-population/">
     Germany
    </a>
   </td>
   <td style="font-weight: bold;">
    83,783,942
   </td>
   <td>
    0.32 %
   </td>
   <td>
    266,897
   </td>


In [8]:
# Obtain column names within tag <th> with attribute col
list_col = table.find_all('th')
list_col = [item.text.strip() for item in list_col]
list_col

['#',
 'Country (or dependency)',
 'Population (2020)',
 'Yearly Change',
 'Net Change',
 'Density (P/Km\\xc2\\xb2)',
 'Land Area (Km\\xc2\\xb2)',
 'Migrants (net)',
 'Fert. Rate',
 'Med. Age',
 'Urban Pop %',
 'World Share']

In [9]:
# Create a dataframe
EU_population_data = pd.DataFrame(columns = list_col)

In [10]:
# Create a for loop to fill EU_population_data
for j in table.find_all('tr')[1:]:
    row_data = j.find_all('td')
    row = [i.text for i in row_data]
    length = len(EU_population_data)
    EU_population_data.loc[length] = row

In [11]:
EU_population_data

Unnamed: 0,#,Country (or dependency),Population (2020),Yearly Change,Net Change,Density (P/Km\xc2\xb2),Land Area (Km\xc2\xb2),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,1,Germany,83783942,0.32 %,266897,240,348560,543822,1.6,46,76 %,1.07 %
1,2,France,65273511,0.22 %,143783,119,547557,36527,1.9,42,82 %,0.84 %
2,3,Italy,60461826,-0.15 %,-88249,206,294140,148943,1.3,47,69 %,0.78 %
3,4,Spain,46754778,0.04 %,18002,94,498800,40000,1.3,45,80 %,0.60 %
4,5,Poland,37846611,-0.11 %,-41157,124,306230,-29395,1.4,42,60 %,0.49 %
5,6,Romania,19237691,-0.66 %,-126866,84,230170,-73999,1.6,43,55 %,0.25 %
6,7,Netherlands,17134872,0.22 %,37742,508,33720,16000,1.7,43,92 %,0.22 %
7,8,Belgium,11589623,0.44 %,50295,383,30280,48000,1.7,42,98 %,0.15 %
8,9,Czech Republic (Czechia),10708981,0.18 %,19772,139,77240,22011,1.6,43,74 %,0.14 %
9,10,Greece,10423054,-0.48 %,-50401,81,128900,-16000,1.3,46,85 %,0.13 %


In [12]:
# save in csv
EU_population_data.to_csv("C:/MKB_datalab/MKB_repository/Masterclasses/01-webscraping/data/processed/world_table.csv", index = False )