## Scraping a table

In [1]:
import requests
import pandas as pd

from bs4 import BeautifulSoup

In [2]:
url = "https://www.worldometers.info/world-population/"

In [3]:
page = requests.get(url)
page

<Response [200]>

In [4]:
soup = BeautifulSoup(page.text, 'lxml')

In [5]:
#Subsets the HTML to only get the HTML of our table needed
table = soup.find('table', class_ = 'table table-striped table-bordered table-hover table-condensed table-list')

In [6]:
table.find_all('th')

[<th>Year<br/> (July 1) </th>,
 <th>Population</th>,
 <th>Yearly % <br/> Change</th>,
 <th>Yearly<br/> Change</th>,
 <th>Median <br/>Age</th>,
 <th>Fertility <br/>Rate</th>,
 <th>Density <br/>(P/Km²)</th>,
 <th>Urban<br/> Pop %</th>,
 <th>Urban Population</th>]

In [7]:
#Gets all the column headers of  table
headers = []
for i in table.find_all('th'):
    title = i.text.strip()
    headers.append(title)


In [8]:
headers

['Year (July 1)',
 'Population',
 'Yearly %  Change',
 'Yearly Change',
 'Median Age',
 'Fertility Rate',
 'Density (P/Km²)',
 'Urban Pop %',
 'Urban Population']

In [9]:
#Creates a dataframe using the column headers from our table
df = pd.DataFrame(columns = headers)
df

Unnamed: 0,Year (July 1),Population,Yearly % Change,Yearly Change,Median Age,Fertility Rate,Density (P/Km²),Urban Pop %,Urban Population


In [10]:
table.find_all('tr')[1:]

[<tr> <td>2020</td> <td><strong>7,794,798,739</strong></td> <td>1.05 %</td> <td>81,330,639</td> <td>30.9</td> <td>2.47</td> <td>52</td> <td>56.2 %</td> <td>4,378,993,944</td> </tr>,
 <tr> <td>2019</td> <td><strong>7,713,468,100</strong></td> <td>1.08 %</td> <td>82,377,060</td> <td>29.8</td> <td>2.51</td> <td>52</td> <td>55.7 %</td> <td>4,299,438,618</td> </tr>,
 <tr> <td>2018</td> <td><strong>7,631,091,040</strong></td> <td>1.10 %</td> <td>83,232,115</td> <td>29.8</td> <td>2.51</td> <td>51</td> <td>55.3 %</td> <td>4,219,817,318</td></tr>,
 <tr> <td>2017</td> <td><strong>7,547,858,925</strong></td> <td>1.12 %</td> <td>83,836,876</td> <td>29.8</td> <td>2.51</td> <td>51</td> <td>54.9 %</td> <td>4,140,188,594</td> </tr>,
 <tr> <td>2016</td> <td><strong>7,464,022,049</strong></td> <td>1.14 %</td> <td>84,224,910</td> <td>29.8</td> <td>2.51</td> <td>50</td> <td>54.4 %</td> <td>4,060,652,683</td> </tr>,
 <tr> <td>2015</td> <td><strong>7,379,797,139</strong></td> <td>1.19 %</td> <td>84,594,707<

In [11]:
for i in table.find_all('tr')[1:]:
    row_data = i.find_all('td')
    break

row_data

[<td>2020</td>,
 <td><strong>7,794,798,739</strong></td>,
 <td>1.05 %</td>,
 <td>81,330,639</td>,
 <td>30.9</td>,
 <td>2.47</td>,
 <td>52</td>,
 <td>56.2 %</td>,
 <td>4,378,993,944</td>]

In [12]:
for i in table.find_all('tr')[1:]:
    row_data = i.find_all('td')
    row = [tr.text for tr in row_data]
    length = len(df)
    df.loc[length] = row

In [13]:
row

['1955',
 '2,773,019,936',
 '1.80 %',
 '47,317,757',
 '23',
 '4.97',
 '19',
 'N.A.',
 'N.A.']

In [15]:
df.head()

Unnamed: 0,Year (July 1),Population,Yearly % Change,Yearly Change,Median Age,Fertility Rate,Density (P/Km²),Urban Pop %,Urban Population
0,2020,7794798739,1.05 %,81330639,30.9,2.47,52,56.2 %,4378993944
1,2019,7713468100,1.08 %,82377060,29.8,2.51,52,55.7 %,4299438618
2,2018,7631091040,1.10 %,83232115,29.8,2.51,51,55.3 %,4219817318
3,2017,7547858925,1.12 %,83836876,29.8,2.51,51,54.9 %,4140188594
4,2016,7464022049,1.14 %,84224910,29.8,2.51,50,54.4 %,4060652683


In [16]:
#exports the data as a csv
df.to_csv('file_name.csv', index=False)
