  # <h1>                          WEB SCRAPING USING PYTHON 


 we will hit the following target to achieve our goal

• Data extraction from the web using Python's Beautiful Soup module

• Data manipulation and cleaning using Python's Pandas library

1) importing the required libraries

In [None]:
import pandas as pd

In [2]:
import requests
from bs4 import BeautifulSoup

2) we are extracting the data from worldometer(world-population-by-year)

In [3]:
# requests module helps to fetch the website
url = "https://www.worldometers.info/world-population/world-population-by-year/"
response = requests.get(url)
data = response.text

<h4> Data extraction from the web using Python's Beautiful Soup module

In [4]:
# beautiful soup helps to fetch html tags, that can helpful to exract data from those tags
soup = BeautifulSoup(data, 'lxml')
type(soup)

bs4.BeautifulSoup

In [25]:
title = soup.title
title.string

'World Population by Year - Worldometer'

In [6]:
# Print out the text
text = soup.get_text()
#print(soup.text)

In [7]:
# fetch all 'a' tags from html code 
soup.find_all('a')

[<a class="navbar-brand" href="/"><img border="0" src="/img/worldometers-logo.gif" title="Worldometer"/></a>,
 <a href="/coronavirus/"><span style="color:#FF9900; font-weight:bold">Coronavirus</span></a>,
 <a href="/population/">Population</a>,
 <a href="/">W</a>,
 <a href="/population/">Population</a>,
 <a href="/population/world/">World</a>,
 <a href="/">www.Worldometers.info</a>,
 <a href="https://esa.un.org/unpd/wpp/">World Population Prospects: The 2019 Revision</a>,
 <a href="/"><img border="0" class="img-footer" src="/img/worldometers-logo-footer.png"/></a>,
 <a href="/about/">about</a>,
 <a href="/faq/">faq</a>,
 <a href="/languages/">languages</a>,
 <a href="/contact/">contact</a>,
 <a data-placement="bottom" data-toggle="tooltip" href="/newsletter-subscribe/" title="Newsletter"><i class="fa fa-bullhorn fa-round"></i></a>,
 <a href="https://twitter.com/Worldometers"><i class="fa fa-twitter fa-round"></i></a>,
 <a href="https://www.facebook.com/Worldometers.info"><i class="fa f

In [8]:
all_links = soup.find_all("a")
for link in all_links:
    print(link.get("href"))

/
/coronavirus/
/population/
/
/population/
/population/world/
/
https://esa.un.org/unpd/wpp/
/
/about/
/faq/
/languages/
/contact/
/newsletter-subscribe/
https://twitter.com/Worldometers
https://www.facebook.com/Worldometers.info
/disclaimer/


In [9]:
rows = soup.find_all('tr')
print(rows[:10])

[<tr> <th>Year</th><th><div align="right">World Population</div></th><th style="text-align:center">Yearly<br/>Change</th><th style="text-align:center">Net<br/>Change</th><th style="text-align:center">Density<br/>(P/Km²)</th><th style="text-align:center">Urban<br/>Pop</th><th style="text-align:center">Urban<br/>Pop %</th> </tr>, <tr> <td>2020</td> <td style="font-weight: bold; text-align :right">7,794,798,739</td> <td style="text-align:right">1.05 %</td><td style="text-align:right">81,330,639</td><td style="text-align:right">52</td> <td style="text-align:right">4,378,993,944</td> <td style="text-align:right">56 %</td> </tr>, <tr> <td>2019</td> <td style="font-weight: bold; text-align:right">7,713,468,100</td> <td style="text-align:right">1.08 %</td><td style="text-align:right">82,377,060</td><td style="text-align:right">52</td> <td style="text-align:right">4,299,438,618</td> <td style="text-align:right">56 %</td> </tr>, <tr> <td>2018</td> <td style="font-weight: bold; text-align:right">

In [10]:
for row in rows:
    row_td = row.find_all('td')
print(row_td)
type(row_td)

[<td>-5000</td>, <td style="font-weight: bold; text-align:right">5,000,000</td>, <td style="text-align:right"> </td>, <td style="text-align:right"> </td>, <td style="text-align:right"> </td>, <td style="text-align:right"> </td>, <td style="text-align:right"> </td>]


bs4.element.ResultSet

In [11]:
str_cells = str(row_td)
cleantext = BeautifulSoup(str_cells, "lxml").get_text()
print(cleantext)

[-5000, 5,000,000,  ,  ,  ,  ,  ]


In [12]:
# reguler expression module helps to extract useful info from our tags code. 
import re

list_rows = []
for row in rows:
    cells = row.find_all('td')
    str_cells = str(cells)
    clean = re.compile('<.*?>')
    clean2 = (re.sub(clean, '',str_cells))
    list_rows.append(clean2)
print(clean2)
type(clean2)

[-5000, 5,000,000,  ,  ,  ,  ,  ]


str

In [13]:
df = pd.DataFrame(list_rows)
df.head(10)

Unnamed: 0,0
0,[]
1,"[2020, 7,794,798,739, 1.05 %, 81,330,639, 52, ..."
2,"[2019, 7,713,468,100, 1.08 %, 82,377,060, 52, ..."
3,"[2018, 7,631,091,040, 1.10 %, 83,232,115, 51, ..."
4,"[2017, 7,547,858,925, 1.12 %, 83,836,876, 51, ..."
5,"[2016, 7,464,022,049, 1.14 %, 84,224,910, 50, ..."
6,"[2015, 7,379,797,139, 1.16 %, 84,506,374, 50, ..."
7,"[2014, 7,295,290,765, 1.17 %, 84,708,789, 49, ..."
8,"[2013, 7,210,581,976, 1.19 %, 84,753,917, 48, ..."
9,"[2012, 7,125,828,059, 1.20 %, 84,633,758, 48, ..."


<h4> Data manipulation and cleaning using Python's Pandas library

In [14]:
# now, we can split our messy table into column format
df1 = df[0].str.split(', ', expand=True)
df1.head(10)

Unnamed: 0,0,1,2,3,4,5,6
0,[],,,,,,
1,[2020,7794798739.0,1.05 %,81330639.0,52.0,4378993944.0,56 %]
2,[2019,7713468100.0,1.08 %,82377060.0,52.0,4299438618.0,56 %]
3,[2018,7631091040.0,1.10 %,83232115.0,51.0,4219817318.0,55 %]
4,[2017,7547858925.0,1.12 %,83836876.0,51.0,4140188594.0,55 %]
5,[2016,7464022049.0,1.14 %,84224910.0,50.0,4060652683.0,54 %]
6,[2015,7379797139.0,1.16 %,84506374.0,50.0,3981497663.0,54 %]
7,[2014,7295290765.0,1.17 %,84708789.0,49.0,3902831934.0,53 %]
8,[2013,7210581976.0,1.19 %,84753917.0,48.0,3824990329.0,53 %]
9,[2012,7125828059.0,1.20 %,84633758.0,48.0,3747842586.0,53 %]


In [15]:
col_labels = soup.find_all('th')

In [16]:
df1[0] = df1[0].str.strip('[')
df1.head(10)

Unnamed: 0,0,1,2,3,4,5,6
0,],,,,,,
1,2020,7794798739.0,1.05 %,81330639.0,52.0,4378993944.0,56 %]
2,2019,7713468100.0,1.08 %,82377060.0,52.0,4299438618.0,56 %]
3,2018,7631091040.0,1.10 %,83232115.0,51.0,4219817318.0,55 %]
4,2017,7547858925.0,1.12 %,83836876.0,51.0,4140188594.0,55 %]
5,2016,7464022049.0,1.14 %,84224910.0,50.0,4060652683.0,54 %]
6,2015,7379797139.0,1.16 %,84506374.0,50.0,3981497663.0,54 %]
7,2014,7295290765.0,1.17 %,84708789.0,49.0,3902831934.0,53 %]
8,2013,7210581976.0,1.19 %,84753917.0,48.0,3824990329.0,53 %]
9,2012,7125828059.0,1.20 %,84633758.0,48.0,3747842586.0,53 %]


In [29]:
# we will clean our table heading and to summrised our data in perticuler column
all_header = []
col_str = str(col_labels)
cleantext2 = BeautifulSoup(col_str, "lxml").get_text()
all_header.append(cleantext2)
print(all_header)

['[Year, World Population, YearlyChange, NetChange, Density(P/Km²), UrbanPop, UrbanPop %]']


In [30]:
df2 = pd.DataFrame(all_header)
df2.head()

Unnamed: 0,0
0,"[Year, World Population, YearlyChange, NetChan..."


In [19]:
df3 = df2[0].str.split(',', expand=True)
df3.head()

Unnamed: 0,0,1,2,3,4,5,6
0,[Year,World Population,YearlyChange,NetChange,Density(P/Km²),UrbanPop,UrbanPop %]


In [20]:
frames = [df3, df1]

df4 = pd.concat(frames)
df4.head(10)

Unnamed: 0,0,1,2,3,4,5,6
0,[Year,World Population,YearlyChange,NetChange,Density(P/Km²),UrbanPop,UrbanPop %]
0,],,,,,,
1,2020,7794798739,1.05 %,81330639,52,4378993944,56 %]
2,2019,7713468100,1.08 %,82377060,52,4299438618,56 %]
3,2018,7631091040,1.10 %,83232115,51,4219817318,55 %]
4,2017,7547858925,1.12 %,83836876,51,4140188594,55 %]
5,2016,7464022049,1.14 %,84224910,50,4060652683,54 %]
6,2015,7379797139,1.16 %,84506374,50,3981497663,54 %]
7,2014,7295290765,1.17 %,84708789,49,3902831934,53 %]
8,2013,7210581976,1.19 %,84753917,48,3824990329,53 %]


In [21]:
df5 = df4.rename(columns=df4.iloc[0])
df5.head()

Unnamed: 0,[Year,World Population,YearlyChange,NetChange,Density(P/Km²),UrbanPop,UrbanPop %]
0,[Year,World Population,YearlyChange,NetChange,Density(P/Km²),UrbanPop,UrbanPop %]
0,],,,,,,
1,2020,7794798739,1.05 %,81330639,52,4378993944,56 %]
2,2019,7713468100,1.08 %,82377060,52,4299438618,56 %]
3,2018,7631091040,1.10 %,83232115,51,4219817318,55 %]


In [22]:
df5.rename(columns={'[Year': 'Year'},inplace=True)
df5.rename(columns={'UrbanPop %]': 'urbanPop'},inplace=True)
df5.head()

Unnamed: 0,Year,World Population,YearlyChange,NetChange,Density(P/Km²),UrbanPop,UrbanPop %]
0,[Year,World Population,YearlyChange,NetChange,Density(P/Km²),UrbanPop,UrbanPop %]
0,],,,,,,
1,2020,7794798739,1.05 %,81330639,52,4378993944,56 %]
2,2019,7713468100,1.08 %,82377060,52,4299438618,56 %]
3,2018,7631091040,1.10 %,83232115,51,4219817318,55 %]


In [23]:
df5.head(10)

Unnamed: 0,Year,World Population,YearlyChange,NetChange,Density(P/Km²),UrbanPop,UrbanPop %]
0,[Year,World Population,YearlyChange,NetChange,Density(P/Km²),UrbanPop,UrbanPop %]
0,],,,,,,
1,2020,7794798739,1.05 %,81330639,52,4378993944,56 %]
2,2019,7713468100,1.08 %,82377060,52,4299438618,56 %]
3,2018,7631091040,1.10 %,83232115,51,4219817318,55 %]
4,2017,7547858925,1.12 %,83836876,51,4140188594,55 %]
5,2016,7464022049,1.14 %,84224910,50,4060652683,54 %]
6,2015,7379797139,1.16 %,84506374,50,3981497663,54 %]
7,2014,7295290765,1.17 %,84708789,49,3902831934,53 %]
8,2013,7210581976,1.19 %,84753917,48,3824990329,53 %]


In [24]:
df6 = df5.drop(df5.index[0])
df6.head()

Unnamed: 0,Year,World Population,YearlyChange,NetChange,Density(P/Km²),UrbanPop,UrbanPop %]
1,2020,7794798739,1.05 %,81330639,52,4378993944,56 %]
2,2019,7713468100,1.08 %,82377060,52,4299438618,56 %]
3,2018,7631091040,1.10 %,83232115,51,4219817318,55 %]
4,2017,7547858925,1.12 %,83836876,51,4140188594,55 %]
5,2016,7464022049,1.14 %,84224910,50,4060652683,54 %]


In [31]:
# so, the last step is to save our dataframe in csv or excel file.
df6.to_excel('G:\world_pop_year.xlsx')

we got our file

In [33]:
world_pop = pd.read_excel('world_pop_year.xlsx')
world_pop.head()

Unnamed: 0.1,Unnamed: 0,Year,World Population,YearlyChange,NetChange,Density(P/Km²),UrbanPop,UrbanPop %]
0,1,2020,7794798739,1.05 %,81330639,52,4378993944,56 %]
1,2,2019,7713468100,1.08 %,82377060,52,4299438618,56 %]
2,3,2018,7631091040,1.10 %,83232115,51,4219817318,55 %]
3,4,2017,7547858925,1.12 %,83836876,51,4140188594,55 %]
4,5,2016,7464022049,1.14 %,84224910,50,4060652683,54 %]


<h5> ----------------------------MISSION ACCOMPLISHED--------------------------------------