## Web Scraping - Data Collection
You can collect data directly from web pages on the internet.

In [1]:
%pip install requests beautifulsoup4 lxml




In [2]:
# Install a catch-all 'ignore' filter so no Python warnings are shown.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used in this notebook — consider narrowing or removing it.
from warnings import filterwarnings
filterwarnings('ignore')

## Get the head title for any webpage

In [3]:
# URL of the Wikipedia page used as the scraping example.
url1 = 'https://en.wikipedia.org/wiki/World_population'
url1

'https://en.wikipedia.org/wiki/World_population'

In [4]:
import requests

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on a 4xx/5xx status instead of silently parsing an error page later.
response = requests.get(url1, timeout=30)
response.raise_for_status()
response

<Response [200]>

## Status 200 (OK) - the request succeeded and the page content was returned

In [5]:
# Raw response body as bytes; print only the first 1000 bytes as a preview.
data = response.content
print(data[:1000])

b'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8">\n<title>World population - Wikipedia</title>\n<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 

## Get the title with the Beautiful Soup package

In [6]:
from bs4 import BeautifulSoup

# Name the parser explicitly: without it Beautiful Soup picks whichever parser
# happens to be installed, so the parsed tree can differ between machines.
# lxml is installed by the notebook's first cell.
soup = BeautifulSoup(data, 'lxml')

In [7]:
# Attribute access on the soup returns the first <title> element it finds.
title_tag = soup.title
title_tag

<title>World population - Wikipedia</title>

In [9]:
# Text content of the <title> element.
# NOTE(review): the variable name is misspelled ('titlte'); it is left
# unchanged here because other cells may reference it.
titlte_text = title_tag.get_text()
titlte_text

'World population - Wikipedia'

## Find the h1 tag in the body

In [14]:
# First <h1> whose class list contains 'firstHeading'.
h1_tag = soup.find('h1', attrs={'class': 'firstHeading'})
h1_tag

<h1 class="firstHeading mw-first-heading" id="firstHeading"><span class="mw-page-title-main">World population</span></h1>

In [15]:
# Visible text of the page heading, with the inner <span> markup stripped.
h1_text = h1_tag.get_text()
h1_text

'World population'

## Get all the h2 and h3 section headings

In [17]:
# Every <div> carrying the 'mw-heading' class; each one wraps an <h2> or <h3>
# section heading, as the output below shows.
heading_divs = soup.find_all('div', attrs={'class': 'mw-heading'})
heading_divs

[<div class="mw-heading mw-heading2"><h2 id="History">History</h2></div>,
 <div class="mw-heading mw-heading3"><h3 id="Ancient_and_post-classical_history">Ancient and post-classical history</h3></div>,
 <div class="mw-heading mw-heading3"><h3 id="Modern_history">Modern history</h3></div>,
 <div class="mw-heading mw-heading3"><h3 id="Milestones_by_the_billions">Milestones by the billions</h3></div>,
 <div class="mw-heading mw-heading2"><h2 id="Global_demographics">Global demographics</h2></div>,
 <div class="mw-heading mw-heading2"><h2 id="Population_by_region">Population by region</h2></div>,
 <div class="mw-heading mw-heading2"><h2 id="Largest_populations_by_country">Largest populations by country</h2></div>,
 <div class="mw-heading mw-heading3"><h3 id="Ten_most_populous_countries">Ten most populous countries</h3></div>,
 <div class="mw-heading mw-heading3"><h3 id="Most_densely_populated_countries">Most densely populated countries</h3></div>,
 <div class="mw-heading mw-heading2"><h2 i

In [18]:
# Text of the <h2> inside each heading wrapper; wrappers without an <h2>
# (the h3-level ones) are skipped. Each wrapper is searched once.
h2_tags = [
    h2.text
    for h2 in (div.find('h2') for div in heading_divs)
    if h2 is not None
]
h2_tags

['History',
 'Global demographics',
 'Population by region',
 'Largest populations by country',
 'Fluctuation',
 'Mathematical approximations',
 'Number of humans who have ever lived',
 'Human population as a function of food availability',
 'See also',
 'Explanatory notes',
 'References',
 'Further reading',
 'External links']

In [20]:
# Text of the <h3> inside each heading wrapper; wrappers without an <h3>
# (the h2-level ones) are skipped. Each wrapper is searched once.
h3_tags = [
    h3.text
    for h3 in (div.find('h3') for div in heading_divs)
    if h3 is not None
]
h3_tags

['Ancient and post-classical history',
 'Modern history',
 'Milestones by the billions',
 'Ten most populous countries',
 'Most densely populated countries',
 'Annual population growth',
 'Population growth by region',
 'Past population',
 'Projections',
 'Years for world population to double',
 'Citations',
 'General and cited sources']

In [21]:
# Text of every heading wrapper, h2 and h3 alike, in document order.
heading_text = [div.get_text() for div in heading_divs]
heading_text

['History',
 'Ancient and post-classical history',
 'Modern history',
 'Milestones by the billions',
 'Global demographics',
 'Population by region',
 'Largest populations by country',
 'Ten most populous countries',
 'Most densely populated countries',
 'Fluctuation',
 'Annual population growth',
 'Population growth by region',
 'Past population',
 'Projections',
 'Mathematical approximations',
 'Years for world population to double',
 'Number of humans who have ever lived',
 'Human population as a function of food availability',
 'See also',
 'Explanatory notes',
 'References',
 'Citations',
 'General and cited sources',
 'Further reading',
 'External links']

## Find all image URLs in the webpage

In [22]:
# Every <a> with the 'mw-file-description' class; each wraps an <img> element,
# as the output below shows.
a_tags = soup.find_all('a', attrs={'class': 'mw-file-description'})
a_tags

[<a class="mw-file-description" href="/wiki/File:World_Population_Prospects.svg"><img class="mw-file-element" data-file-height="676" data-file-width="900" decoding="async" height="225" src="//upload.wikimedia.org/wikipedia/commons/thumb/0/0e/World_Population_Prospects.svg/300px-World_Population_Prospects.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/0/0e/World_Population_Prospects.svg/450px-World_Population_Prospects.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/0/0e/World_Population_Prospects.svg/600px-World_Population_Prospects.svg.png 2x" width="300"/></a>,
 <a class="mw-file-description" href="/wiki/File:Illustration_of_contemporary_and_past_human_populations_Our_World_in_Data.png"><img class="mw-file-element" data-file-height="7747" data-file-width="5201" decoding="async" height="328" src="//upload.wikimedia.org/wikipedia/commons/thumb/d/d2/Illustration_of_contemporary_and_past_human_populations_Our_World_in_Data.png/220px-Illustration_of_contempor

In [23]:
# Inspect the first matched link to see which attributes it carries.
a_tags[0]

<a class="mw-file-description" href="/wiki/File:World_Population_Prospects.svg"><img class="mw-file-element" data-file-height="676" data-file-width="900" decoding="async" height="225" src="//upload.wikimedia.org/wikipedia/commons/thumb/0/0e/World_Population_Prospects.svg/300px-World_Population_Prospects.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/0/0e/World_Population_Prospects.svg/450px-World_Population_Prospects.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/0/0e/World_Population_Prospects.svg/600px-World_Population_Prospects.svg.png 2x" width="300"/></a>

In [24]:
# Read the href attribute of the first link.
a_tags[0].get('href')

'/wiki/File:World_Population_Prospects.svg'

In [25]:
# The href values are site-relative paths, so prefix the site root to build
# an absolute URL.
home = 'https://en.wikipedia.org'
home + a_tags[0].get('href')

'https://en.wikipedia.org/wiki/File:World_Population_Prospects.svg'

In [26]:
# Display the site root used as the URL prefix.
home

'https://en.wikipedia.org'

In [27]:
# Absolute URL for every matched link: site root + the link's relative href.
# NOTE(review): these hrefs point at '/wiki/File:...' description pages, not
# at the image files themselves (those are in each <img> src attribute).
img_urls = [
    home + link.get('href')
    for link in a_tags
]
img_urls

['https://en.wikipedia.org/wiki/File:World_Population_Prospects.svg',
 'https://en.wikipedia.org/wiki/File:Illustration_of_contemporary_and_past_human_populations_Our_World_in_Data.png',
 'https://en.wikipedia.org/wiki/File:2020_1million_cities.jpg',
 'https://en.wikipedia.org/wiki/File:Expectancy_of_life.svg',
 'https://en.wikipedia.org/wiki/File:Population_pyramid_of_the_world_in_continental_groupings_2023.svg',
 'https://en.wikipedia.org/wiki/File:Global_population_cartogram.png',
 'https://en.wikipedia.org/wiki/File:People%27s_-Km%C2%B2_for_all_countries_(and_us_states,_uk_kingdoms).png',
 'https://en.wikipedia.org/wiki/File:Top_5_Country_Population_Graph_1901_to_2021.svg',
 'https://en.wikipedia.org/wiki/File:Population_Density,_v4.11,_2020_(48009093621).jpg',
 'https://en.wikipedia.org/wiki/File:World_population_(UN).svg',
 'https://en.wikipedia.org/wiki/File:Total_Fertility_Rate_Map_by_Country.svg',
 'https://en.wikipedia.org/wiki/File:World_population_counter,_Eureka,_Halifax,_

## Find all the tables and show them as DataFrames

In [28]:
# Every <table> element carrying the 'wikitable' class.
table_tags = soup.find_all('table', attrs={'class': 'wikitable'})
table_tags

[<table class="wikitable" style="text-align:center; float:right; clear:right; margin-left:8px; margin-right:0;">
 <caption>World population milestones in billions<sup class="reference" id="cite_ref-:6_58-0"><a href="#cite_note-:6-58"><span class="cite-bracket">[</span>58<span class="cite-bracket">]</span></a></sup> (Worldometers estimates)
 </caption>
 <tbody><tr>
 <th scope="row">Population
 </th>
 <th scope="col">1
 </th>
 <th scope="col">2
 </th>
 <th scope="col">3
 </th>
 <th scope="col">4
 </th>
 <th scope="col">5
 </th>
 <th scope="col">6
 </th>
 <th scope="col">7
 </th>
 <th scope="col">8
 </th>
 <th scope="col">9
 </th>
 <th scope="col">10
 </th></tr>
 <tr>
 <th scope="row">Year
 </th>
 <td>1804</td>
 <td>1927</td>
 <td>1960</td>
 <td>1974</td>
 <td>1987</td>
 <td>1999</td>
 <td>2011</td>
 <td>2022</td>
 <td><i>2037</i></td>
 <td><i>2057</i>
 </td></tr>
 <tr>
 <th scope="row">Years elapsed
 </th>
 <td>200,000+</td>
 <td>123</td>
 <td>33</td>
 <td>14</td>
 <td>13</td>
 <td>12</t

In [29]:
from io import StringIO

import pandas as pd

# read_html takes a path, URL or file-like object; passing the raw HTML string
# directly is deprecated in recent pandas, so wrap the markup in StringIO.
# read_html returns a list of frames; take the first one.
df = pd.read_html(StringIO(str(table_tags[0])))[0]
df

Unnamed: 0,Population,1,2,3,4,5,6,7,8,9,10
0,Year,1804,1927,1960,1974,1987,1999,2011,2022,2037,2057
1,Years elapsed,"200,000+",123,33,14,13,12,12,11,15,20


In [30]:
from io import StringIO

# Parse every wikitable element into a DataFrame, keeping the first frame
# read_html returns for each one. StringIO avoids the deprecated literal-HTML
# input to read_html, and the comprehension no longer rebinds `df` on every
# iteration, so the frame shown by the previous cell stays intact.
tables = [
    pd.read_html(StringIO(str(table_tag)))[0]
    for table_tag in table_tags
]

tables[0]

Unnamed: 0,Population,1,2,3,4,5,6,7,8,9,10
0,Year,1804,1927,1960,1974,1987,1999,2011,2022,2037,2057
1,Years elapsed,"200,000+",123,33,14,13,12,12,11,15,20


In [31]:
# Show the second parsed table.
tables[1]

Unnamed: 0,Region,2022 (percent),2030 (percent),2050 (percent)
0,Sub-Saharan Africa,"1,152 (14.51%)","1,401 (16.46%)","2,094 (21.62%)"
1,Northern Africa and Western Asia,549 (6.91%),617 (7.25%),771 (7.96%)
2,Central Asia and Southern Asia,"2,075 (26.13%)","2,248 (26.41%)","2,575 (26.58%)"
3,Eastern Asia and Southeastern Asia,"2,342 (29.49%)","2,372 (27.87%)","2,317 (23.92%)"
4,Europe and Northern America,"1,120 (14.10%)","1,129 (13.26%)","1,125 (11.61%)"
5,Latin America and the Caribbean,658 (8.29%),695 (8.17%),749 (7.73%)
6,Australia and New Zealand,31 (0.39%),34 (0.40%),38 (0.39%)
7,Oceania,14 (0.18%),15 (0.18%),20 (0.21%)
8,World,7942,8512,9687


## Create a WikipediaScraper class to scrape any Wikipedia URL

In [32]:
# Install pydantic via the %pip magic.
# NOTE(review): installs are easier to find when grouped in the first cell.
%pip install pydantic

Note: you may need to restart the kernel to use updated packages.
