<a href="https://colab.research.google.com/github/KrishaManandhar/Web-Scraping/blob/main/1_Webscrap_using_Beautiful_Soup_and_pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# <b>Scrape data from HTML tables into a Dataframe using BeautifulSoup and Pandas</b>

## Importing necessary modules

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Url contaning html tables with data about world population

In [2]:
popu_url = "https://en.wikipedia.org/wiki/World_population"

## Use 'get' to download contents of webpage in text format

In [3]:
popu_data = requests.get(popu_url).text

## To parse a document, pass it into the BeautifulSoup constructor <br>
<p>Then the BeautifulSoup object here 'soup' represents the document as a nested data structure</p>

In [4]:
popu_soup = BeautifulSoup(popu_data, "html5lib")

## Find all the table

In [5]:
popu_table = popu_soup.find_all('table') # here in html table is represented by the tag <table>

## Find number of tables

In [6]:
len(popu_table) # how many tables were found by checking the length of the table list

24

## Find the index of table of 10 most densely populated countries 

In [7]:
# Searching of 10 most densely populated countries
for index,table in enumerate(popu_table):
  if("10 most densely populated countries" in str(table)):
    table_index = index
print(table_index)

4


## Using method prettify() to display the HTML in nested structure

In [8]:
print(popu_table[table_index].prettify())

<table class="wikitable sortable" style="text-align:right">
 <caption>
  10 most densely populated countries
  <small>
   (with population above 5 million)
  </small>
  <sup class="reference" id="cite_ref-:10_108-0">
   <a href="#cite_note-:10-108">
    [103]
   </a>
  </sup>
 </caption>
 <tbody>
  <tr>
   <th scope="col">
    Rank
   </th>
   <th scope="col">
    Country
   </th>
   <th scope="col">
    Population
   </th>
   <th scope="col">
    Area
    <br/>
    <small>
     (km
     <sup>
      2
     </sup>
     )
    </small>
   </th>
   <th scope="col">
    Density
    <br/>
    <small>
     (pop/km
     <sup>
      2
     </sup>
     )
    </small>
   </th>
  </tr>
  <tr>
   <td>
    1
   </td>
   <td align="left">
    <span class="flagicon">
     <img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/48/Flag_of_Singapore.svg/23px-Flag_of_Singapore.svg.png" srcset="//upload

## Making an empty dataframe

In [9]:
population_data = pd.DataFrame(columns=["Rank","Country","Population","Area","Density"])

## Scraping data from HTML tables into a DataFrame

In [10]:
for popu_row in popu_table[table_index].tbody.find_all('tr'):
  popu_col = popu_row.find_all("td")
  if (popu_col!=[]):
    rank = popu_col[0].text
    country = popu_col[1].text
    population = popu_col[2].text
    area = popu_col[3].text
    density = popu_col[4].text
    population_data = population_data.append({"Rank":rank, "Country":country, "Population":population, "Area":area,"Density":density},ignore_index = True)
population_data  

Unnamed: 0,Rank,Country,Population,Area,Density
0,1,Singapore,5921231,719,"8,235\n"
1,2,Bangladesh\n,"165,650,475\n","148,460\n","1,116\n"
2,3,\n Palestine[104]\n\n,"5,223,000\n","6,025\n",867\n
3,4,Lebanon,5296814,10400,509\n
4,5,Taiwan,23580712,35980,655\n
5,6,South Korea,51844834,99720,520\n
6,7,Rwanda,13173730,26338,500\n
7,8,Israel\n,"8,914,885\n","21,937\n",406\n
8,9,Haiti,11334637,27750,408\n
9,10,Netherlands\n,"17,400,824\n","41,543\n",419\n
