# Data Scraping Using Python (BeautifulSoup and Requests)

In [1]:
# Import necessary libraries for web scraping and data handling
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# Fetch the page and parse it
url = 'https://en.wikipedia.org/wiki/World_population'
# Send a descriptive User-Agent: Wikimedia's policy asks clients to identify themselves,
# and requests carrying only a generic library default may be refused.
headers = {'User-Agent': 'world-population-scraper/1.0 (educational notebook)'}
# The timeout keeps this cell from hanging indefinitely on a stalled connection.
page = requests.get(url, headers=headers, timeout=30)
# Fail fast on HTTP errors so an error page is never parsed as if it were the article.
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

In [3]:
# Find the first <table> element on the webpage (find() returns a single match, not a list)
t1 = soup.find('table')

In [4]:
# Find the first <table> whose class attribute is 'wikitable sortable'.
# 'class_' (with a trailing underscore) is used because 'class' is a reserved keyword in Python.
# NOTE(review): per the BeautifulSoup docs a multi-word class_ string is compared against the
# exact attribute value, so a table with extra or reordered classes may not match — confirm.
t2 = soup.find('table', class_='wikitable sortable')  # t2 is not used by the later cells shown here

In [5]:
# Collect every <table> tag on the page and select one of them by position.
# TABLE_INDEX is zero-based, so 9 selects the tenth table. The position depends on the
# live page layout, so check the caption printed in the next cell after re-running.
TABLE_INDEX = 9
all_tables = soup.find_all('table')
if TABLE_INDEX >= len(all_tables):
    # Same exception type as a plain out-of-range index, but with an actionable message.
    raise IndexError(
        f"Expected at least {TABLE_INDEX + 1} <table> elements but found {len(all_tables)}; "
        "the page layout may have changed."
    )
table = all_tables[TABLE_INDEX]  # the tenth <table> tag on the page

In [6]:
# Print the selected table's HTML to confirm (via its caption) that it is the intended table
print(table)

<table class="wikitable sortable" style="text-align:right">
<caption>Countries ranking highly in both total population <small>(more than 20 million people)</small> and population density <small>(more than 250 people per square kilometer)</small><sup class="reference" id="cite_ref-:10_104-1"><a href="#cite_note-:10-104"><span class="cite-bracket">[</span>99<span class="cite-bracket">]</span></a></sup>
</caption>
<tbody><tr>
<th scope="col">Rank
</th>
<th scope="col">Country
</th>
<th scope="col">Population
</th>
<th scope="col">Area<br/><small>(km<sup>2</sup>)</small>
</th>
<th scope="col">Density<br/><small>(pop/km<sup>2</sup>)</small>
</th>
<th scope="col">Population <br/> trend<sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="This claim needs references to reliable sources. (November 2022)">citation needed</span></a></i>]</sup>
</th></tr>
<tr>
<td>1</td>
<td align

In [7]:
# Extract all header cells (<th>) from the selected table.
# NOTE(review): this collects every <th> inside the table, not only those in the first row;
# the output below shows six header cells, one per column, so that is fine for this table.
titles = table.find_all('th')

In [8]:
# Display the raw <th> elements to inspect the markup that still needs cleaning
titles

[<th scope="col">Rank
 </th>,
 <th scope="col">Country
 </th>,
 <th scope="col">Population
 </th>,
 <th scope="col">Area<br/><small>(km<sup>2</sup>)</small>
 </th>,
 <th scope="col">Density<br/><small>(pop/km<sup>2</sup>)</small>
 </th>,
 <th scope="col">Population <br/> trend<sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="This claim needs references to reliable sources. (November 2022)">citation needed</span></a></i>]</sup>
 </th>]

In [9]:
# Reduce each header cell to its visible text, trimmed of surrounding whitespace
table_titles = [header_cell.get_text().strip() for header_cell in titles]

print(table_titles)

['Rank', 'Country', 'Population', 'Area(km2)', 'Density(pop/km2)', 'Population  trend[citation needed]']


In [10]:
# Create an empty DataFrame with one column per cleaned header name
# (displayed in the next cell to check the column labels)
df = pd.DataFrame(columns=table_titles)

In [11]:
# Display the (still empty) DataFrame to verify its column headers
df

Unnamed: 0,Rank,Country,Population,Area(km2),Density(pop/km2),Population trend[citation needed]


In [12]:
# Find all rows (<tr>) in the table.
# Despite its name, column_data holds one element per table row; as the printed HTML
# above shows, the first row contains the <th> header cells.
column_data = table.find_all('tr')

In [13]:
# Build the DataFrame from the table's data rows.
#
# Rows are gathered in a plain list and the DataFrame is constructed once at the end.
# This keeps the cell idempotent (re-running it rebuilds df instead of appending
# duplicate rows to the existing one) and avoids growing a DataFrame row by row.
rows = []
for row in column_data[1:]:  # skip the first row, which holds the headers
    # Extract and clean the text of every data cell (<td>) in the current row
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if cells:  # a row without any <td> cells carries no data, so leave it out
        rows.append(cells)

# A row whose cell count differs from the number of headers still raises a ValueError here.
df = pd.DataFrame(rows, columns=table_titles)


In [14]:
# Display the populated DataFrame to check the scraped rows
df

Unnamed: 0,Rank,Country,Population,Area(km2),Density(pop/km2),Population trend[citation needed]
0,1,India,1389637446,3287263,423,Growing
1,2,Pakistan,242923845,796095,305,Rapidly growing
2,3,Bangladesh,165650475,148460,1116,Growing
3,4,Japan,124214766,377915,329,Declining[101]
4,5,Philippines,114597229,300000,382,Growing
5,6,Vietnam,103808319,331210,313,Growing
6,7,United Kingdom,67791400,243610,278,Growing
7,8,South Korea,51844834,99720,520,Steady
8,9,Taiwan,23580712,35980,655,Steady
9,10,Sri Lanka,23187516,65610,353,Growing


In [15]:
# Save the DataFrame to 'World_population.csv' (a path relative to the working directory).
# index=False leaves the row index out of the file.
df.to_csv('World_population.csv', index=False)
