# Web Scraping

## Importing Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Inputting the URL

In [2]:
# Download the Worldometer "world population by year" page and parse its HTML.
url = "https://www.worldometers.info/world-population/world-population-by-year/"
# HTTP request headers; note that a later cell rebinds `headers` to the
# table's column titles.
headers = { "User-Agent": "Mozilla/5.0" }

# requests has no default timeout, so bound the wait to keep a stalled
# server from hanging the kernel.
resp = requests.get(url, headers=headers, timeout=30)
# Raise on 4xx/5xx responses instead of silently parsing an error page.
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "html.parser")


## Extracting the First HTML Table using BeautifulSoup

In [3]:
# Take the first <table> element of the parsed page.
table = soup.find("table")
if table is None:
    # find() returns None when nothing matches; fail here with a clear
    # message rather than with an AttributeError in a later cell.
    raise ValueError("No <table> element found in the downloaded page")
table

<table class="datatable w-full border border-zinc-200" data-locale="en" data-options='{"searchable":false,"sortable":false,"pageSize":false}' data-sort-col="-1" data-sort-dir="asc" id=""> <thead> <tr> <th class="px-2 border-e border-zinc-200 font-semibold border-b-3 py-1" data-sortable="false"> <span>Year</span> </th><th class="px-2 border-e border-zinc-200 font-semibold border-b-3 py-1" data-sortable="false" data-type="number"> <span>Population</span> </th><th class="px-2 border-e border-zinc-200 font-semibold border-b-3 py-1" data-sortable="false" data-type="number"> <span>Yearly % Change</span> </th><th class="px-2 border-e border-zinc-200 font-semibold border-b-3 py-1" data-sortable="false" data-type="number"> <span>Net Change</span> </th><th class="px-2 border-e border-zinc-200 font-semibold border-b-3 py-1" data-sortable="false" data-type="number"> <span>Density (P/Km²)</span> </th> </tr> </thead> <tbody> <tr> <td class="px-2 border-e border-zinc-200 text-end py-1.5 border-b"> 20

## Extracting Table Headers from an HTML Table

In [4]:
# Column titles: the whitespace-stripped text of every <th> cell in the table.
headers = []
for header_cell in table.find_all("th"):
    headers.append(header_cell.text.strip())
headers

['Year', 'Population', 'Yearly % Change', 'Net Change', 'Density (P/Km²)']

## Populating a Pandas DataFrame from HTML Table Rows

In [5]:

# Every <tr> after the first one (the first is skipped as the header row).
body_rows = table.find_all("tr")[1:]

# For each row, take the stripped text of its <td> cells with the thousands
# separators (commas) removed.
parsed_rows = (
    [cell.text.strip().replace(",", "") for cell in row.find_all("td")]
    for row in body_rows
)

# Keep only rows that actually contain <td> cells.
rows = [values for values in parsed_rows if values]

df = pd.DataFrame(rows, columns=headers)
df


Unnamed: 0,Year,Population,Yearly % Change,Net Change,Density (P/Km²)
0,2025,8231613070,0.85%,69640498,55
1,2024,8161972572,0.87%,70237642,55
2,2023,8091734930,0.88%,70327738,54
3,2022,8021407192,0.84%,66958801,54
4,2021,7954448391,0.86%,67447099,53
...,...,...,...,...,...
88,900,240000000,,,2
89,800,220000000,,,1
90,700,210000000,,,1
91,600,200000000,,,1


In [6]:
# Every value in `df` is a string produced by the scraper, so an empty table
# cell is stored as "" rather than NaN and dropna() alone would keep it.
# Turn blank / whitespace-only strings into NA first, then drop the rows that
# lack a yearly change or a net change.
df_clean = (
    df.replace(r"^\s*$", pd.NA, regex=True)
    .dropna(subset=['Yearly % Change', 'Net Change'])
)

In [7]:
# Write the cleaned DataFrame (`df_clean`) to a UTF-8 CSV file, leaving out
# the index column.
output_path = "world_population_data.csv"
df_clean.to_csv(output_path, encoding='utf-8', index=False)

In [8]:
import pyodbc

# Connectivity check: open a connection to the local SQL Server Express
# instance and print the server's version string.
conn = pyodbc.connect(
    'DRIVER={ODBC Driver 17 for SQL Server};'
    r'SERVER=KEROPC\SQLEXPRESS;'
    'DATABASE=world_population;'
    'Trusted_Connection=yes;'
)

try:
    cursor = conn.cursor()
    cursor.execute("SELECT @@VERSION")
    for row in cursor:
        print(row)
finally:
    # Always release the connection, even when the query raises.
    conn.close()


('Microsoft SQL Server 2014 - 12.0.2000.8 (X64) \n\tFeb 20 2014 20:04:26 \n\tCopyright (c) Microsoft Corporation\n\tExpress Edition (64-bit) on Windows NT 6.3 <X64> (Build 22621: ) (Hypervisor)\n',)
