### <b>Importing Libraries</b>

In [21]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import service as ChromeService
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import time

#### <b>Configuring WebDriver:</b>

In [22]:
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

#### <b>Initializing WebDriver</b>

In [23]:
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service,options=chrome_options)

#### <b>Opening the WebPage</b>

In [24]:
# to open the webpage
driver.get('http://www.commonlii.org/pk/legis/const/1973/')

# to wait for the page to load
#time.sleep(3)

#### <b>Explicitly defining the Wait until "toc" element is loaded</b>

In [25]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

#### <b>Printing the Page Source</b>

In [27]:
print(driver.page_source)

<html><head>
<title>
Constitution of the Islamic Republic of Pakistan 1973 - Table of Contents
</title>
</head>
<body bgcolor="#FFFFFF" text="#000000" link="#0044BB" alink="#FF0000" vlink="#660099" marginheight="20" marginwidth="20">
<p>
<table border="0" width="100%" cellspacing="2" cellpadding="2">
<tbody><tr valign="top">
<td align="left" rowspan="2" width="10%">
<a href="/"><img alt="CommonLII" border="0" src="/images/logos/CommonLII_small.jpg"></a></td>
<td align="left" width="80%">

<small>[<a href="/">Home</a>]
[<a href="/databases.html">Databases</a>]
[<a href="http://www.commonlii.org">CommonLII</a>]
[<a href="/forms/search1.html">Search</a>]
[<a href="/commonlii/feedback.html">Feedback</a>]
[<a href="/commonlii/help/">Help</a>]
</small>
<h1>Constitution of the Islamic Republic of Pakistan</h1>
</td>
</tr>
</tbody></table>
<small><b>You are here:</b>&nbsp; <a href="/">CommonLII</a> &gt;&gt; <a href="/databases.html">Databases</a> 
&gt;&gt; <a href="../">Constitution of the Isl

#### <b>Observation:</b>
- By looking at HTML source, we can examine that there are no specific `Unique Identifiers`, or `Class Names` that can identify "Table Of Contents (TOC)".
- Instead, the `Table of Contents (TOC)` is a nested `<ul>` (unordered list), with each section linked by `<a>` tags.
- Hence, we'll utilize each chapter wrapped in `<li>` elements within `<ul>` tags, forming unordered list structure under the main `ul` for the table of contents.
- Moreover, we will target `//ul` and its `<li>` childeren to extract the TOC items.

In [29]:
try: 
    toc = WebDriverWait(driver,10).until(
        EC.presence_of_element_located((By.XPATH,"//ul"))
    )
    toc_links = toc.find_elements(By.TAG_NAME,'a')
    toc_texts = [link.text for link in toc_links]
    print(f'Found {len(toc_links)} links')
    print(f'Found {len(toc_texts)} texts')

except Exception as e:
    print('Failed to find links or texts:',e)

Found 41 links
Found 41 texts


#### <b>Extracting the links and texts from Table of Contents.</b>

In [31]:
# XPaths queries for extracting links within a table of contents
toc_links = driver.find_elements(By.XPATH,'//a')

# for loop testing
if not toc_links:
    print('No links found in table of contents')
else:
    print(f'Found{len(toc_links)} links')

#toc_texts = driver.find_elements(By.XPATH, '//div[@id="toc"]//a/text()')
toc_texts=[link.text for link in toc_links]

# for loop testing
if not toc_texts:
    print("No texts found for the links")
else:
    print(f"Found{len(toc_texts)} texts.")

Found66 links
Found66 texts.


#### <b>Storing the Extracted Data as a list of Dictionaries</b>

In [32]:
toc_data = [{'URL':link.get_attribute('href'),'Text': text} 
            for link, text in zip(toc_links,toc_texts)]

#### <b>Print the Extracted Data</b>

In [33]:
for item in toc_data:
    print(f"URL:{item['URL']},'Text':{item['Text']}")

URL:http://www.commonlii.org/,'Text':
URL:http://www.commonlii.org/,'Text':Home
URL:http://www.commonlii.org/databases.html,'Text':Databases
URL:http://www.commonlii.org/,'Text':CommonLII
URL:http://www.commonlii.org/forms/search1.html,'Text':Search
URL:http://www.commonlii.org/commonlii/feedback.html,'Text':Feedback
URL:http://www.commonlii.org/commonlii/help/,'Text':Help
URL:http://www.commonlii.org/,'Text':CommonLII
URL:http://www.commonlii.org/databases.html,'Text':Databases
URL:http://www.commonlii.org/pk/legis/const/,'Text':Constitution of the Islamic Republic of Pakistan
URL:http://www.commonlii.org/form/search/?mask=pk/legis/const,'Text':Database Search
URL:http://www.commonlii.org/form/search/?mask=pk/legis/const&title=1,'Text':Name Search
URL:http://www.commonlii.org/commonlii/help/,'Text':Help
URL:http://www.commonlii.org/pk/legis/const/1973/1.html,'Text':Preamble
URL:http://www.commonlii.org/pk/legis/const/1973/2.html,'Text':Part I: Introductory [Articles 1-6]
URL:http://ww

In [34]:
for item in toc_data:
    print(f"URL: {item['URL']}, Text: {item['Text']}")

URL: http://www.commonlii.org/, Text: 
URL: http://www.commonlii.org/, Text: Home
URL: http://www.commonlii.org/databases.html, Text: Databases
URL: http://www.commonlii.org/, Text: CommonLII
URL: http://www.commonlii.org/forms/search1.html, Text: Search
URL: http://www.commonlii.org/commonlii/feedback.html, Text: Feedback
URL: http://www.commonlii.org/commonlii/help/, Text: Help
URL: http://www.commonlii.org/, Text: CommonLII
URL: http://www.commonlii.org/databases.html, Text: Databases
URL: http://www.commonlii.org/pk/legis/const/, Text: Constitution of the Islamic Republic of Pakistan
URL: http://www.commonlii.org/form/search/?mask=pk/legis/const, Text: Database Search
URL: http://www.commonlii.org/form/search/?mask=pk/legis/const&title=1, Text: Name Search
URL: http://www.commonlii.org/commonlii/help/, Text: Help
URL: http://www.commonlii.org/pk/legis/const/1973/1.html, Text: Preamble
URL: http://www.commonlii.org/pk/legis/const/1973/2.html, Text: Part I: Introductory [Articles 1-6

In [35]:
# Print the extracted data
for link, text in zip(toc_links, toc_texts):
    print(f'URL: {link.get_attribute("href")},Text:{text}')

URL: http://www.commonlii.org/,Text:
URL: http://www.commonlii.org/,Text:Home
URL: http://www.commonlii.org/databases.html,Text:Databases
URL: http://www.commonlii.org/,Text:CommonLII
URL: http://www.commonlii.org/forms/search1.html,Text:Search
URL: http://www.commonlii.org/commonlii/feedback.html,Text:Feedback
URL: http://www.commonlii.org/commonlii/help/,Text:Help
URL: http://www.commonlii.org/,Text:CommonLII
URL: http://www.commonlii.org/databases.html,Text:Databases
URL: http://www.commonlii.org/pk/legis/const/,Text:Constitution of the Islamic Republic of Pakistan
URL: http://www.commonlii.org/form/search/?mask=pk/legis/const,Text:Database Search
URL: http://www.commonlii.org/form/search/?mask=pk/legis/const&title=1,Text:Name Search
URL: http://www.commonlii.org/commonlii/help/,Text:Help
URL: http://www.commonlii.org/pk/legis/const/1973/1.html,Text:Preamble
URL: http://www.commonlii.org/pk/legis/const/1973/2.html,Text:Part I: Introductory [Articles 1-6]
URL: http://www.commonlii.or

#### <b>Saving the Extracted Data to a file (i.e., csv)</b>


In [37]:
import csv
with open('1976_data',mode='w',newline='') as file:
    writer = csv.DictWriter(file, fieldnames=['URL',"Text"])
    writer.writeheader()
    writer.writerows(toc_data)

## <b>B. BeautifulSoup Script</b> 
- In this script, we will read the URLs from `1976_data.csv` and extract its content.

#### <b>B.1: Importing the Libraries:</b> 

In [39]:
import csv
import requests
from bs4 import BeautifulSoup
import json

#### <b>B.2: Loading URLS and Texts from the CSV file:</b> 

In [52]:
import os
print(os.getcwd())

f:\scrapper\1973-scrapper


In [53]:
# making an empty list
data = []

with open('F:\\scrapper\\1973-scrapper\\1976_data', mode='r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        data.append({"URL":row["URL"],"Text":row["Text"]})

def fetch_text_from_url(url):
    try:
        response = requests.get(url,timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text,'html.parser')

        # Extracting texts from specific HTML tags
        paragraphs = soup.find_all(['p','h1','h2','h3'])
        page_text = '\n'.join([p.get_text(strip=True) for p in paragraphs])

        return page_text
    
    except Exception as e:
        print(f'Error fetching {url}:{e}')
        return None

#### <b>B.3: Extracting Text from URLs & Storing it in a List:</b> 

In [54]:
scrapped_data = []
for item in data:
    url = item['URL']
    label = item['Text']
    text = fetch_text_from_url(url)

    if text:
        scrapped_data.append({
            'URL': url,
            'Label':label,
            'Content': text
        }
        )

Error fetching http://www.commonlii.org/pk/legis/const/1973/13.html:HTTPConnectionPool(host='www.commonlii.org', port=80): Read timed out. (read timeout=10)
Error fetching http://www.commonlii.org/pk/legis/const/1973/13.html#c4:HTTPConnectionPool(host='www.commonlii.org', port=80): Read timed out. (read timeout=10)


#### <b>B.4: Save Scrapped Data to a JSON File:</b> 

In [55]:
with open('scrapped_data.json','w',encoding='utf-8') as file:
     json.dump(scrapped_data, file, ensure_ascii=False, indent=4)

     print('Scraping complete!')

Scraping complete!


## <b>C. Data Preprocessing</b>

#### <b>C.1 Loading the Scraped Data from the JSON file.</b>

In [59]:
import pandas as pd

# Loading the scrapped data into Pandas DataFrame
df = pd.read_json(r'F:\scrapper\1973-scrapper\scrapped_data.json',dtype=str,encoding='utf-8') 
df.head()

Unnamed: 0,URL,Label,Content
0,http://www.commonlii.org/,,[Advanced \n Search][LawCite][Hel...
1,http://www.commonlii.org/,Home,[Advanced \n Search][LawCite][Hel...
2,http://www.commonlii.org/databases.html,Databases,CommonLII Databases\n\nAfrica\nBotswana|Gambia...
3,http://www.commonlii.org/,CommonLII,[Advanced \n Search][LawCite][Hel...
4,http://www.commonlii.org/forms/search1.html,Search,Find>using autosearch>this Boolean query>any o...


In [47]:
print(df.dtypes)

print(df['Content'].apply(len))



URL        object
Label      object
Content    object
dtype: object
0     1931
1     1931
2     1374
3     1931
4      428
      ... 
59    3734
60     915
61    1825
62    6449
63    1026
Name: Content, Length: 64, dtype: int64


In [48]:
print(df['Content'].describe())

count                                                    64
unique                                                   31
top       [Home]\n[Databases]\n[CommonLII]\n[Search]\n[F...
freq                                                      6
Name: Content, dtype: object


#### <b>C.2 Data Cleaning </b>

##### <b>C.2.1 Removing HTML Tags</b>

In [60]:
from bs4 import BeautifulSoup

# Function to remove HTML Tags
def remove_html_tags(text):
    if isinstance(text,str):
        soup=BeautifulSoup(text,'html.parser')
        return soup.get_text()
    return text

# Applying the function to all string columns in the dataframe 
df_cleaned = df.applymap(remove_html_tags)

# Checking the Cleaned Data
df_cleaned.head()

  df_cleaned = df.applymap(remove_html_tags)
  soup=BeautifulSoup(text,'html.parser')


Unnamed: 0,URL,Label,Content
0,http://www.commonlii.org/,,[Advanced \n Search][LawCite][Hel...
1,http://www.commonlii.org/,Home,[Advanced \n Search][LawCite][Hel...
2,http://www.commonlii.org/databases.html,Databases,CommonLII Databases\n\nAfrica\nBotswana|Gambia...
3,http://www.commonlii.org/,CommonLII,[Advanced \n Search][LawCite][Hel...
4,http://www.commonlii.org/forms/search1.html,Search,Find>using autosearch>this Boolean query>any o...


##### <b>C.2.2: Handling Special Characters</b>

In [61]:
import html

# Function to decode HTML special Characters
def decode_special_characters(text):
    if isinstance(text,str):
        return html.unescape(text)
    return text

# Applying this function to "Content" feature
df_cleaned['Content'] = df_cleaned['Content'].apply(decode_special_characters)

# Reviewing first 5 records
df_cleaned.head()


Unnamed: 0,URL,Label,Content
0,http://www.commonlii.org/,,[Advanced \n Search][LawCite][Hel...
1,http://www.commonlii.org/,Home,[Advanced \n Search][LawCite][Hel...
2,http://www.commonlii.org/databases.html,Databases,CommonLII Databases\n\nAfrica\nBotswana|Gambia...
3,http://www.commonlii.org/,CommonLII,[Advanced \n Search][LawCite][Hel...
4,http://www.commonlii.org/forms/search1.html,Search,Find>using autosearch>this Boolean query>any o...


##### <b>C.2.3: Handling Extra Whitespaces & Newlines</b>

In [62]:
import re

# Function to remove extra whitespaces & newlines
def clean_whitespace_and_newlines(text):
    if isinstance(text,str):
        # it will replace multiple space/newlines with single space
        text = re.sub(r'\s+',' ',text)
        # removing leading and trailing whitespaces
        text = text.strip()
    return text

# Applying function to target column
df_cleaned['Content']=df['Content'].apply(clean_whitespace_and_newlines)

# Checking first five records
df_cleaned.head()

Unnamed: 0,URL,Label,Content
0,http://www.commonlii.org/,,[Advanced Search][LawCite][Help] 1444 database...
1,http://www.commonlii.org/,Home,[Advanced Search][LawCite][Help] 1444 database...
2,http://www.commonlii.org/databases.html,Databases,CommonLII Databases Africa Botswana|Gambia|Gha...
3,http://www.commonlii.org/,CommonLII,[Advanced Search][LawCite][Help] 1444 database...
4,http://www.commonlii.org/forms/search1.html,Search,Find>using autosearch>this Boolean query>any o...


##### <b>C.2.4: Removing Unwanted Segments</b>
- Here, we will remove the unwanted segments like `Footer`, `Navigation Links`, etc.

In [75]:
# defining function to remove unwanted segments
def remove_unwanted_segments(text):
    if isinstance(text,str):
        unwanted_keywords = ['Privacy Policy', 'Terms of Services','Feedback','Contact us','Legal Disclaimer']
        for keyword in unwanted_keywords:
            text =text.replace(keyword,"")
        
        # removing any text in square brackets
        text = re.sub(r'\[.*?]','',text)
        # removing 1444 database term
        text = re.sub(r'1444\s+database','',text)
        # remove database (case-insensitive)
        text = re.sub(r'database','',text, flags=re.IGNORECASE)
        # Removing "CommonLII" branding term
        text = re.sub(r'CommonLII','',text)
        text = re.sub(r'Commonlii\s*>>?','',text, flags=re.IGNORECASE)
        # replacing the pipe "|" symbol with space
        text = re.sub(r'\|',' ',text) 
        
        # Removing specific bracketed terms individually
        bracketed_terms = [
            "Home", "Databases", "Commonlii", "Search", 
            "Feedback", "Help", "Disclaimer", "Privacy Policy"
        ]
        for term in bracketed_terms:
            text = re.sub(fr'\[{term}\]', '', text, flags=re.IGNORECASE)

        # Removing any remaining isolated brackets or multiple whitespace
        text = re.sub(r'\[\s*\]', '', text)

        text = re.sub(r'Commonlii:Copyright Policy Disclaimers','',text,flags=re.IGNORECASE)

        return text
    
# Applying the function to the content column
df_cleaned['Content'] = df_cleaned['Content'].apply(remove_unwanted_segments)

# Checking first five records
df_cleaned.head()

Unnamed: 0,URL,Label,Content
0,http://www.commonlii.org/,,[Advanced \n Search]\n1444 sfrom ...
1,http://www.commonlii.org/,Home,[Advanced \n Search]\n1444 sfrom ...
2,http://www.commonlii.org/databases.html,Databases,Commonlii s\n\nAfrica\nBotswana Gambia Ghana K...
3,http://www.commonlii.org/,CommonLII,[Advanced \n Search]\n1444 sfrom ...
4,http://www.commonlii.org/forms/search1.html,Search,Find>Using Autosearch>This Boolean Query>Any O...


##### <b>C.2.4: Removing Links (URLS)</b>

In [68]:
def remove_urls(text):
    if isinstance(text,str):
        # Using RE to match URLs
        text = re.sub(r'http[s]?://\S+','',text) # removes both http & https URLs
        text = re.sub(r'www.\.\S+','',text) # removes www URLs
    return text

# applying the fucntion
df_cleaned['Content'] = df_cleaned['Content'].apply(remove_urls)

# Reviewing first five records
df_cleaned.head()

Unnamed: 0,URL,Label,Content
0,http://www.commonlii.org/,,[Advanced \n Search]\n1444 sfrom ...
1,http://www.commonlii.org/,Home,[Advanced \n Search]\n1444 sfrom ...
2,http://www.commonlii.org/databases.html,Databases,Commonlii s\n\nAfrica\nBotswana Gambia Ghana K...
3,http://www.commonlii.org/,CommonLII,[Advanced \n Search]\n1444 sfrom ...
4,http://www.commonlii.org/forms/search1.html,Search,Find>Using Autosearch>This Boolean Query>Any O...


##### <b>C.2.5: Standardizing Case</b>

In [69]:
def format_heading(text):
    return text.title() # converting to title case

# Applying it to headings
df_cleaned['Content'] = df['Content'].apply(format_heading)

# 
df_cleaned.head()

Unnamed: 0,URL,Label,Content
0,http://www.commonlii.org/,,[Advanced \n Search][Lawcite][Hel...
1,http://www.commonlii.org/,Home,[Advanced \n Search][Lawcite][Hel...
2,http://www.commonlii.org/databases.html,Databases,Commonlii Databases\n\nAfrica\nBotswana|Gambia...
3,http://www.commonlii.org/,CommonLII,[Advanced \n Search][Lawcite][Hel...
4,http://www.commonlii.org/forms/search1.html,Search,Find>Using Autosearch>This Boolean Query>Any O...


In [76]:
for i in range(14,16):
    print(f'record{i+1}')
    print(df_cleaned['Content'].iloc[i])
    print('='*50)

record15





Constitution Of The Islamic Republic Of Pakistanyou Are Here:s>>Constitution Of The Islamic Republic Of Pakistan>>1973>> Part I



Part Iintroductory1. The Republic And Its Territories(1) Pakistan Shall Be A Federal Republic To Be Known As The Islamic Republic 
    Of Pakistan, Hereinafter Referred To As Pakistan.(2) The Territories Of Pakistan Shall Comprise :-(A) The Provinces Of Baluchistan, The North-West Frontier, The Punjab And 
      Sind;(B) The Islamabad Capital Territory, Hereinafter Referred To As The Federal 
      Capital;(C) Federally Administered Tribal Areas; And(D) Such States And Territories As Are Or May Be Included In Pakistan, 
      Whether By Accession Or Otherwise.(3) Majlis-E-Shoora (Parliament) May By Law Admit Into The Federation New 
    States Or Areas On Such Terms And Conditions As It Thinks Fit.2. Islam To Be State Religionislam Shall Be The State Religion Of Pakistan.2A. The Objective Resolution To Form Part Of Substantive Provisions.The P

In [77]:
df.head()

Unnamed: 0,URL,Label,Content
0,http://www.commonlii.org/,,[Advanced \n Search][LawCite][Hel...
1,http://www.commonlii.org/,Home,[Advanced \n Search][LawCite][Hel...
2,http://www.commonlii.org/databases.html,Databases,CommonLII Databases\n\nAfrica\nBotswana|Gambia...
3,http://www.commonlii.org/,CommonLII,[Advanced \n Search][LawCite][Hel...
4,http://www.commonlii.org/forms/search1.html,Search,Find>using autosearch>this Boolean query>any o...


## <b>D. Exporting to MongoDB</b>

In [None]:
import pymongo
from pymongo import MongoClient
import pandas as pd

# connecting to the MongoDB
client = MongoClient("mongodb://localhost:27017")

# Defining the database and collection
db = client['1973_constitution']
collection = db['1973_collection']

# Convert DataFrame to list of dictionaries for MongoDB
data_dict = df_cleaned.to_dict('records')

try: 
    collection.insert_many(data_dict)
    print('Data Successfully exported to MongoDB')

except Exception as e:
    print(f'An Error Occurred: {e}')

# Close the MongoDB Collection
client.close()

Data Successfully exported to MongoDB
