This Scraping is Manual, with each link provided and data extracted for each link

In [1]:
# scrape_articles.py
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [None]:
# Main HTML entities to be noticed while scraping constitution articles (documents) are:
# 1) h2 class = doc_title : Title of the document (articles)
# 2) section class = akn-section: The full section of the article
# 3) section class = akn-subsection: The sub sections of the full section

In [3]:
def scrape_constitution_article(url):
    """Scrape a constitution article page from Indian Kanoon."""
    # getting responses
    response = requests.get(url)
    if response.status_code != 200:
        print(f"‚ùå Failed to access {url}")
        return None

    # extracting html based text from the url
    soup = BeautifulSoup(response.text, "html.parser")

    # Extracting doc_title and main section from the document page, not search page.
    title = soup.find("h2", class_="doc_title")
    if title:
        title = title.get_text(strip=True)
    else:
        title = "No title found"

    # Extracting main section : section class="akn-section"
    main_section = soup.find("section", class_="akn-section")
    sections_data = []

    # Extracting sub section: section class="akn-subsection"
    if main_section:
        subsections = main_section.find_all("section", class_="akn-subsection")
        for sub in subsections:
            text = sub.get_text(separator=" ", strip=True)
            if not text or text.strip() == "":
                text = "Not Available"
                sections_data.append(text)
            else:
                sections_data.append(text)
    else:
        print(f"‚ö†Ô∏è No section found for {url}")

    return {
        "url": url,
        "title": title,
        "content": " ".join(sections_data)
    }

In [None]:
# Manually adding 25 articles data from the website and then scraping desired details
article_urls = [
    "https://indiankanoon.org/doc/367586/", # Article 14
    "https://indiankanoon.org/doc/609295/", # Article 15
    "https://indiankanoon.org/doc/211089/", # Article 16
    "https://indiankanoon.org/doc/1987997/", # Article 17
    "https://indiankanoon.org/doc/1163710/", # Article 18
    "https://indiankanoon.org/doc/1218090/", # Article 19
    "https://indiankanoon.org/doc/655638/" , # Article 20
    "https://indiankanoon.org/doc/1199182/", # Article 21
    "https://indiankanoon.org/doc/581566/",  # Artcle 22
    "https://indiankanoon.org/doc/1071750/", # Article 23
    "https://indiankanoon.org/doc/1540780/", # Article 24
    "https://indiankanoon.org/doc/631708/", # Article 25
    "https://indiankanoon.org/doc/1858991/", # Article 26
    "https://indiankanoon.org/doc/211413/", # Article 27
    "https://indiankanoon.org/doc/1734560/", # Article 28
    "https://indiankanoon.org/doc/1734560/", # Article 29
    "https://indiankanoon.org/doc/1983234/", # Article 30
    "https://indiankanoon.org/doc/354224/", # Article 31
    "https://indiankanoon.org/doc/981147/",  # Artcle 32
    "https://indiankanoon.org/doc/829916/", # Article 33
    "https://indiankanoon.org/doc/846153/", # Article 34
    "https://indiankanoon.org/doc/448465/", # Article 35
    "https://indiankanoon.org/doc/784506/", # Article 36
    "https://indiankanoon.org/doc/76375/",  # Article 37 - Not extracted
    "https://indiankanoon.org/doc/1673816/", # Article 38
    "https://indiankanoon.org/doc/555882/", # Article 39 - Not extracted
    "https://indiankanoon.org/doc/1714884/", # Article 40 - Not extracted
]

results = []

for url in article_urls:
    print(f"üîç Scraping: {url}")
    data = scrape_constitution_article(url)
    if data:
        results.append(data)
    time.sleep(1)  # polite delay

üîç Scraping: https://indiankanoon.org/doc/367586/
üîç Scraping: https://indiankanoon.org/doc/609295/
üîç Scraping: https://indiankanoon.org/doc/211089/
üîç Scraping: https://indiankanoon.org/doc/1987997/
üîç Scraping: https://indiankanoon.org/doc/1163710/
üîç Scraping: https://indiankanoon.org/doc/1218090/
üîç Scraping: https://indiankanoon.org/doc/655638/
üîç Scraping: https://indiankanoon.org/doc/1199182/
üîç Scraping: https://indiankanoon.org/doc/581566/
üîç Scraping: https://indiankanoon.org/doc/1071750/
üîç Scraping: https://indiankanoon.org/doc/1540780/
üîç Scraping: https://indiankanoon.org/doc/631708/
üîç Scraping: https://indiankanoon.org/doc/1858991/
üîç Scraping: https://indiankanoon.org/doc/211413/
üîç Scraping: https://indiankanoon.org/doc/1734560/
üîç Scraping: https://indiankanoon.org/doc/1734560/
üîç Scraping: https://indiankanoon.org/doc/1983234/
üîç Scraping: https://indiankanoon.org/doc/354224/
üîç Scraping: https://indiankanoon.org/doc/981147/
ü

In [None]:
# Converting list to data frame and seeing the result
df = pd.DataFrame(results)
df.to_csv("constitution_articles.csv", index=False, encoding="utf-8-sig")

# print("‚úÖ Saved all articles successfully to constitution_articles.csv")
df.head(25)

Unnamed: 0,url,title,content
0,https://indiankanoon.org/doc/367586/,Article 14 in Constitution of India,
1,https://indiankanoon.org/doc/609295/,Article 15 in Constitution of India,(1) The State shall not discriminate against a...
2,https://indiankanoon.org/doc/211089/,Article 16 in Constitution of India,(1) There shall be equality of opportunity for...
3,https://indiankanoon.org/doc/1987997/,Article 17 in Constitution of India,
4,https://indiankanoon.org/doc/1163710/,Article 18 in Constitution of India,"(1) No title, not being a military or academic..."
5,https://indiankanoon.org/doc/1218090/,Article 19 in Constitution of India,(1) All citizens shall have the right- (a) to ...
6,https://indiankanoon.org/doc/655638/,Article 20 in Constitution of India,(1) No person shall be convicted of any offenc...
7,https://indiankanoon.org/doc/1199182/,Article 21 in Constitution of India,
8,https://indiankanoon.org/doc/581566/,Article 22 in Constitution of India,(1) No person who is arrested shall be detaine...
9,https://indiankanoon.org/doc/1071750/,Article 23 in Constitution of India,(1) Traffic in human beings and beggar and oth...


In [None]:
# 25 articles with title and url is complete
# Some of the content has not been displayed, probably they have dynamic HTML structure or some other class name.