# Extracting scheme links

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

In [2]:
# Set up headless Chrome
options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

In [3]:
scheme_data = []

try:
    base_url = "https://www.myscheme.gov.in/search"
    driver.get(base_url)

    # Click on "All Schemes" tab
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//span[text()="All Schemes"]'))
    ).click()
    time.sleep(2)  # Wait for tab content to load

    # Find the last page number dynamically, excluding '…' or other non-numeric elements
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.XPATH, '//li[contains(@class, "h-8") and not(contains(@class, "hidden"))]'))
    )
    page_numbers = driver.find_elements(By.XPATH, '//li[contains(@class, "h-8") and not(contains(@class, "hidden"))]')
    
    # Filter out non-numeric page numbers (like '...')
    numeric_page_numbers = [int(pn.text) for pn in page_numbers if pn.text.isdigit()]
    
    if numeric_page_numbers:
        last_page = numeric_page_numbers[-1]  # The last numeric page number
        print(f"Last page number: {last_page}")
    else:
        print("Could not determine last page number.")
        last_page = 1  # Default to page 1 if not found

    # Loop through all pages from 1 to last page
    for page_num in range(1, last_page + 1):
        print(f"\n Scraping page {page_num}...")

        # Wait for scheme links to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "/schemes/")]'))
        )

        # Extract all scheme links on current page
        scheme_links = driver.find_elements(By.XPATH, '//a[contains(@href, "/schemes/")]')
        print(f"Found {len(scheme_links)} links on page {page_num}")

        for link in scheme_links:
            title = link.text.strip()
            href = link.get_attribute("href").strip()
            print(f"{title} → {href}")
            scheme_data.append({"Title": title, "URL": href})

        # Click the next page if it's not the last page
        if page_num != last_page:
            try:
                next_page_btn = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, f'//li[text()="{page_num + 1}"]'))
                )
                driver.execute_script("arguments[0].click();", next_page_btn)
                time.sleep(2)  # Let new page content load
            except Exception as e:
                print(f"Could not click on page {page_num + 1}: {e}")
                break

finally:
    driver.quit()

# Save results
df = pd.DataFrame(scheme_data)
df.to_csv("myscheme_pages_click_test.csv", index=False)
print(f"\n Done! Scraped {len(df)} unique schemes and saved to myscheme_pages_click_test.csv")

Last page number: 371

 Scraping page 1...
Found 10 links on page 1
Financial Assistance To Disabled Students Pursuing (10th, 11th, 12th Equivalent Exams) → https://www.myscheme.gov.in/schemes/fadsp1012e
ICMR- Post Doctoral Fellowship → https://www.myscheme.gov.in/schemes/icmr-pdf
Tool Kit Grant for Traditional Handicrafts Experts → https://www.myscheme.gov.in/schemes/tkgthe
Snehasanthwanam → https://www.myscheme.gov.in/schemes/skerala
Scheme for Grant of Additional Scholarship to the Students of Other Backward Classes of Andaman and Nicobar Islands, for Pursuing Higher Studies Anywhere in India after Secondary Level (Except Class XI & XII) → https://www.myscheme.gov.in/schemes/sgassobcaniphsaislecxixii
National Family Benefit Scheme - Uttar Pradesh → https://www.myscheme.gov.in/schemes/nfbsup
Dr. Ambedakar Centrally Sponsored Scheme of Post-Matric Scholarships for the Economically Backward Class (EBC) Students → https://www.myscheme.gov.in/schemes/dacsspostmsebcs
Financial Assistance 

In [4]:
df = pd.read_csv("myscheme_pages_click_test.csv")

In [5]:
df.head()

Unnamed: 0,Title,URL
0,Financial Assistance To Disabled Students Purs...,https://www.myscheme.gov.in/schemes/fadsp1012e
1,ICMR- Post Doctoral Fellowship,https://www.myscheme.gov.in/schemes/icmr-pdf
2,Tool Kit Grant for Traditional Handicrafts Exp...,https://www.myscheme.gov.in/schemes/tkgthe
3,Snehasanthwanam,https://www.myscheme.gov.in/schemes/skerala
4,Scheme for Grant of Additional Scholarship to ...,https://www.myscheme.gov.in/schemes/sgassobcan...


In [7]:
df.duplicated().sum()

1

In [10]:
df[df.duplicated()== True]

Unnamed: 0,Title,URL
2936,Family Pension (BBOCWWB),https://www.myscheme.gov.in/schemes/fpbbocwwb
