<a href="https://colab.research.google.com/github/Matlup45/Web-Scraping-projects/blob/main/Web_Scraping_Project_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the necessary packages
!pip install selenium pandas
!apt-get update
!apt install -y chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

Collecting selenium
  Downloading selenium-4.23.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.0-py3-none-any.whl.metadata (8.8 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.23.1-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.26.0-py3-none-any.whl (475 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.7/475.7 kB[0m [31m10.

In [2]:
# Import required libraries
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

In [3]:
# Set up Chrome options
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

In [6]:
# Initialize the WebDriver
driver = webdriver.Chrome(options=options)

base_url = "https://hprera.nic.in/PublicDashboard"
driver.get(base_url)


In [7]:
try:
    # Wait until the "Registered Projects" tab is loaded
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.ID, 'reg-Projects'))
    ).click()
    print("Registered Projects tab clicked")

    # Wait until the projects are loaded
    WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div#reg-Projects div.shadow a[onclick*="tab_project_main_ApplicationPreview"]'))
    )
    print("Projects loaded")

    # Find project links
    project_links = driver.find_elements(By.CSS_SELECTOR, 'div#reg-Projects div.shadow a[onclick*="tab_project_main_ApplicationPreview"]')
    print(f"Found {len(project_links)} project links")

    projects = []

    # Process each project link (limit to first 6)
    for link in project_links[:6]:
        # Use JavaScript to click on the link
        try:
            driver.execute_script("arguments[0].click();", link)

            # Wait for the modal to be visible and loading spinner to disappear
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.ID, 'project-menu-html'))
            )
            WebDriverWait(driver, 30).until(
                EC.invisibility_of_element_located((By.CSS_SELECTOR, '.spinner-border'))
            )

            # Extract project details using XPath
            details = {}
            detail_elements = driver.find_elements(By.XPATH, '//*[@id="project-menu-html"]//table/tbody/tr')

            for element in detail_elements:
                try:
                    key = element.find_element(By.XPATH, './td[1]').text.strip()
                    value = element.find_element(By.XPATH, './td[2]').text.strip()
                    details[key] = value
                except Exception as e:
                    print(f"Error extracting details from element: {e}")

            print(f"Details fetched: {details}")
            projects.append(details)

            # Close the modal
            driver.find_element(By.CSS_SELECTOR, 'button.close').click()
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, 'reg-Projects'))
            )
        except Exception as e:
            print(f"Error processing link: {e}")
            continue

finally:
    # Close the WebDriver
    driver.quit()

Registered Projects tab clicked
Projects loaded
Found 196 project links
Details fetched: {'Name': 'MANAVINDER SINGH', 'Gender': 'Male', 'Father/Mother/Guardian Name': 'Lt. Sh. MUKHINDER SINGH', 'Photograph': '', 'Promoter Type': 'Individual', 'PAN No.': 'ACLPS2284H PAN Card', 'Mobile No.': '8527733503', 'Alternate Mobile No.': '-NA-', 'Email Id': 'manav@imperialholding.in', 'Alternate Email Id': 'bhawna.kapoor@imperialholding.in', 'Correspondence Address': 'Villette Kothi, Khalini, Shimla, Khalini, Shimla, Himachal Pradesh (171002) Address Proof', 'Permanent Address': 'Villette Kothi, Khalini, Shimla, Khalini, Shimla, Himachal Pradesh (171002) Address Proof', 'GSTIN No.': '-NA-', 'Authority Letter': '-NA-', 'Whether Himachali Agriculturist?': 'Yes'}
Details fetched: {'Name': 'MANAVINDER SINGH', 'Gender': 'Male', 'Father/Mother/Guardian Name': 'Lt. Sh. MUKHINDER SINGH', 'Photograph': '', 'Promoter Type': 'Individual', 'PAN No.': 'ACLPS2284H PAN Card', 'Mobile No.': '8527733503', 'Altern

In [8]:
# Check if we have projects
if projects:
    df = pd.DataFrame(projects)
    print(df)

    df.to_csv('registered_projects.csv', index=False)
else:
    print("No projects found or an error occurred.")

                                  Name Gender Father/Mother/Guardian Name  \
0                     MANAVINDER SINGH   Male     Lt. Sh. MUKHINDER SINGH   
1                     MANAVINDER SINGH   Male     Lt. Sh. MUKHINDER SINGH   
2                     MANAVINDER SINGH   Male     Lt. Sh. MUKHINDER SINGH   
3              UMA BAGOLIA AND D KONDA    NaN                         NaN   
4                      MS URBAN GREENS    NaN                         NaN   
5  M/S. JANTA LAND PROMOTERS PVT. LTD.    NaN                         NaN   

  Photograph   Promoter Type              PAN No.  Mobile No.  \
0                 Individual  ACLPS2284H PAN Card  8527733503   
1                 Individual  ACLPS2284H PAN Card  8527733503   
2                 Individual  ACLPS2284H PAN Card  8527733503   
3        NaN  Non-Individual  AFDPB7079J PAN File  9816245566   
4        NaN  Non-Individual  AAGFU9110M PAN File  9816188888   
5        NaN  Non-Individual  AABCJ3450D PAN File  8284922445   

  Al