In [37]:
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# --- CONFIGURATION ---
# Page that main() opens; report links are read from the table rows on this page.
# NOTE(review): the query string contains two '?' separators
# ("?baseLocale=hi?dynamicPage=...") -- confirm this is the form the portal expects.
REPORTS_PAGE_URL = "https://www.dgca.gov.in/digigov-portal/?baseLocale=hi?dynamicPage=IncidentReports/500006/0/viewApplicationDtlsReq"
# Directory Firefox is told to save downloads into (created under the current
# working directory if it does not exist yet).
DOWNLOAD_FOLDER = os.path.join(os.getcwd(), "DGCA_Downloaded_PDFs_Firefox")
# Timeout, in seconds, handed to WebDriverWait for explicit waits on page elements.
WAIT_TIMEOUT = 45
# --- END OF CONFIGURATION ---

def _wait_for_download(folder, files_before, timeout=90, poll_interval=1):
    """Poll *folder* until a finished download that is not in *files_before* appears.

    Args:
        folder: Directory the browser saves downloads into.
        files_before: Collection of file names present before the download
            was triggered.
        timeout: Maximum number of seconds to wait.
        poll_interval: Seconds to sleep between directory scans.

    Returns:
        The name of one newly created file once no new ``.part`` file remains,
        or ``None`` if *timeout* elapses first.
    """
    # monotonic() is immune to wall-clock adjustments while we are waiting.
    started = time.monotonic()
    while True:
        new_files = [f for f in os.listdir(folder) if f not in files_before]
        # Firefox keeps in-progress data in a "<name>.part" file. Match on the
        # suffix only, so a finished file such as "report.part1.pdf" is not
        # mistaken for a download that is still running.
        if new_files and not any(f.endswith(".part") for f in new_files):
            return new_files[0]
        if time.monotonic() - started > timeout:
            return None
        time.sleep(poll_interval)


def _wait_for_table_refresh(driver, old_links, timeout=10):
    """Best-effort wait for the previous page's links to be replaced.

    Assumes the table swaps its row elements when the page changes (TODO
    confirm against the live site). If that never happens within *timeout*
    seconds the function simply returns and the caller carries on.
    """
    if not old_links:
        # Nothing to watch; fall back to a short fixed pause.
        time.sleep(2)
        return
    try:
        WebDriverWait(driver, timeout).until(EC.staleness_of(old_links[0]))
    except TimeoutException:
        # Rows were not replaced in time; proceed rather than abort the run.
        pass


def main():
    """Download every report linked from the DGCA reports table using Firefox.

    Creates DOWNLOAD_FOLDER if needed, starts Firefox configured to save PDFs
    without prompting, opens REPORTS_PAGE_URL, clicks each link found in the
    table on the current page and waits for the resulting download, then
    follows the numbered pagination buttons until no next page is found.
    The browser is always closed on exit; errors are printed, not raised.
    """
    print("🚀 Starting Simplified DGCA Downloader (for Firefox)...")

    os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)

    # Configure Firefox Options for automatic PDF downloading
    firefox_options = webdriver.FirefoxOptions()
    # folderList=2 makes Firefox use the custom directory given in browser.download.dir.
    firefox_options.set_preference("browser.download.folderList", 2)
    firefox_options.set_preference("browser.download.dir", DOWNLOAD_FOLDER)
    firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf,application/octet-stream")
    # Disable the built-in PDF viewer so PDFs are saved instead of rendered.
    firefox_options.set_preference("pdfjs.disabled", True)

    # Setup Selenium WebDriver for Firefox
    print("Setting up Firefox WebDriver...")
    service = Service(GeckoDriverManager().install())
    driver = webdriver.Firefox(service=service, options=firefox_options)

    # Everything after the browser exists runs inside try/finally so the
    # Firefox process is shut down even if window setup itself fails.
    try:
        driver.maximize_window()
        wait = WebDriverWait(driver, WAIT_TIMEOUT)

        print("Navigating to the reports page...")
        driver.get(REPORTS_PAGE_URL)

        current_page = 1
        while True:
            print(f"\n📄 Processing Page {current_page}...")
            wait.until(EC.visibility_of_element_located((By.XPATH, "//tbody/tr/td/a")))

            report_links = driver.find_elements(By.XPATH, "//tbody/tr/td/a")
            print(f"Found {len(report_links)} reports on this page.")

            for index, link in enumerate(report_links, start=1):
                print(f"  -> Downloading report #{index}...")

                # Per-report failures are reported and skipped so one bad
                # link does not abort the whole run.
                try:
                    files_before = set(os.listdir(DOWNLOAD_FOLDER))

                    # JavaScript click works even when the link is off-screen.
                    driver.execute_script("arguments[0].click();", link)

                    downloaded = _wait_for_download(DOWNLOAD_FOLDER, files_before)
                    if downloaded is None:
                        print("     ❌ Download timed out.")
                    else:
                        print(f"     ✅ Download complete: {downloaded}")
                except Exception as e:
                    print(f"     ❌ An error occurred for this report: {e}")

            # Pagination logic to click page numbers
            print("\nFinished all reports on this page. Looking for next page number...")
            try:
                next_page_number = current_page + 1
                next_page_link = wait.until(EC.element_to_be_clickable((By.XPATH, f"//a[contains(@class, 'paginate_button') and text()='{next_page_number}']")))
                print(f"Clicking page {next_page_number}...")
                # Use a JS click for pagination as well to be safe
                driver.execute_script("arguments[0].click();", next_page_link)
                current_page += 1
            except (TimeoutException, NoSuchElementException):
                print("No more pages found. Script complete.")
                break

            # Give the table a chance to swap in the next page's rows before
            # the links are collected again at the top of the loop.
            _wait_for_table_refresh(driver, report_links)

    except Exception as e:
        print(f"\nAn unexpected script-level error occurred: {e}")
    finally:
        print("\nClosing the browser.")
        driver.quit()

# Run the downloader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

🚀 Starting Simplified DGCA Downloader (for Firefox)...
Setting up Firefox WebDriver...
Navigating to the reports page...

📄 Processing Page 1...
Found 149 reports on this page.
  -> Downloading report #1...
     ✅ Download complete: IMW final report.pdf
  -> Downloading report #2...
     ✅ Download complete: Final Investigation Report.pdf
  -> Downloading report #3...
     ✅ Download complete: Investigation Report VT-CIH.pdf
  -> Downloading report #4...
     ✅ Download complete: VT-CAY Report.pdf
  -> Downloading report #5...
     ✅ Download complete: VTATJ V5.pdf
  -> Downloading report #6...
     ✅ Download complete: Final Report VT-MSP dtd 09122024.pdf
  -> Downloading report #7...
     ✅ Download complete: Final Report VT-EUK_250109_214511.pdf
  -> Downloading report #8...
     ✅ Download complete: Final Report VT-RKM Dated 07-11-2024.pdf
  -> Downloading report #9...
     ✅ Download complete: Final Investigation Report on runway excursion of VT-AHB at Chandrapur airport.pdf
  -> 