In [1]:
%pip install -q google-colab-selenium
import google_colab_selenium as gs
driver = gs.Chrome()
driver.get("https://www.google.com")
print("✅ Test Success: Page title =", driver.title)
driver.quit()

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m1.0/1.6 MB[0m [31m30.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25h

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Test Success: Page title = Google


In [3]:
# ═══════════════════════════════════════════════════════════════════════
# Delhi District Courts Cause List Scraper - Google Colab (BEST FIX: google-colab-selenium)
# Works with: https://delhicourts.nic.in/
# Copy this ENTIRE code into ONE cell and run!
# ═══════════════════════════════════════════════════════════════════════

# ─── STEP 1: SETUP (Auto-Handles Chrome + Driver) ───
# Announce the setup phase, then install the third-party dependencies.
print("🚀 Setting up Delhi Courts Scraper...")
# `%pip` installs into the environment of the running kernel; `-q` keeps pip's output short.
# NOTE(review): lxml and PyPDF2 are not imported anywhere in the visible code — confirm they are still needed.
%pip install -q google-colab-selenium selenium beautifulsoup4 reportlab requests lxml PyPDF2

print("✅ Setup complete!")

# ─── STEP 2: IMPORTS ───
import os, time, json, warnings, re
from datetime import datetime, timedelta
from typing import Dict, List, Optional
from IPython.display import display, FileLink, HTML
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
import requests
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib import colors
from reportlab.lib.units import inch
import google_colab_selenium as gs  # Handles browser + driver magic
# Installs a catch-all "ignore" filter: from here on, Python warnings (including
# those raised by the imported libraries) are no longer shown.
warnings.filterwarnings('ignore')

print("✅ Libraries imported!")

# ─── STEP 3: DELHI COURTS SCRAPER (gs.Chrome for Reliability) ───
class DelhiCourtsScraper:
    """Scraper for Delhi District Courts cause-list PDFs.

    Three strategies are available for locating PDFs:

    * ``scrape_by_date_method`` - probe guessed file names under the complex's
      ``cause_list`` base URL.
    * ``scrape_interactive_portal`` - drive the site's search form in the browser.
    * ``scrape_cause_list_page`` - harvest candidate links from the complex's
      landing page.

    Downloaded files are written to ``<self.dir>/pdfs``. The scraping methods
    report ordinary failures with ``print`` and return an empty list; they do
    not swallow ``KeyboardInterrupt``, so a run can always be cancelled.
    """

    # Delhi court complexes with their URLs (unchanged)
    COURT_COMPLEXES = {
        "Central Delhi": {
            "url": "https://delhicourts.nic.in/central-district",
            "cause_list": "https://delhicourts.nic.in/writereaddata/Upload/CauseList/Central/"
        },
        "East Delhi": {
            "url": "https://delhicourts.nic.in/east-district",
            "cause_list": "https://delhicourts.nic.in/writereaddata/Upload/CauseList/East/"
        },
        "New Delhi": {
            "url": "https://delhicourts.nic.in/newdelhi-district",
            "cause_list": "https://delhicourts.nic.in/writereaddata/Upload/CauseList/NewDelhi/"
        },
        "North Delhi": {
            "url": "https://delhicourts.nic.in/north-district",
            "cause_list": "https://delhicourts.nic.in/writereaddata/Upload/CauseList/North/"
        },
        "North East Delhi": {
            "url": "https://delhicourts.nic.in/northeast-district",
            "cause_list": "https://delhicourts.nic.in/writereaddata/Upload/CauseList/NorthEast/"
        },
        "North West Delhi": {
            "url": "https://delhicourts.nic.in/northwest-district",
            "cause_list": "https://delhicourts.nic.in/writereaddata/Upload/CauseList/NorthWest/"
        },
        "Shahdara Delhi": {
            "url": "https://delhicourts.nic.in/shahdara-district",
            "cause_list": "https://delhicourts.nic.in/writereaddata/Upload/CauseList/Shahdara/"
        },
        "South Delhi": {
            "url": "https://delhicourts.nic.in/south-district",
            "cause_list": "https://delhicourts.nic.in/writereaddata/Upload/CauseList/South/"
        },
        "South East Delhi": {
            "url": "https://delhicourts.nic.in/southeast-district",
            "cause_list": "https://delhicourts.nic.in/writereaddata/Upload/CauseList/SouthEast/"
        },
        "South West Delhi": {
            "url": "https://delhicourts.nic.in/southwest-district",
            "cause_list": "https://delhicourts.nic.in/writereaddata/Upload/CauseList/SouthWest/"
        },
        "West Delhi": {
            "url": "https://delhicourts.nic.in/west-district",
            "cause_list": "https://delhicourts.nic.in/writereaddata/Upload/CauseList/West/"
        }
    }

    def __init__(self):
        """Create the output directories and start the browser session."""
        self.dir = "/content/delhi_courts_output"
        os.makedirs(f"{self.dir}/pdfs", exist_ok=True)
        os.makedirs(f"{self.dir}/json", exist_ok=True)

        print("🌐 Starting browser...")
        # Magic: gs.Chrome() auto-installs matching driver + sets Colab-safe options
        self.driver = gs.Chrome()
        self.wait = WebDriverWait(self.driver, 20)
        print("✅ Browser ready!")

    @staticmethod
    def _safe_name(text, max_len=80):
        """Turn arbitrary text (e.g. link text) into a safe file-name fragment.

        Whitespace runs become ``_`` and every character other than word
        characters, ``.`` and ``-`` is dropped, so path separators cannot
        survive. The result is capped at ``max_len`` characters and falls back
        to ``"file"`` when nothing usable is left.
        """
        collapsed = re.sub(r"\s+", "_", (text or "").strip())
        cleaned = re.sub(r"[^\w.\-]", "", collapsed)[:max_len]
        return cleaned or "file"

    def get_complexes(self):
        """Get list of court complexes"""
        return list(self.COURT_COMPLEXES.keys())

    def scrape_cause_list_page(self, complex_name, date_str):
        """Collect candidate cause-list links from a complex's landing page.

        Args:
            complex_name: Key of ``COURT_COMPLEXES``.
            date_str: Accepted for signature parity with the other scrape
                methods; it is not used to filter the links.

        Returns:
            A list of ``{"text", "url"}`` dicts for every anchor whose text
            mentions "cause" or "daily" or whose href contains ".pdf".
            Empty on an unknown complex or any error.
        """
        try:
            if complex_name not in self.COURT_COMPLEXES:
                print(f"❌ Unknown complex: {complex_name}")
                return []

            complex_url = self.COURT_COMPLEXES[complex_name]["url"]
            print(f"📍 Opening {complex_name} court website...")

            self.driver.get(complex_url)
            time.sleep(3)

            # Look for cause list links
            print("🔍 Searching for cause list links...")

            cause_list_links = []
            for link in self.driver.find_elements(By.TAG_NAME, "a"):
                href = link.get_attribute("href")
                label = link.text
                text = label.lower()
                if href and ("cause" in text or "daily" in text or ".pdf" in href.lower()):
                    cause_list_links.append({
                        "text": label,
                        "url": href
                    })

            print(f"✅ Found {len(cause_list_links)} potential cause list links")
            return cause_list_links

        except Exception as e:
            print(f"❌ Error: {str(e)[:200]}")
            return []

    def download_pdf_from_url(self, url, filename):
        """Download ``url`` into ``<self.dir>/pdfs/<filename>``.

        Returns:
            The saved file path, or ``None`` when the request or the write fails.
        """
        try:
            print(f"📥 Downloading: {filename}")
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            filepath = f"{self.dir}/pdfs/{filename}"
            with open(filepath, 'wb') as f:
                f.write(response.content)

            print(f"✅ Saved: {filename}")
            return filepath

        except Exception as e:
            print(f"❌ Download failed: {str(e)[:100]}")
            return None

    def scrape_by_date_method(self, complex_name, date_str):
        """Probe guessed PDF file names for ``date_str`` under the complex's base URL.

        Args:
            complex_name: Key of ``COURT_COMPLEXES``.
            date_str: Date in ``DD-MM-YYYY`` form.

        Returns:
            A list holding at most one result dict (``complex``, ``date``,
            ``url``, ``file``, ``success``). Probing stops after the first
            successful download because every pattern maps to the same local
            file name. Empty on an unknown complex, a malformed date, or when
            no pattern responds with HTTP 200.
        """
        results = []

        try:
            if complex_name not in self.COURT_COMPLEXES:
                print(f"❌ Unknown complex: {complex_name}")
                return []

            # Parse date
            dt = datetime.strptime(date_str, "%d-%m-%Y")

            # Common URL patterns for Delhi courts
            patterns = [
                f"{dt.strftime('%d%m%Y')}.pdf",
                f"{dt.strftime('%d-%m-%Y')}.pdf",
                f"causelist_{dt.strftime('%d%m%Y')}.pdf",
                f"CauseList_{dt.strftime('%d%m%Y')}.pdf",
                f"{dt.strftime('%d.%m.%Y')}.pdf",
            ]

            base_url = self.COURT_COMPLEXES[complex_name]["cause_list"]
            filename = f"{self._safe_name(complex_name)}_{date_str.replace('-', '')}.pdf"

            for pattern in patterns:
                url = base_url + pattern
                print(f"🔍 Trying: {url}")

                try:
                    response = requests.head(url, timeout=10)
                except Exception:
                    # Best effort: an unreachable candidate just means "try the next one".
                    continue

                if response.status_code != 200:
                    continue

                print(f"✅ Found PDF at: {url}")
                filepath = self.download_pdf_from_url(url, filename)
                if filepath:
                    results.append({
                        "complex": complex_name,
                        "date": date_str,
                        "url": url,
                        "file": filepath,
                        "success": True
                    })
                    # Further hits would only overwrite the same local file.
                    break

            return results

        except Exception as e:
            print(f"❌ Error in date method: {str(e)[:100]}")
            return []

    def scrape_interactive_portal(self, complex_name, date_str):
        """Drive the site's search form and download the PDF links it shows.

        Each form step (open cause-list link, pick district, enter date,
        submit) is best effort: a failing step prints a warning and the method
        carries on, so the PDFs collected at the end come from whatever page
        the browser is on at that point.

        Returns:
            A list of result dicts (``complex``, ``date``, ``name``, ``file``,
            ``success``); empty on error or when no PDF link is present.
        """
        try:
            # Navigate to the main cause list page
            base_url = "https://delhicourts.nic.in/"
            print(f"📍 Navigating to {base_url}")

            self.driver.get(base_url)
            time.sleep(3)

            # Look for "Cause List" or "Daily Board" link
            try:
                cause_list_link = self.driver.find_element(
                    By.XPATH,
                    "//a[contains(text(), 'Cause List') or contains(text(), 'Daily Board')]"
                )
                cause_list_link.click()
                time.sleep(3)
            except Exception:
                print("⚠️ Could not find cause list link")

            # Try to find district selection
            try:
                district_select = Select(self.driver.find_element(By.ID, "district"))
                district_select.select_by_visible_text(complex_name)
                time.sleep(2)
            except Exception:
                print("⚠️ Could not select district")

            # Try to enter date
            try:
                date_input = self.driver.find_element(By.ID, "date")
                date_input.clear()
                date_input.send_keys(date_str)
                time.sleep(1)
            except Exception:
                print("⚠️ Could not enter date")

            # Click submit
            try:
                submit_btn = self.driver.find_element(
                    By.XPATH,
                    "//button[contains(text(), 'Search')] | //input[@type='submit']"
                )
                submit_btn.click()
                time.sleep(4)
            except Exception:
                print("⚠️ Could not click submit")

            # Look for download links
            pdf_links = self.driver.find_elements(By.XPATH, "//a[contains(@href, '.pdf')]")

            prefix = self._safe_name(complex_name)
            date_tag = date_str.replace('-', '')

            results = []
            for i, link in enumerate(pdf_links, 1):
                url = link.get_attribute("href")
                text = link.text or f"Cause_List_{i}"

                # The index keeps names unique when several links share the same text.
                filename = f"{prefix}_{self._safe_name(text)}_{i:02d}_{date_tag}.pdf"
                filepath = self.download_pdf_from_url(url, filename)

                if filepath:
                    results.append({
                        "complex": complex_name,
                        "date": date_str,
                        "name": text,
                        "file": filepath,
                        "success": True
                    })

            return results

        except Exception as e:
            print(f"❌ Interactive portal error: {str(e)[:200]}")
            return []

    def _download_page_pdfs(self, complex_name, date_str):
        """Download every ``.pdf`` link found on the complex's landing page.

        Shared fallback for ``download_all_for_date`` and
        ``download_for_complex``. Each file name carries the sanitized link
        text plus a running index, so multiple links never overwrite each other.

        Returns:
            A list of result dicts (``complex``, ``date``, ``url``, ``file``,
            ``success``), one per file actually saved.
        """
        links = self.scrape_cause_list_page(complex_name, date_str)
        pdf_links = [link for link in links if ".pdf" in link["url"].lower()]

        prefix = self._safe_name(complex_name)
        date_tag = date_str.replace('-', '')

        results = []
        for idx, link in enumerate(pdf_links, 1):
            label = self._safe_name(link["text"] or f"Cause_List_{idx}")
            filename = f"{prefix}_{label}_{idx:02d}_{date_tag}.pdf"
            filepath = self.download_pdf_from_url(link["url"], filename)
            if filepath:
                results.append({
                    "complex": complex_name,
                    "date": date_str,
                    "url": link["url"],
                    "file": filepath,
                    "success": True
                })
        return results

    def download_all_for_date(self, date_str):
        """Download cause lists from every complex for ``date_str``.

        For each complex the direct-URL method is tried first and the
        landing-page links are used as a fallback.

        Returns:
            The concatenated result dicts of all complexes.
        """
        all_results = []
        complexes = self.get_complexes()

        print(f"\n{'='*70}")
        print(f"Downloading cause lists for {date_str} from ALL Delhi courts")
        print(f"{'='*70}\n")

        for i, complex_name in enumerate(complexes, 1):
            print(f"\n[{i}/{len(complexes)}] {complex_name}")
            print("-" * 50)

            # Method 1: Direct PDF URL patterns
            results = list(self.scrape_by_date_method(complex_name, date_str))

            # Method 2: Scrape court page
            if not results:
                results.extend(self._download_page_pdfs(complex_name, date_str))

            if results:
                print(f"✅ Downloaded {len(results)} files for {complex_name}")
            else:
                print(f"⚠️ No cause lists found for {complex_name}")

            all_results.extend(results)
            time.sleep(2)  # Be respectful to the server

        return all_results

    def download_for_complex(self, complex_name, date_str):
        """Download the cause list of one complex for ``date_str``.

        Tries, in order and stopping at the first strategy that yields files:
        direct URL patterns, the interactive portal, landing-page links.

        Returns:
            A list of result dicts; empty when nothing could be downloaded.
        """
        print(f"\n{'='*70}")
        print(f"Downloading: {complex_name} - {date_str}")
        print(f"{'='*70}\n")

        # Method 1: Direct URL
        results = list(self.scrape_by_date_method(complex_name, date_str))

        # Method 2: Interactive portal
        if not results:
            results.extend(self.scrape_interactive_portal(complex_name, date_str))

        # Method 3: Page links
        if not results:
            results.extend(self._download_page_pdfs(complex_name, date_str))

        return results

# Progress marker: reached once the DelhiCourtsScraper class definition has executed.
print("✅ Delhi Courts Scraper ready!")

# ─── STEP 4: INTERACTIVE FUNCTION ───
def run_delhi_scraper():
    """Interactive entry point: prompt for a court and a date, then download.

    Starts a ``DelhiCourtsScraper``, lists the available complexes plus an
    "ALL Courts" option, validates the user's selection and date, runs the
    matching download method, shows links to the downloaded files and writes
    a JSON summary of the results under ``<scraper.dir>/json``. The browser is
    closed on every exit path.

    Returns:
        None. Invalid input prints a message and returns early.
    """
    print("\n" + "="*70)
    print("  Delhi District Courts - Cause List Scraper")
    print("="*70 + "\n")

    scraper = DelhiCourtsScraper()

    try:
        complexes = scraper.get_complexes()
        all_option = len(complexes) + 1

        print("🏛️ Available Delhi District Courts:\n")
        for i, comp in enumerate(complexes, 1):
            print(f"  {i}. {comp}")
        print(f"  {all_option}. ALL Courts (Download from all)")

        choice = input(f"\n👉 Select court (1-{all_option}): ").strip()

        # Reject non-numeric or out-of-range input with a message instead of a traceback.
        try:
            choice_num = int(choice)
        except ValueError:
            choice_num = None
        if choice_num is None or not 1 <= choice_num <= all_option:
            print("❌ Invalid selection")
            return

        # Date input
        date_input = input("\n📅 Enter date (DD-MM-YYYY) [Press Enter for today]: ").strip()
        if not date_input:
            date_input = datetime.now().strftime("%d-%m-%Y")
        else:
            try:
                parsed = datetime.strptime(date_input, "%d-%m-%Y")
            except ValueError:
                print(f"❌ Invalid date '{date_input}' (expected DD-MM-YYYY)")
                return
            # Normalize to zero-padded form; file names are derived from this string.
            date_input = parsed.strftime("%d-%m-%Y")
        print(f"✅ Date: {date_input}\n")

        if choice_num == all_option:
            # Download from all courts
            print("📥 Downloading from ALL Delhi District Courts...\n")
            results = scraper.download_all_for_date(date_input)
        else:
            # Download from specific court
            selected_complex = complexes[choice_num - 1]
            results = scraper.download_for_complex(selected_complex, date_input)

        # Display results
        print("\n" + "="*70)
        successful = [r for r in results if r.get("success")]
        print(f"✅ Download Complete! Got {len(successful)} files")
        print("="*70 + "\n")

        if successful:
            print("📄 Downloaded Files:\n")
            for result in successful:
                print(f"  • {result.get('complex', 'Unknown Court')}")
                if result.get('file'):
                    print(f"    {os.path.basename(result['file'])}")
                    display(FileLink(result['file']))
                print()
        else:
            print("⚠️ No cause lists were downloaded.")
            print("\nPossible reasons:")
            print("  • Cause lists not yet published for this date (try 16-10-2025)")
            print("  • Court website structure has changed")
            print("  • Internet connectivity issues")
            print("\nTry:")
            print("  • A different date (yesterday)")
            print("  • Checking the court website manually")

        # Save summary
        summary_file = f"{scraper.dir}/json/summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(summary_file, 'w') as f:
            json.dump(results, f, indent=2)
        print(f"\n💾 Summary saved: {os.path.basename(summary_file)}")
        display(FileLink(summary_file))

    except KeyboardInterrupt:
        print("\n\n⚠️ Cancelled by user")
    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Shutdown stays best effort, but a failure is reported rather than hidden.
        try:
            scraper.driver.quit()
            print("\n✅ Browser closed")
        except Exception as e:
            print(f"\n⚠️ Could not close browser cleanly: {e}")

# ─── STEP 5: QUICK DOWNLOAD FUNCTION ───
def quick_download_delhi(date=None, court="all"):
    """Non-interactive download helper.

    Args:
        date: Date string in DD-MM-YYYY form; today's date is used when None.
        court: ``"all"`` to fetch every complex, otherwise the name of a
            single complex passed on to ``download_for_complex``.

    Returns:
        The list of result dicts produced by the scraper.
    """
    target_date = date if date is not None else datetime.now().strftime("%d-%m-%Y")

    scraper = DelhiCourtsScraper()

    try:
        results = (
            scraper.download_all_for_date(target_date)
            if court == "all"
            else scraper.download_for_complex(court, target_date)
        )

        successful = list(filter(lambda entry: entry.get("success"), results))
        print(f"\n✅ Downloaded {len(successful)} files")

        # Show a clickable link for every entry that recorded a file path.
        for entry in successful:
            if not entry.get('file'):
                continue
            display(FileLink(entry['file']))

        return results
    finally:
        # The browser is released whether or not the download raised.
        scraper.driver.quit()

print("✅ All functions ready!")

# ─── STEP 6: RUN THE SCRAPER ───
# Launch the interactive session, then print a usage footer in one go.
print("\n🚀 Starting Delhi Courts Scraper...\n")
run_delhi_scraper()

print("\n".join([
    "\n" + "=" * 70,
    "✅ Process Complete!",
    "=" * 70,
    f"\nFiles saved to: {os.path.abspath('/content/delhi_courts_output')}",
    "\nTo run again, execute: run_delhi_scraper()",
    "For quick download: quick_download_delhi(date='16-10-2025', court='New Delhi')",
    "=" * 70,
]))

🚀 Setting up Delhi Courts Scraper...
✅ Setup complete!
✅ Libraries imported!
✅ Delhi Courts Scraper ready!
✅ All functions ready!

🚀 Starting Delhi Courts Scraper...


  Delhi District Courts - Cause List Scraper

🌐 Starting browser...


<IPython.core.display.Javascript object>

✅ Browser ready!
🏛️ Available Delhi District Courts:

  1. Central Delhi
  2. East Delhi
  3. New Delhi
  4. North Delhi
  5. North East Delhi
  6. North West Delhi
  7. Shahdara Delhi
  8. South Delhi
  9. South East Delhi
  10. South West Delhi
  11. West Delhi
  12. ALL Courts (Download from all)

👉 Select court (1-12): 1

📅 Enter date (DD-MM-YYYY) [Press Enter for today]: 16-10-2025
✅ Date: 16-10-2025


Downloading: Central Delhi - 16-10-2025

🔍 Trying: https://delhicourts.nic.in/writereaddata/Upload/CauseList/Central/16102025.pdf
🔍 Trying: https://delhicourts.nic.in/writereaddata/Upload/CauseList/Central/16-10-2025.pdf
🔍 Trying: https://delhicourts.nic.in/writereaddata/Upload/CauseList/Central/causelist_16102025.pdf
🔍 Trying: https://delhicourts.nic.in/writereaddata/Upload/CauseList/Central/CauseList_16102025.pdf
🔍 Trying: https://delhicourts.nic.in/writereaddata/Upload/CauseList/Central/16.10.2025.pdf
📍 Navigating to https://delhicourts.nic.in/
⚠️ Could not find cause list link
⚠


✅ Browser closed

✅ Process Complete!

Files saved to: /content/delhi_courts_output

To run again, execute: run_delhi_scraper()
For quick download: quick_download_delhi(date='16-10-2025', court='New Delhi')


In [1]:
!pip install requests beautifulsoup4 tqdm

