In [1]:
# pip install selenium webdriver-manager

import os, json, time, re, datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException, TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
# Landing page of the portal; the run section loads it first.
BASE_URL = "https://cases.shc.gov.pk/"
# Label written to each record's "Bench" field and to the output metadata.
COURT_NAME = "Larkana"
# Option value selected in the #casessearch-circuitcode dropdown of the search form.
COURT_VALUE = "4"
# Link text used to locate this bench's tile button on the landing page.
BENCH_TEXT  = "Circuit Court Larkana"
# JSON file that scraped cases are loaded from and saved to.
OUTPUT_FILE = "SindhCourt_Larkana.json"


def make_driver(headless=False):
    """Create a Chrome WebDriver whose driver binary is installed by webdriver-manager.

    Args:
        headless: When true, Chrome is launched with ``--headless=new``.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    from selenium.webdriver.chrome.options import Options

    chrome_flags = [
        "--start-maximized",
        "--disable-blink-features=AutomationControlled",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ]
    if headless:
        # Keep the headless switch first, ahead of the always-on flags.
        chrome_flags.insert(0, "--headless=new")

    options = Options()
    for flag in chrome_flags:
        options.add_argument(flag)

    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=options)

def W(d, t=20):
    """Return a ``WebDriverWait`` bound to driver ``d`` with a timeout of ``t`` seconds."""
    waiter = WebDriverWait(d, t)
    return waiter

# -------------------- helpers: robust bench tile click --------------------
def safe_click(driver, el):
    """Click ``el``, falling back to a JavaScript click when the native click fails.

    The element is scrolled to the centre of the viewport first. If the native
    click is intercepted, or anything else in that sequence raises, the click
    is dispatched through ``driver.execute_script`` instead.
    """
    def click_via_js():
        driver.execute_script("arguments[0].click();", el)

    try:
        driver.execute_script("arguments[0].scrollIntoView({block:'center'});", el)
        time.sleep(0.1)  # short pause after scrolling before the native click
        try:
            el.click()
        except ElementClickInterceptedException:
            click_via_js()
    except Exception:
        click_via_js()

def open_court_bench(driver):
    """Click the landing-page tile button of the bench named by ``BENCH_TEXT``.

    Waits until at least one tile button is present, then tries a list of
    locators from most to least specific and clicks the first match.

    Args:
        driver: WebDriver already showing the landing page.

    Raises:
        TimeoutException: No tile button appeared within the default wait.
        Exception: None of the locators yielded a button that could be clicked;
            the last WebDriver error is attached as the cause.
    """
    # Local import: the file header only imports two specific exception classes.
    from selenium.common.exceptions import WebDriverException

    W(driver).until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, "button.btn.btn-dark.btn-sm.btn-block.mt-2.stretched-link")
    ))
    locators = [
        (By.XPATH, f"//div[.//a[contains(.,'{BENCH_TEXT}')]]//button[contains(@class,'stretched-link')]"),
        (By.XPATH, f"//div[contains(@class,'card') or contains(@class,'col')]//a[contains(.,'{BENCH_TEXT}')]/following::button[contains(@class,'stretched-link')][1]"),
        # NOTE(review): last-resort fallback matches the first tile button on the
        # page regardless of bench text - confirm this is still wanted.
        (By.XPATH, "//button[contains(@class,'stretched-link')]"),
    ]

    last_error = None
    for by, value in locators:
        try:
            btn = driver.find_element(by, value)
            # Reuse the file's click helper (scroll into view + JS fallback).
            safe_click(driver, btn)
            return
        except WebDriverException as exc:
            # Only WebDriver failures move on to the next locator; Ctrl-C and
            # programming errors are no longer swallowed by a bare except.
            last_error = exc
    raise Exception(f"Bench button not found for: {BENCH_TEXT}") from last_error
# -------------------- search form --------------------
def set_status_all(driver):
    """Best-effort: JS-click the ``CasesSearch[isPending]`` radio whose value is ``3``.

    Any failure (radio missing, script error) is ignored so the search can
    proceed with whatever status option the form currently has selected.
    """
    radio_css = "input[type='radio'][name='CasesSearch[isPending]'][value='3']"
    try:
        radio = driver.find_element(By.CSS_SELECTOR, radio_css)
        driver.execute_script("arguments[0].click()", radio)
    except Exception:
        return

def select_karachi_and_search(driver):
    """Choose the court given by ``COURT_VALUE`` in the search form and submit it.

    Despite the function name, the selected option is whatever ``COURT_VALUE``
    holds. The status radio is set via ``set_status_all`` before submitting.
    """
    dropdown = W(driver).until(
        EC.presence_of_element_located((By.ID, "casessearch-circuitcode"))
    )
    Select(dropdown).select_by_value(COURT_VALUE)

    set_status_all(driver)

    submit_button = W(driver).until(EC.element_to_be_clickable((By.ID, "submit_search")))
    submit_button.click()

def wait_for_table(driver):
    """Block until the results grid exists and holds at least one data row."""
    row_css = "table.kv-grid-table tbody tr.crud-datatable"

    def has_rows(d):
        return len(d.find_elements(By.CSS_SELECTOR, row_css)) > 0

    W(driver).until(EC.presence_of_element_located((By.CSS_SELECTOR, "table.kv-grid-table")))
    W(driver).until(has_rows)

# -------------------- list page parsing --------------------
def parse_page(driver):
    """Convert the rows of the currently displayed results grid into record dicts.

    Header labels (minus blanks, ``#`` and ``Actions``) are paired positionally
    with the row cells that are not marked ``skip-export``. Fields whose header
    is absent, or that have no cell, are filled with ``"NA"``.

    Returns:
        A list with one dict per grid row. Each dict also carries
        ``_detail_href``: the detail-page link of the row, or ``None``.
    """
    ignored_labels = ("#", "Actions")
    label_texts = (
        th.text.strip()
        for th in driver.find_elements(By.CSS_SELECTOR, "table.kv-grid-table thead th")
    )
    headers = [label for label in label_texts if label and label not in ignored_labels]
    header_keys = [label.lower() for label in headers]

    records = []
    grid_rows = driver.find_elements(By.CSS_SELECTOR, "table.kv-grid-table tbody tr.crud-datatable")
    for row in grid_rows:
        values = [
            td.text.strip()
            for td in row.find_elements(By.CSS_SELECTOR, "td.crud-datatable")
            if "skip-export" not in (td.get_attribute("class") or "").lower()
        ]
        # Pad short rows with "NA"; zip() below drops any surplus cells.
        values += ["NA"] * (len(header_keys) - len(values))
        by_header = dict(zip(header_keys, values))

        def lookup(label):
            return by_header.get(label.lower(), "NA")

        record = {
            "CaseName":     lookup("Case Name"),
            "Caseno":       lookup("Caseno"),
            "Caseyear":     lookup("Caseyear"),
            "Bench":        COURT_NAME,
            "Circuitcode":  lookup("Circuitcode") or COURT_NAME,
            "CaseTitle":    lookup("CASE TITLE"),
            "Matter":       lookup("Matter"),
            "LastHearing":  lookup("Last Hearing"),
            "NextDate":     lookup("Next Date"),
            "DisposalDate": lookup("Disposal Date"),
            "Status":       lookup("Status"),
        }

        # Store the year as an int when a four-digit run is present.
        year_text = record["Caseyear"]
        year_match = re.search(r"\d{4}", year_text) if isinstance(year_text, str) else None
        if year_match:
            record["Caseyear"] = int(year_match.group(0))

        try:
            detail_link = row.find_element(By.CSS_SELECTOR, "td.skip-export a.btn.btn-primary")
            detail_href = detail_link.get_attribute("href")
        except Exception:
            detail_href = None
        record["_detail_href"] = detail_href

        records.append(record)

    return records

# -------------------- detail-page scrapers: profile, parties, advocates --------------------
# scrape_case_profile, scrape_parties_only and scrape_advocates_only below each open the case detail page in its own tab.
# -------------------- CASE PROFILE ONLY (detail page) --------------------
def extract_inst_disp_cell(driver):
    """Read the 'Institution / Admit Date' cell of the detail page and split it up.

    Dates of the form ``DD-MON-YY`` / ``DD-MON-YYYY`` (upper-case month) are
    taken in order of appearance as institution, disposal and consigned date.
    The first parenthesised text becomes the disposal note. Anything missing
    is reported as ``"NA"``.

    Raises:
        TimeoutException: The cell was not found within the default wait.
    """
    cell_xpath = (
        "//table[@id='w1']//table[contains(@class,'kv-child-table')]"
        "//th[contains(normalize-space(.), 'Institution / Admit Date')]/following-sibling::td[1]"
    )
    cell_text = W(driver).until(
        EC.presence_of_element_located((By.XPATH, cell_xpath))
    ).text.strip()

    found_dates = re.findall(r"\b\d{2}-[A-Z]{3}-\d{2,4}\b", cell_text)
    # Pad to exactly three slots so positional unpacking always works.
    institution, disposal, consigned = (found_dates + ["NA"] * 3)[:3]

    paren = re.search(r"\(([^)]+)\)", cell_text)
    disposal_note = "NA" if paren is None else paren.group(1).strip()

    return {
        "institution_admit_date": institution,
        "disposal_date": disposal,
        "consigned_date": consigned,
        "disposal_note": disposal_note,
    }

def extract_last_hearing_detail(driver):
    """Parse the 'Last Hearing Detail' cell of the detail page into labelled fields.

    The cell text is read line by line; for each wanted label the first line
    starting with ``<label>:`` (case-insensitive) supplies the value. Missing
    or empty values are reported as ``"NA"``.

    Raises:
        TimeoutException: The cell was not found within the default wait.
    """
    cell_xpath = (
        "//table[@id='w1']//table[contains(@class,'kv-child-table')]"
        "//th[normalize-space(.)='Last Hearing Detail']/following-sibling::td[1]"
    )
    cell = W(driver).until(EC.presence_of_element_located((By.XPATH, cell_xpath)))
    lines = [stripped for stripped in (raw.strip() for raw in cell.text.splitlines()) if stripped]

    def field(label):
        marker = label.lower() + ":"
        hit = next((ln for ln in lines if ln.lower().startswith(marker)), None)
        if hit is None:
            return "NA"
        return hit.split(":", 1)[1].strip() or "NA"

    return {
        "date":    field("Date"),
        "list":    field("List"),
        "stage":   field("Stage"),
        "bench":   field("Bench"),
        "remarks": field("Other Info"),
    }

def scrape_case_profile(driver, href):
    """Open a case detail page in a new tab and read its profile fields.

    Args:
        driver: WebDriver currently focused on the results list window.
        href: URL of the detail page; a falsy value skips the visit.

    Returns:
        ``{"profile": {...}, "last_hearing": {...}}``. Every field is ``"NA"``
        when ``href`` is falsy or any step of the visit fails.
    """
    default = {
        "profile": {"institution_admit_date":"NA","disposal_date":"NA","consigned_date":"NA","disposal_note":"NA"},
        "last_hearing": {"date":"NA","list":"NA","stage":"NA","bench":"NA","remarks":"NA"}
    }
    if not href:
        return default

    main = driver.current_window_handle
    # Snapshot the handles so the new tab is identified by difference rather
    # than by "anything that is not main" (which picks up stray windows).
    before = set(driver.window_handles)
    detail = None
    try:
        # Opening the tab is inside the try so a slow or failed open degrades
        # to the default record instead of aborting the whole run.
        driver.execute_script("window.open(arguments[0], '_blank');", href)
        W(driver).until(lambda d: set(d.window_handles) - before)
        detail = next(h for h in driver.window_handles if h not in before)
        driver.switch_to.window(detail)

        profile = extract_inst_disp_cell(driver)
        last    = extract_last_hearing_detail(driver)
        return {"profile": profile, "last_hearing": last}
    except Exception as exc:
        print(f"profile scrape failed for {href}: {exc.__class__.__name__}")
        return default
    finally:
        # Close only when focus really is on the detail tab; otherwise
        # driver.close() would close the results list window.
        if detail is not None and driver.current_window_handle == detail:
            driver.close()
        driver.switch_to.window(main)
# -------------------- PARTIES ONLY (open/close in its own pass) --------------------
def click_tab_and_wait_pane(driver, tab_text):
    """Activate the nav tab labelled ``tab_text`` and wait for its pane to fill.

    The pane is looked up by the tab's ``aria-controls`` attribute, else by the
    fragment of its ``href``, else as the active child of the tab content
    container. It counts as ready once it is active and shows either grid rows
    or an "empty" placeholder.

    Returns:
        The pane WebElement.

    Raises:
        TimeoutException: The tab, the pane or its content did not become
            ready within 12 seconds.
    """
    tab_xpath = f"//ul[contains(@class,'nav-tabs')]//a[normalize-space()='{tab_text}']"
    tab_link = W(driver, 12).until(EC.element_to_be_clickable((By.XPATH, tab_xpath)))

    # Read the target before clicking, while the link is known to be present.
    link_href = tab_link.get_attribute("href") or ""
    target_id = tab_link.get_attribute("aria-controls")
    if not target_id:
        target_id = link_href.split("#", 1)[1] if "#" in link_href else None

    try:
        tab_link.click()
    except Exception:
        driver.execute_script("arguments[0].click();", tab_link)

    if target_id:
        pane_locator = (By.ID, target_id)
    else:
        pane_locator = (By.XPATH, "//div[@class='tab-content']//div[contains(@class,'active')]")
    pane = W(driver, 12).until(EC.presence_of_element_located(pane_locator))

    content_selectors = ("table.kv-grid-table tbody tr.crud-datatable", "div.empty")

    def pane_ready(_):
        if "active" not in (pane.get_attribute("class") or ""):
            return False
        return any(pane.find_elements(By.CSS_SELECTOR, css) for css in content_selectors)

    W(driver, 12).until(pane_ready)
    return pane

def extract_parties_from_pane(pane):
    """Collect ``{"party_no", "name"}`` dicts from the grid rows inside ``pane``.

    Cells are located by ``data-col-seq`` 0 and 1, falling back to the first
    two ``td`` elements of the row. Rows with fewer than two cells are skipped
    and blank cell text becomes ``"NA"``.
    """
    parties = []
    for row in pane.find_elements(By.CSS_SELECTOR, "table.kv-grid-table tbody tr.crud-datatable"):
        try:
            number_cell = row.find_element(By.CSS_SELECTOR, "td[data-col-seq='0']")
            name_cell = row.find_element(By.CSS_SELECTOR, "td[data-col-seq='1']")
        except Exception:
            cells = row.find_elements(By.CSS_SELECTOR, "td")
            if len(cells) < 2:
                continue
            number_cell, name_cell = cells[0], cells[1]

        party_no = (number_cell.text or "").strip()
        party_name = (name_cell.text or "").strip()
        parties.append({
            "party_no": party_no or "NA",
            "name":     party_name or "NA",
        })
    return parties

def scrape_parties_only(driver, href):
    """Open a case detail page in a new tab and read its 'Parties Details' tab.

    Args:
        driver: WebDriver currently focused on the results list window.
        href: URL of the detail page; a falsy value skips the visit.

    Returns:
        A list of ``{"party_no", "name"}`` dicts; empty when ``href`` is falsy
        or any step of the visit fails.
    """
    if not href:
        return []

    main = driver.current_window_handle
    # Snapshot the handles so the new tab is identified by difference rather
    # than by "anything that is not main" (which picks up stray windows).
    before = set(driver.window_handles)
    detail = None
    try:
        # Opening the tab is inside the try so a slow or failed open degrades
        # to an empty result instead of aborting the whole run.
        driver.execute_script("window.open(arguments[0], '_blank');", href)
        W(driver).until(lambda d: set(d.window_handles) - before)
        detail = next(h for h in driver.window_handles if h not in before)
        driver.switch_to.window(detail)

        pane = click_tab_and_wait_pane(driver, "Parties Details")
        return extract_parties_from_pane(pane)
    except Exception as exc:
        print(f"parties scrape failed for {href}: {exc.__class__.__name__}")
        return []
    finally:
        # Close only when focus really is on the detail tab; otherwise
        # driver.close() would close the results list window.
        if detail is not None and driver.current_window_handle == detail:
            driver.close()
        driver.switch_to.window(main)
# -------------------- ADVOCATES ONLY (open/close in its own pass) --------------------
def parse_adv_rows(rows):
    """Turn advocate grid rows into ``{"name", "ledger_no", "entry_date"}`` dicts.

    Rows with fewer than two cells (e.g. a single-cell 'No results found.'
    row) are skipped. A trailing ``(ADVO-...)`` marker is split off the name
    into ``ledger_no``. Blank or ``(not set)`` dates become ``"NA"``.
    """
    advocates = []
    for row in rows:
        cells = row.find_elements(By.TAG_NAME, "td")
        if len(cells) < 2:
            continue

        name_text = (cells[0].text or "").strip()
        date_text = (cells[1].text or "").strip()

        ledger_match = re.search(r"\((ADVO-[^)]+)\)", name_text, flags=re.I)
        ledger_no = "NA" if ledger_match is None else ledger_match.group(1)
        bare_name = re.sub(r"\s*\(ADVO-[^)]+\)\s*$", "", name_text, flags=re.I).strip()

        if not date_text or date_text.lower() == "(not set)":
            entry_date = "NA"
        else:
            entry_date = date_text

        advocates.append({
            "name": bare_name or "NA",
            "ledger_no": ledger_no,
            "entry_date": entry_date,
        })
    return advocates

def extract_advocates_from_pane(pane):
    """Read the applicant and respondent advocate grids found inside ``pane``.

    Returns:
        ``{"applicant": [...], "respondent": [...]}``, each list produced by
        ``parse_adv_rows``.
    """
    def rows_under(panel_heading):
        return pane.find_elements(
            By.XPATH,
            f".//div[contains(@class,'panel')][contains(.,'{panel_heading}')]"
            "//table[contains(@class,'kv-grid-table')]//tbody//tr",
        )

    # Fetch both row sets before parsing either one.
    for_applicant = rows_under("Advocate for Applicant")
    for_respondent = rows_under("Advocate for Respondent")

    result = {}
    result["applicant"] = parse_adv_rows(for_applicant)
    result["respondent"] = parse_adv_rows(for_respondent)
    return result

def scrape_advocates_only(driver, href):
    """Open a case detail page in a new tab and read its 'Case Advocates' tab.

    Args:
        driver: WebDriver currently focused on the results list window.
        href: URL of the detail page; a falsy value skips the visit.

    Returns:
        ``{"applicant": [...], "respondent": [...]}``; both lists are empty
        when ``href`` is falsy or any step of the visit fails.
    """
    if not href:
        return {"applicant": [], "respondent": []}

    main = driver.current_window_handle
    # Snapshot the handles so the new tab is identified by difference rather
    # than by "anything that is not main" (which picks up stray windows).
    before = set(driver.window_handles)
    detail = None
    try:
        # Opening the tab is inside the try so a slow or failed open degrades
        # to an empty result instead of aborting the whole run.
        driver.execute_script("window.open(arguments[0], '_blank');", href)
        W(driver).until(lambda d: set(d.window_handles) - before)
        detail = next(h for h in driver.window_handles if h not in before)
        driver.switch_to.window(detail)

        pane = click_tab_and_wait_pane(driver, "Case Advocates")
        return extract_advocates_from_pane(pane)
    except Exception as exc:
        print(f"advocates scrape failed for {href}: {exc.__class__.__name__}")
        return {"applicant": [], "respondent": []}
    finally:
        # Close only when focus really is on the detail tab; otherwise
        # driver.close() would close the results list window.
        if detail is not None and driver.current_window_handle == detail:
            driver.close()
        driver.switch_to.window(main)

# -------------------- pagination --------------------
def go_next_page(driver, current_page_idx):
    """Advance the results grid to the next page, if there is one.

    This function was previously defined twice with identical bodies; this is
    the single remaining definition.

    Args:
        driver: WebDriver showing the results list.
        current_page_idx: Number of the page currently shown, compared against
            the text of the pager's active item to detect the page change.

    Returns:
        ``False`` when there is no pager, no ``li.next`` item, the item is
        disabled or it has no link; ``True`` once the grid is seen to change.

    Raises:
        TimeoutException: No change was detected within 20 seconds of the click.
    """
    first_cell_css = "table.kv-grid-table tbody tr.crud-datatable td.crud-datatable"

    pagers = driver.find_elements(By.CSS_SELECTOR, "ul.pagination")
    if not pagers:
        return False
    next_items = pagers[0].find_elements(By.CSS_SELECTOR, "li.next")
    if not next_items:
        return False
    next_li = next_items[0]
    cls = (next_li.get_attribute("class") or "").lower()
    link = next_li.find_elements(By.CSS_SELECTOR, "a")
    if "disabled" in cls or not link:
        return False

    # Remember the first cell so its replacement can signal a page change.
    try:
        first_before = driver.find_element(By.CSS_SELECTOR, first_cell_css).text
    except Exception:
        first_before = ""

    driver.execute_script("arguments[0].scrollIntoView({block:'center'});", link[0])
    driver.execute_script("arguments[0].click();", link[0])

    def changed(d):
        # Signal 1: the first grid cell now shows different text.
        try:
            if d.find_element(By.CSS_SELECTOR, first_cell_css).text != first_before:
                return True
        except Exception:
            pass
        # Signal 2: the pager's active item no longer shows the old page number
        # (covers pages whose first cell happens to hold the same text).
        try:
            actives = d.find_elements(By.CSS_SELECTOR, "ul.pagination li.active")
            if actives and actives[0].text.strip() != str(current_page_idx):
                return True
        except Exception:
            pass
        return False

    W(driver, 20).until(changed)
    time.sleep(0.3)  # brief settle time after the change is detected
    return True

# -------------------- JSON writer --------------------
def init_or_load_payload(path=None):
    """Load the saved payload from disk, or build a fresh one.

    Args:
        path: JSON file to load; defaults to ``OUTPUT_FILE``.

    Returns:
        The parsed payload when the file exists and is a dict whose ``"cases"``
        entry is a list; otherwise a new payload with metadata and no cases.

    An existing file that cannot be read or has the wrong structure is no
    longer discarded silently: a warning is printed and the file is moved
    aside, so the next save does not overwrite the only copy of earlier data.
    """
    path = OUTPUT_FILE if path is None else path

    if os.path.exists(path):
        problem = None
        try:
            with open(path, "r", encoding="utf-8") as f:
                payload = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            problem = f"{exc.__class__.__name__}: {exc}"
        else:
            if isinstance(payload, dict) and isinstance(payload.get("cases"), list):
                return payload
            problem = "unexpected structure (expected a dict with a 'cases' list)"

        backup = f"{path}.{datetime.datetime.now():%Y%m%d%H%M%S}.bak"
        try:
            os.replace(path, backup)
            print(f"warning: could not use {path} ({problem}); moved it to {backup}")
        except OSError:
            print(f"warning: could not use {path} ({problem}); starting a new payload")

    return {
        "metadata": {
            "file_name": os.path.basename(path),
            "created_on": datetime.date.today().strftime("%Y-%m-%d"),
            "bench": COURT_NAME,
            "source": "Sindh High Court Case Search Portal",
            "url": BASE_URL,
            "description": "Sindh High Court case metadata (Case Profile + Parties + Advocates)."
        },
        "cases": []
    }

def save_payload(payload, path=None):
    """Write ``payload`` as UTF-8 JSON, replacing the target file atomically.

    Args:
        payload: JSON-serialisable object to store.
        path: Destination file; defaults to ``OUTPUT_FILE``.

    The data is written to ``<path>.tmp`` and then moved over the destination
    with ``os.replace``. This function runs after every scraped case, so an
    interruption or serialisation error must not leave a truncated file behind.

    Raises:
        TypeError / ValueError: ``payload`` is not JSON-serialisable.
        OSError: The file could not be written or replaced.
    """
    path = OUTPUT_FILE if path is None else path
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file is gone; after a failure
        # the partial temp file is removed and the old output stays intact.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# -------------------- RUN --------------------
# Scrape every results page for the configured bench, saving after each case.
driver = make_driver(headless=False)  # set True if you prefer headless
try:
    driver.get(BASE_URL)

    open_court_bench(driver)
    select_karachi_and_search(driver)
    wait_for_table(driver)

    payload = init_or_load_payload()
    page = 1
    srno = len(payload["cases"]) + 1   # continue numbering after previously saved cases

    def _case_key(case):
        # NOTE(review): assumes (CaseName, Caseno, Caseyear) identifies a case
        # within this bench - confirm against the portal's data.
        return (case.get("CaseName"), case.get("Caseno"), case.get("Caseyear"))

    # A re-run rescans from page 1, so cases already in the file are skipped
    # instead of being scraped and appended a second time.
    already_saved = {_case_key(c) for c in payload["cases"] if isinstance(c, dict)}

    while True:
        rows = parse_page(driver)

        for rec in rows:
            href = rec.pop("_detail_href", None)
            key = _case_key(rec)
            if key in already_saved:
                continue

            # 1) Profile visit
            prof = scrape_case_profile(driver, href)

            # 2) Parties visit
            parties = scrape_parties_only(driver, href)

            # 3) Advocates visit
            advocates = scrape_advocates_only(driver, href)

            rec["SrNo"] = srno
            rec["Details"] = {**prof, "parties": parties, "advocates": advocates}

            payload["cases"].append(rec)
            already_saved.add(key)
            save_payload(payload)
            print(f"saved case {srno}: {rec.get('CaseName','')} / {rec.get('Caseno','')}")

            srno += 1   # increment after each saved case

        if not go_next_page(driver, page):
            break
        page += 1
finally:
    # Always release the browser, including when a wait times out mid-run.
    driver.quit()

print(f"Done. Total cases saved: {len(payload['cases'])} -> {OUTPUT_FILE}")


saved case 1: Civil Revision / 101
saved case 2: Civil Revision / 102
saved case 3: Civil Revision / 103
saved case 4: Civil Revision / 104
saved case 5: Civil Revision / 105
saved case 6: Civil Revision / 106
saved case 7: Civil Revision / 108
saved case 8: Civil Revision / 109
saved case 9: Civil Revision / 110
saved case 10: Civil Revision / 111
saved case 11: Civil Revision / 112
saved case 12: Civil Revision / 113
saved case 13: Civil Revision / 114
saved case 14: Civil Revision / 115
saved case 15: Civil Revision / 116
saved case 16: Civil Revision / 117
saved case 17: Civil Revision / 118
saved case 18: Civil Revision / 119
saved case 19: Civil Revision / 120
saved case 20: Civil Revision / 121
saved case 21: Civil Revision / 122
saved case 22: Civil Revision / 123
saved case 23: Civil Revision / 124
saved case 24: Civil Revision / 46
saved case 25: Civil Revision / 50
saved case 26: Civil Revision / 6
saved case 27: Civil Revision / 18
saved case 28: Civil Revision / 26
saved c

TimeoutException: Message: 
