In [None]:
# Environment check: print the installed package metadata for the browser
# automation dependencies. Only `selenium` is imported in the cells below.
%pip show webdriver-manager
%pip show selenium

In [53]:
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [54]:
def fetch_misc_appln(driver, case_data):
    """Store the rows of the "Misc Appln (CMA)" table in ``case_data``.

    Reads every ``<tr>`` of the ``kv-grid-table`` located inside the
    ``w3-tab4`` div and keeps the text of the first five ``<td>`` cells.

    Args:
        driver: Object exposing ``find_elements(by, value)`` (a Selenium
            WebDriver in this notebook).
        case_data: Dict with a ``"details"`` dict; its ``"misc_appln_cma"``
            entry is replaced with the list of extracted rows.

    Returns:
        None. The result is written into ``case_data`` in place.
    """
    row_xpath = ".//div[@id='w3-tab4']//table[contains(@class,'kv-grid-table')]/tbody/tr"
    field_names = (
        "sr_number",
        "application_no",
        "application_year",
        "cma_type_name",
        "applicant_name",
    )

    applications = []
    for table_row in driver.find_elements(By.XPATH, row_xpath):
        tds = table_row.find_elements(By.TAG_NAME, "td")

        # The grid renders a single-cell placeholder row when it is empty.
        if len(tds) == 1 and "No results found" in tds[0].text:
            continue

        # Rows with fewer than five cells are not data rows.
        if len(tds) < 5:
            continue

        # zip() stops after the five named columns; extra cells are ignored.
        applications.append({name: td.text for name, td in zip(field_names, tds)})

    case_data["details"]["misc_appln_cma"] = applications


In [55]:
def fetch_connected_cases(driver, case_data):
    """Store the rows of the "Connected Cases" table in ``case_data``.

    Reads every ``<tr>`` of the ``kv-grid-table`` located inside the
    ``w3-tab1`` div and keeps the text of the first five ``<td>`` cells.

    Args:
        driver: Object exposing ``find_elements(by, value)`` (a Selenium
            WebDriver in this notebook).
        case_data: Dict with a ``"details"`` dict; its ``"connected_cases"``
            entry is replaced with the list of extracted rows.

    Returns:
        None. The result is written into ``case_data`` in place.
    """
    row_xpath = ".//div[@id='w3-tab1']//table[contains(@class,'kv-grid-table')]/tbody/tr"

    connected = []
    for table_row in driver.find_elements(By.XPATH, row_xpath):
        tds = table_row.find_elements(By.TAG_NAME, "td")
        cell_count = len(tds)

        # The grid renders a single-cell placeholder row when it is empty.
        is_placeholder = cell_count == 1 and "No results found" in tds[0].text
        if is_placeholder or cell_count < 5:
            continue

        # Only the first five columns are captured; any extras are ignored.
        sr_number, case_no, case_title, last_hearing, next_date = tds[:5]
        connected.append(
            {
                "sr_number": sr_number.text,
                "case_no": case_no.text,
                "case_title": case_title.text,
                "last_hearing": last_hearing.text,
                "next_date": next_date.text,
            }
        )

    case_data["details"]["connected_cases"] = connected



In [56]:
def fetch_case_history(driver, case_data):
    """Store the rows of the case-history table in ``case_data``.

    Every ``<tr>`` with more than one ``<td>`` becomes one history entry.
    Columns 0-3 and 6-8 are captured; columns 4 and 5 are not read. A column
    that is missing from a short row is recorded as ``""`` instead of raising
    ``IndexError`` (previously a 2- or 3-cell row aborted the whole function).

    Args:
        driver: Object exposing ``find_elements(by, value)`` (a Selenium
            WebDriver in this notebook).
        case_data: Dict with a ``"details"`` dict; its ``"case_history"``
            entry is replaced with the list of extracted rows.

    Returns:
        None. The result is written into ``case_data`` in place.
    """
    # NOTE(review): unlike the other tab readers this XPath is not scoped to a
    # tab container, so it matches every table carrying exactly this class
    # string - confirm that only the history grid does.
    table_row_path = "//table[@class='kv-grid-table table table-bordered table-striped table-condensed']/tbody/tr"

    # Output key -> index of the <td> holding its value.
    columns = (
        ("date", 0),
        ("list", 1),
        ("sr_number", 2),
        ("cma", 3),
        ("stage", 6),
        ("bench", 7),
        ("status", 8),
    )

    history = []
    for row in driver.find_elements(By.XPATH, table_row_path):
        cells = row.find_elements(By.TAG_NAME, "td")

        # Single-cell rows (e.g. an empty-grid placeholder) and rows without
        # any <td> carry no history data.
        if len(cells) <= 1:
            continue

        cell_count = len(cells)
        history.append(
            {
                key: (cells[index].text if index < cell_count else "")
                for key, index in columns
            }
        )

    case_data["details"]["case_history"] = history

In [57]:

def fetch_parties_details(driver, case_data):
    """Store the rows of the "Parties Details" table in ``case_data``.

    Reads every ``<tr>`` of the ``kv-grid-table`` located inside the
    ``w3-tab3`` div and keeps the text of the first two ``<td>`` cells.

    The result is written under ``"parties_details"``, the key that
    ``per_page_extraction`` pre-creates for each case. The previous key,
    ``"Parties Details"``, left that pre-created list permanently empty and
    added a second, differently named entry next to it.

    Args:
        driver: Object exposing ``find_elements(by, value)`` (a Selenium
            WebDriver in this notebook).
        case_data: Dict with a ``"details"`` dict; its ``"parties_details"``
            entry is replaced with the list of extracted rows.

    Returns:
        None. The result is written into ``case_data`` in place.
    """
    table_row_path = ".//div[@id='w3-tab3']//table[contains(@class,'kv-grid-table')]/tbody/tr"
    rows = driver.find_elements(By.XPATH, table_row_path)

    parties = []
    for row in rows:
        cells = row.find_elements(By.TAG_NAME, "td")

        # A data row needs both the serial number and the party name. This
        # also skips the single-cell "No results found" placeholder and rows
        # without any <td>, which used to raise IndexError.
        if len(cells) < 2:
            continue

        parties.append(
            {
                "sr_number": cells[0].text,
                "party_name": cells[1].text,
            }
        )

    case_data["details"]["parties_details"] = parties

In [None]:
def per_page_extraction(driver, data):
    """Extract every case listed on the current results page.

    For each table row with at least 12 ``<td>`` cells the function records
    the summary columns, clicks the row's ``btn`` element, waits for an
    element with class ``printable``, visits each detail tab and lets the
    matching ``fetch_*`` helper fill ``case_data["details"]``, appends the
    case to ``data["cases"]`` and finally calls ``driver.back()``.

    A failure inside one detail tab is reported and skipped so that the
    remaining tabs and rows are still processed. Failures outside the tab
    loop (opening the detail view, navigating back) propagate to the caller.

    Args:
        driver: Selenium WebDriver positioned on a results page.
        data: Dict with a ``"cases"`` list that receives one dict per case.

    Returns:
        None. ``data["cases"]`` is extended in place.
    """
    table_rows_path = "//table/tbody/tr"
    path_to_subsections = "//div[@class='tabs-x  tabs-above tab-align-left tabs-krajee']/ul/li/a"

    # Tab label -> function that reads that tab's table into case_data.
    section_handlers = {
        "Case History": fetch_case_history,
        "Connected Cases": fetch_connected_cases,
        "Misc Appln (CMA)": fetch_misc_appln,
        "Parties Details": fetch_parties_details,
    }

    row_count = len(driver.find_elements(By.XPATH, table_rows_path))

    for row_idx in range(row_count):
        # Re-fetch rows every iteration (to avoid stale element references
        # after navigating away and back).
        table_rows = driver.find_elements(By.XPATH, table_rows_path)
        if row_idx >= len(table_rows):
            # The page now holds fewer rows than when the loop started;
            # indexing further would raise IndexError.
            print(
                f"Row {row_idx} no longer present "
                f"({len(table_rows)} of {row_count} rows left); stopping page."
            )
            break
        row = table_rows[row_idx]

        cells = row.find_elements(By.TAG_NAME, "td")
        if len(cells) < 12:
            continue

        # Key spellings ("case_catagory", "curcuit_code") are part of the
        # emitted JSON schema and are intentionally left unchanged.
        case_data = {
            "serial_number": cells[0].text,
            "case_catagory": cells[1].text,
            "case_number": cells[2].text,
            "case_year": cells[3].text,
            "bench": cells[4].text,
            "curcuit_code": cells[5].text,
            "case_title": cells[6].text,
            "summary/tag": cells[7].text,
            "last_hearing": cells[8].text,
            "next_date": cells[9].text,
            "disposal_date": cells[10].text,
            "status": cells[11].text,
            "details": {
                "connected_cases": [],
                "misc_appln_cma": [],
                "parties_details": [],
                "case_history": []
            }
        }

        action_btn = row.find_element(By.CLASS_NAME, "btn")
        action_btn.click()

        # Wait for the detail view.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "printable"))
        )

        for section, handler in section_handlers.items():
            try:
                # Always re-locate the tab element fresh (avoid stale element reference).
                subsection = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.XPATH, f"{path_to_subsections}[normalize-space(text())='{section}']"))
                )
                subsection.click()

                # NOTE(review): this matches any kv-grid-table row on the
                # page, not specifically the tab that was just clicked.
                WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//table[contains(@class,'kv-grid-table')]/tbody/tr")
                    )
                )

                handler(driver, case_data)

            except Exception as exc:
                # Best effort: a missing or broken tab must not lose the case,
                # but the failure should be visible in the notebook output.
                print(
                    f"Skipping section '{section}' for case "
                    f"{case_data['case_number']}/{case_data['case_year']}: "
                    f"{type(exc).__name__}"
                )

        data['cases'].append(case_data)
        driver.back()
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, table_rows_path))
        )



In [None]:
# Maximum number of result pages scraped per circuit, first page included.
MAX_PAGES_PER_CIRCUIT = 4

driver = webdriver.Chrome()
try:
    driver.get("https://cases.shc.gov.pk/")
    # NOTE(review): Selenium's documentation advises against combining an
    # implicit wait with the explicit WebDriverWait calls used below; kept
    # because the element lookups in this cell currently rely on it.
    driver.implicitly_wait(10)

    # Find all the district links FIRST and store their data, because the
    # elements become stale as soon as the browser navigates away.
    path = '//div[@class="row mt-4 mb-5"]/div[@class="col-md-2 mb-3"]/a'
    items = driver.find_elements(By.XPATH, path)
    items = items[:-1]  # the last link is deliberately excluded
    mrpkh_url = "https://cases.shc.gov.pk/mpkhas"
    print(f"Total items found: {len(items)}")

    district_data = []
    for item in items:
        url = item.get_attribute('href')
        try:
            name = item.find_element(By.XPATH, './/div[@class="card-header"]').text
        except Exception as exc:
            # Links without a card header are not district cards.
            print(f"Skipping link without card header ({url}): {type(exc).__name__}")
            continue
        district_data.append({'url': url, 'name': name})

    # The first district's link is replaced with a fixed URL.
    if district_data:
        district_data[0]["url"] = mrpkh_url

    print("District URLs found:")
    for entry in district_data:
        print(f"{entry['name'].split(' ')[-1]}: {entry['url']}")

    # Now process each district.
    for district in district_data:
        district["name"] = district["name"].split(' ')[-1]
        filename = f'SindhCount_{district["name"]}.json'
        district_json = {
            "metadata": {
                "filename": filename,
                "url": district['url'],
            },
            "districts_list": {
                "sub_catagory": {
                    "name": district['name'],
                    "circuits": []
                }
            }
        }
        print(f"\n=== Processing {district['name']} ===")

        driver.get(district['url'])

        try:
            # Wait for the circuit dropdown to load on the district page.
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.ID, "casessearch-circuitcode"))
            )

            options = driver.find_elements(By.XPATH, '//select[@id="casessearch-circuitcode"]/option')
            options = options[1:]  # Remove first placeholder option

            print(f"Number of court options found: {len(options)}")

            # Read values and labels up front; the option elements go stale
            # after the first search navigates away.
            values = [option.get_attribute('value') for option in options]
            circuit_names = [option.text for option in options]
            print("Court values are:", values)

            for value, circuit_name in zip(values, circuit_names):
                try:
                    print(f"Processing court value: {value}")

                    # Pass the value as a script argument instead of splicing
                    # it into the JavaScript source, so quotes in the value
                    # cannot break or alter the script.
                    driver.execute_script(
                        "document.getElementById('casessearch-circuitcode').value = arguments[0];",
                        value,
                    )

                    search_button = WebDriverWait(driver, 30).until(
                        EC.element_to_be_clickable((By.ID, "submit_search"))
                    )
                    search_button.click()

                    # Wait for results page.
                    WebDriverWait(driver, 30).until(
                        EC.presence_of_element_located((By.XPATH, "//table"))
                    )

                    circuit_data = {
                        "circuit_code": value,
                        "circuit_name": circuit_name,
                        "cases": []
                    }

                    # First page. A failure here propagates to the handler
                    # below, which reloads the district page; the circuit is
                    # only recorded once its first page was read successfully.
                    per_page_extraction(driver, circuit_data)
                    district_json['districts_list']['sub_catagory']['circuits'].append(circuit_data)
                    pages_scraped = 1

                    # Follow the pagination until the limit or the last page.
                    while True:
                        try:
                            if pages_scraped >= MAX_PAGES_PER_CIRCUIT:
                                print(f"Reached the page limit ({MAX_PAGES_PER_CIRCUIT}). Stopping pagination.")
                                break

                            next_buttons = driver.find_elements(By.CSS_SELECTOR, "li.next a")
                            if not next_buttons:
                                print("No next page button found. Stopping pagination.")
                                break

                            next_button = next_buttons[0]

                            # The pager marks the last page by disabling the parent <li>.
                            parent_li = next_button.find_element(By.XPATH, "./..")
                            if "disabled" in parent_li.get_attribute("class"):
                                print("Reached the last page.")
                                break

                            next_button.click()
                            WebDriverWait(driver, 10).until(
                                EC.presence_of_element_located((By.XPATH, "//table/tbody/tr"))
                            )

                            per_page_extraction(driver, circuit_data)
                            pages_scraped += 1

                        except Exception as exc:
                            # Keep what was collected so far for this circuit.
                            print(
                                f"Pagination stopped after {pages_scraped} page(s) "
                                f"for court value {value}: {type(exc).__name__}"
                            )
                            break

                    # Save after every circuit so partial progress survives a crash.
                    with open(filename, 'w', encoding='utf-8') as f:
                        json.dump(district_json, f, indent=2)

                    # Go back to the district search page.
                    driver.back()
                    WebDriverWait(driver, 30).until(
                        EC.presence_of_element_located((By.ID, "casessearch-circuitcode"))
                    )
                    print("Returned to district search page")

                except Exception as exc:
                    print(f"Court value {value} failed: {type(exc).__name__}; reloading district page.")
                    # Try to recover by reloading the district page so the
                    # next circuit starts from a known state.
                    try:
                        driver.get(district['url'])
                        WebDriverWait(driver, 30).until(
                            EC.presence_of_element_located((By.ID, "casessearch-circuitcode"))
                        )
                    except Exception as recover_exc:
                        print(f"Could not reload {district['url']}: {type(recover_exc).__name__}")

        except Exception as exc:
            print(f"District {district['name']} skipped: {type(exc).__name__}")
            continue
finally:
    # Always release the browser, even after an unexpected error or interrupt.
    driver.quit()