In [None]:
!apt update
!apt install chromium-chromedriver
!pip install selenium

In [None]:
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && apt install ./google-chrome-stable_current_amd64.deb


In [39]:
import os
import re
import subprocess
import requests

# The deb files we need to install
deb_files_startstwith = [
    "chromium-codecs-ffmpeg-extra_",
    "chromium-codecs-ffmpeg_",
    "chromium-browser_",
    "chromium-chromedriver_"
]

def get_latest_version() -> str:
    # A request to security.ubuntu.com for getting latest version of chromium-browser
    # e.g. "112.0.5615.49-0ubuntu0.18.04.1_amd64.deb"
    url = "http://security.ubuntu.com/ubuntu/pool/universe/c/chromium-browser/"
    r = requests.get(url)
    if r.status_code != 200:
        raise Exception("status_code code not 200!")
    text = r.text

    # Find latest version
    pattern = '<a\shref="chromium\-browser_([^"]+.ubuntu0\.18\.04\.1_amd64\.deb)'
    latest_version_search = re.search(pattern, text)
    if latest_version_search:
        latest_version = latest_version_search.group(1)
    else:
        raise Exception("Can not find latest version!")
    return latest_version

def download(latest_version: str, quiet: bool):
    deb_files = []
    for deb_file in deb_files_startstwith:
        deb_files.append(deb_file + latest_version)

    for deb_file in deb_files:
        url = f"http://security.ubuntu.com/ubuntu/pool/universe/c/chromium-browser/{deb_file}"

        # Download deb file
        if quiet:
            command = f"wget -q -O /content/{deb_file} {url}"
        else:
            command = f"wget -O /content/{deb_file} {url}"
        print(f"Downloading: {deb_file}")
        # os.system(command)
        !$command

        # Install deb file
        if quiet:
            command = f"apt-get install /content/{deb_file} >> apt.log"
        else:
            command = f"apt-get install /content/{deb_file}"
        print(f"Installing: {deb_file}\n")
        # os.system(command)
        !$command

        # Delete deb file from disk
        os.remove(f"/content/{deb_file}")

def check_chromium_installation():
    try:
        subprocess.call(["chromium-browser"])
        print("Chromium installation successfull.")
    except FileNotFoundError:
        print("Chromium Installation Failed!")

def install_selenium_package(quiet: bool):
    if quiet:
        !pip install selenium -qq >> pip.log
    else:
        !pip install selenium

def main(quiet: bool):
    # Get the latest version of chromium-browser for ubuntu 18.04
    latest_version = get_latest_version()
    # Download and install chromium-browser for ubuntu 20.04
    download(latest_version, quiet)
    # Check if installation succesfull
    check_chromium_installation()
    # Finally install selenium package
    install_selenium_package(quiet)

if __name__ == '__main__':
    quiet = True # verboseness of wget and apt
    main(quiet)

Downloading: chromium-codecs-ffmpeg-extra_112.0.5615.49-0ubuntu0.18.04.1_amd64.deb
Installing: chromium-codecs-ffmpeg-extra_112.0.5615.49-0ubuntu0.18.04.1_amd64.deb

Downloading: chromium-codecs-ffmpeg_112.0.5615.49-0ubuntu0.18.04.1_amd64.deb
Installing: chromium-codecs-ffmpeg_112.0.5615.49-0ubuntu0.18.04.1_amd64.deb

Downloading: chromium-browser_112.0.5615.49-0ubuntu0.18.04.1_amd64.deb
Installing: chromium-browser_112.0.5615.49-0ubuntu0.18.04.1_amd64.deb

Downloading: chromium-chromedriver_112.0.5615.49-0ubuntu0.18.04.1_amd64.deb
Installing: chromium-chromedriver_112.0.5615.49-0ubuntu0.18.04.1_amd64.deb

Chromium installation successfull.


In [47]:
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
wd = webdriver.Chrome('chromedriver', options=chrome_options)


In [49]:
# Import the necessary libraries
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import StaleElementReferenceException

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Define the base URL and the number of pages to search
base_url = "http://kenyalaw.org/caselaw/cases/advanced_search/page/11"
num_pages = 10

download_directory = r'G:\\Downloads New'
options = chrome_options
options.add_experimental_option('prefs', {'download.default_directory': download_directory})
driver = wd


# Loop through each page of search results
for page_num in range(1, num_pages + 1):
    # Construct the URL for the current search results page
    url = base_url + str(page_num) + "/"
    
    # Navigate to the specified URL
    driver.get(url)
    
    read_more_buttons = driver.find_elements(By.XPATH, "//a[@class='show-more pull-right']")

    
    # Click each "Read More" button to navigate to the case details page
    for button in read_more_buttons:
        try:
            driver.execute_script("arguments[0].click();", button)

            download_button = WebDriverWait(driver, 40).until(
                EC.element_to_be_clickable((By.XPATH, "//a[@title='Download Original Document']"))
            )
        

        
        # Click the download button to initiate a download
            driver.execute_script("arguments[0].click();", download_button)
        
        # Navigate back to the search results page
            driver.back()
        except StaleElementReferenceException:
            continue

# Close the webdriver
driver.quit()






MaxRetryError: ignored

In [50]:
import os
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Set the base URL for the search results page
base_url = "http://kenyalaw.org/caselaw/cases/advanced_search/page/"
num_pages = 10

# Set the download directory to the default downloads folder
download_directory = os.path.expanduser("~") + r"G:\\Downloads New"

# Configure ChromeOptions to set the download directory

options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--start-maximized")
options.add_argument("--disable-infobars")
options.add_argument("--disable-extensions")
options.add_argument("--incognito")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option("prefs", {
    "download.default_directory": download_directory,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True
})

# Create a new instance of the webdriver with the configured options
driver = webdriver.Chrome(options=options)

# Loop through each page of search results
for page_num in range(1, num_pages + 1):
    # Construct the URL for the current search results page
    url = base_url + str(page_num) + "/"

    # Navigate to the specified URL
    driver.get(url)

    # Find the "Read More" buttons
    read_more_buttons = driver.find_elements(By.XPATH, "//a[@class='show-more pull-right']")

    # Click each "Read More" button to navigate to the case details page
    for button in read_more_buttons:
        try:
            driver.execute_script("arguments[0].click();", button)

            # Wait for the download button to become clickable
            download_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//a[@title='Download Original Document']"))
            )

            # Click the download button to initiate a download
            driver.execute_script("arguments[0].click();", download_button)

            # Navigate back to the search results page
            driver.back()
        except StaleElementReferenceException:
            continue

# Close the webdriver
driver.quit()
