In [1]:
# !pip list
#!pip install selenium
# !pip install python-dotenv

In [5]:
!pip list

Package            Version
------------------ -----------
appnope            0.1.4
asttokens          2.4.1
attrs              23.2.0
certifi            2024.6.2
comm               0.2.2
debugpy            1.8.2
decorator          5.1.1
exceptiongroup     1.2.0
executing          2.0.1
h11                0.14.0
idna               3.7
importlib_metadata 8.0.0
ipykernel          6.29.5
ipython            8.26.0
jedi               0.19.1
jupyter_client     8.6.2
jupyter_core       5.7.2
matplotlib-inline  0.1.7
nest_asyncio       1.6.0
numpy              2.0.0
outcome            1.3.0.post0
packaging          24.1
parso              0.8.4
pexpect            4.9.0
pickleshare        0.7.5
pip                24.0
platformdirs       4.2.2
prompt_toolkit     3.0.47
psutil             6.0.0
ptyprocess         0.7.0
pure-eval          0.2.2
Pygments           2.18.0
PySocks            1.7.1
python-dateutil    2.9.0
python-dotenv      1.0.1
pyzmq              26.0.3
selenium           4.22.0
set

In [6]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

import numpy as np

from dotenv import load_dotenv
import os

In [7]:
def NexisUni_search_url(search_terms):
    """
    Notes:
    * In the base URL, the "search" endpoint indicates that the user wants to run a search and view the results

    * _Serch tearms_ is a Query parameter that contains the search syntax. Anything in Nat lang.
    * The _context_ "Distinguishes between primary Lexis Advance and adaptations of that product, such as Nexis, Nexis Uni, and Prison Kiosk"
        it is hartcoded to 1516831 since it is the only option I found in the "Help" section.
    """

    base_url = "http://advance.lexis.com.proxy2.library.illinois.edu/api/search"
    context = "1516831"  # Hardcoded

    # Replace spaces with %20 in search terms
    formatted_search_terms = search_terms.replace(" ", "%20")

    # Construct the final URL
    final_url = f"{base_url}?q={formatted_search_terms}&context={context}"

    return final_url


In [8]:
search_terms = "Educational Policies Illinois"
url = NexisUni_search_url(search_terms)
print(url)

http://advance.lexis.com.proxy2.library.illinois.edu/api/search?q=Educational%20Policies%20Illinois&context=1516831


In [9]:
load_dotenv()

# Access environment variables.
print(os.getenv('usr'))

gfs3@illinois.edu


In [36]:
# Set up the Selenium WebDriver (adjust the path to your WebDriver if needed)
driver = webdriver.Chrome()  # or 'webdriver.Firefox()' for Firefox

try:
    # Step 1: Navigate to the initial URL
    driver.get(url)

    # Step 2: Wait for redirection to login page
    WebDriverWait(driver, 5).until(
        EC.url_contains("login")
    )

    # Step 3: Perform login
    username = os.getenv('usr')
    password = os.getenv('psw')

    # Enter username and proceed
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.NAME, 'loginfmt'))
    ).send_keys(username)
    driver.find_element(By.NAME, 'loginfmt').send_keys(Keys.RETURN)

    # Enter password and proceed
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.NAME, 'passwd'))
    ).send_keys(password)
    driver.find_element(By.NAME, 'passwd').send_keys(Keys.RETURN)

    # Click the "Sign in" button
    sign_in_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//input[@type="submit" and @value="Sign in"]'))
    )
    sign_in_button.click()

    # Step 4: Wait for redirection to Duo Security page
    WebDriverWait(driver, 20).until(
        EC.url_contains("duosecurity.com")
    )

    # Inform the user to approve 2FA on their device
    print("Please approve the 2FA request on your device...")

    # Step 5: Polling loop to wait for the Duo Security approval and the appearance of the "Yes, this is my device" button
    while True:
        try:
            # Check if the "Yes, this is my device" button is present
            yes_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Yes, this is my device")]'))
            )
            # Click the button if found
            yes_button.click()
            break
        except:
            # If the button is not found, wait and retry
            time.sleep(1)
            continue
    
    # Step 6: Wait for final redirection back to Nexis Uni
    WebDriverWait(driver, 20).until(
        EC.url_contains("advance-lexis-com")
    )

    # Verify the search results page is loaded
    print("Current URL after login:", driver.current_url)
    if "advance-lexis-com" in driver.current_url:
        print("Successfully logged in and redirected to the search results page.")
        
        #=======================================================#\
        no_docs = 6140 ####### Hardcoded...
        print("Number of pages: ", no_docs // 10)
        #=======================================================#

        for doc_set in range(0, no_docs, 100):
            print(f"Processing documents {doc_set + 1} to {doc_set + 100}...")
            
            ## 1. Define the 2 first buttons
            ## 1.1.
            select_all_checkbox = WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable((By.XPATH, '//*[@id="results-list-delivery-toolbar"]/div/ul[1]/li[1]/input'))
                )
            ## 1.2.
            next_page_button = driver.find_element(By.CSS_SELECTOR, 'a[data-action="nextpage"]')

            ## 2. Select all items on the page
            time.sleep(1)

            ## 3. Loop through the pages and select all docs
            for i in range(0, 10):  # Process 10 pages at a time
                if doc_set + i * 10 >= no_docs:
                    break

                # Step 3.1: Click the "Select all items on this page" checkbox
                select_all_checkbox = WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable((By.XPATH, '//*[@id="results-list-delivery-toolbar"]/div/ul[1]/li[1]/input'))
                )
                select_all_checkbox.click()
                time.sleep(5)

                # Step 3.2. Select next page
                next_page_button = WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[data-action="nextpage"]'))
                )
                driver.execute_script("arguments[0].click();", next_page_button)
                print(f"Now on (page {doc_set/10 + i + 2}).")
                time.sleep(5)
            
            # Step 4: Define & Click the download button using JavaScript
            download_button = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.XPATH, '//button[@data-action="downloadopt"]'))
            )
            driver.execute_script("arguments[0].click();", download_button)
            time.sleep(2.5)

            # Step 5: Define & Click the additional button using the provided XPath
            additional_button = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="SeparateFiles"]'))
            )
            additional_button.click()
            time.sleep(2.5)

            # Step 6: Define & Click the final download button using the provided XPath
            final_download_button = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.XPATH, '/html/body/aside/footer/div/button[1]'))
            )
            final_download_button.click()
            time.sleep(5)
            
            # Step 7: Click to bring up the dialog box for clearing all selections
            dialog_box_button = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="results-list-delivery-toolbar"]/div/ul[1]/li[2]/div/button/span[2]'))
            )
            dialog_box_button.click()
            time.sleep(5)
            
            # Step 8: Unselect all items after downloading
            clear_all_button = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="viewtray-dropdown"]/div/div[1]/div/button[2]'))
            )
            driver.execute_script("arguments[0].click();", clear_all_button)
            time.sleep(5)

            # Step 9: Confirm the clear all action
            confirm_clear_all_button = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.XPATH, '/html/body/aside/footer/div/button[1]'))
            )
            confirm_clear_all_button.click()
            time.sleep(5)

            # Step 10: sleep if doc_set is 2000 or a multiple of 2000
            if (doc_set + 100) % 2000 == 0:
                print("Chilling 1 day (2000 per day limit)...")
                for _ in range(24 * 4):  # 24 hours * 4 intervals per hour (each interval is 15 minutes)
                    time.sleep(900)  # 15 minutes sleep in each iteration to keep the session alive
                    # Perform a generic click to keep the session alive
                    driver.execute_script("document.body.click();")

            # Step 11: sleep if doc_set is 1000 or a multiple of 1000
            elif (doc_set + 100) % 1000 == 0:
                print("Chilling 1 hr (1000 per hour limit)...")
                for _ in range(4):  # 4 intervals per hour (each interval is 15 minutes)
                    time.sleep(900)  # 15 minutes sleep in each iteration to keep the session alive
                    # Perform a generic click to keep the session alive
                    driver.execute_script("document.body.click();")

            # Step 12: sleep if doc_set is 500 or a multiple of 500
            elif (doc_set + 100) % 500 == 0:
                print("Chilling 1 hr (500 per hour limit)...")
                for _ in range(4):  # 4 intervals per hour (each interval is 15 minutes)
                    time.sleep(900)  # 15 minutes sleep in each iteration to keep the session alive
                    # Perform a generic click to keep the session alive
                    driver.execute_script("document.body.click();")


finally:
    # Close the browser
    print(f"Downloaded {doc_set} files. Closing the browser...")
    time.sleep(7)
    #driver.quit()


Please approve the 2FA request on your device...
Current URL after login: https://advance-lexis-com.proxy2.library.illinois.edu/search?crid=a3a31022-427e-42f5-ac93-ae66bb0b5e85&pdsearchterms=Educational+Policies+Illinois&pdtypeofsearch=urlapi&pdfiltertext=urn%3Ahlct%3A16%2Curn%3Ahlct%3A6%2Curn%3Ahlct%3A7%2Curn%3Ahlct%3A5%2Curn%3Ahlct%3A15%2Curn%3Ahlct%3A1%2Curn%3Ahlct%3A2%2Curn%3Ahlct%3A3%2Curn%3Ahlct%3A4%2Curn%3Ahlct%3A10%2Curn%3Ahlct%3A14%2Curn%3Ahlct%3A8%2Curn%3Ahlct%3A13%2Curn%3Ahlct%3A12%2Curn%3Ahlct%3A9%2Curn%3Ahlct%3A18%2Curn%3Ahlct%3A11&pdsearchtype=dynand&pdmfid=1519360&pdisurlapi=true
Successfully logged in and redirected to the search results page.
Number of pages:  614
Processing documents 1 to 100...
Now on (page 2.0).
Now on (page 3.0).
Now on (page 4.0).
Now on (page 5.0).
Now on (page 6.0).
Now on (page 7.0).
Now on (page 8.0).
Downloaded 0 files. Closing the browser...


ElementClickInterceptedException: Message: element click intercepted: Element <input type="checkbox" data-action="selectall" aria-label="Select all items on this page"> is not clickable at point (366, 223). Other element would receive the click: <div class="box" aria-busy="true" style="vertical-align: middle;">...</div>
  (Session info: chrome=126.0.6478.127)
Stacktrace:
0   chromedriver                        0x0000000107cc10e8 chromedriver + 5169384
1   chromedriver                        0x0000000107cb8fba chromedriver + 5136314
2   chromedriver                        0x000000010783536c chromedriver + 402284
3   chromedriver                        0x00000001078896b6 chromedriver + 747190
4   chromedriver                        0x00000001078874e8 chromedriver + 738536
5   chromedriver                        0x0000000107884ef7 chromedriver + 728823
6   chromedriver                        0x0000000107883d12 chromedriver + 724242
7   chromedriver                        0x0000000107877562 chromedriver + 673122
8   chromedriver                        0x00000001078a5ab2 chromedriver + 862898
9   chromedriver                        0x0000000107876ed8 chromedriver + 671448
10  chromedriver                        0x00000001078a5c6e chromedriver + 863342
11  chromedriver                        0x00000001078c4f57 chromedriver + 991063
12  chromedriver                        0x00000001078a5853 chromedriver + 862291
13  chromedriver                        0x00000001078755c6 chromedriver + 665030
14  chromedriver                        0x0000000107875e4e chromedriver + 667214
15  chromedriver                        0x0000000107c83d00 chromedriver + 4918528
16  chromedriver                        0x0000000107c88cfd chromedriver + 4939005
17  chromedriver                        0x0000000107c893d5 chromedriver + 4940757
18  chromedriver                        0x0000000107c64de4 chromedriver + 4791780
19  chromedriver                        0x0000000107c896c9 chromedriver + 4941513
20  chromedriver                        0x0000000107c565b4 chromedriver + 4732340
21  chromedriver                        0x0000000107ca9898 chromedriver + 5073048
22  chromedriver                        0x0000000107ca9a57 chromedriver + 5073495
23  chromedriver                        0x0000000107cb8b6e chromedriver + 5135214
24  libsystem_pthread.dylib             0x00007ff818e964e1 _pthread_start + 125
25  libsystem_pthread.dylib             0x00007ff818e91f6b thread_start + 15


In [13]:
## from claude Sonnet:

""" 

import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
import argparse
from config import URL, USERNAME, PASSWORD  # Create a config.py file for these

def setup_logging():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def setup_driver():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run in headless mode if visual feedback isn't needed
    return webdriver.Chrome(options=options)

def login(driver):
    driver.get(URL)
    # ... (login logic)
    logging.info("Login successful")

def navigate_pages(driver, num_pages):
    for page in range(num_pages):
        try:
            select_all_items(driver)
            navigate_to_next_page(driver)
        except Exception as e:
            logging.error(f"Error on page {page + 1}: {e}")
            break

def select_all_items(driver):
    checkbox = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="results-list-toolbar-gvs"]/ul[1]/li[1]/input'))
    )
    checkbox.click()
    logging.info("Selected all items on the page")

def navigate_to_next_page(driver):
    next_button = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[data-action="nextpage"]'))
    )
    driver.execute_script("arguments[0].click();", next_button)
    logging.info("Navigated to the next page")

def download_results(driver):
    # ... (download logic)
    logging.info("Results downloaded successfully")

def main(num_pages):
    setup_logging()
    driver = setup_driver()
    try:
        login(driver)
        navigate_pages(driver, num_pages)
        download_results(driver)
    except Exception as e:
        logging.error(f"An error occurred: {e}")
    finally:
        driver.quit()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape Nexis Uni search results")
    parser.add_argument("--pages", type=int, default=50, help="Number of pages to scrape")
    args = parser.parse_args()
    main(args.pages)


"""

' \n\nimport logging\nfrom selenium import webdriver\nfrom selenium.webdriver.common.by import By\nfrom selenium.webdriver.support.ui import WebDriverWait\nfrom selenium.webdriver.support import expected_conditions as EC\nfrom selenium.common.exceptions import TimeoutException, NoSuchElementException\nimport time\nimport argparse\nfrom config import URL, USERNAME, PASSWORD  # Create a config.py file for these\n\ndef setup_logging():\n    logging.basicConfig(level=logging.INFO, format=\'%(asctime)s - %(levelname)s - %(message)s\')\n\ndef setup_driver():\n    options = webdriver.ChromeOptions()\n    options.add_argument(\'--headless\')  # Run in headless mode if visual feedback isn\'t needed\n    return webdriver.Chrome(options=options)\n\ndef login(driver):\n    driver.get(URL)\n    # ... (login logic)\n    logging.info("Login successful")\n\ndef navigate_pages(driver, num_pages):\n    for page in range(num_pages):\n        try:\n            select_all_items(driver)\n            navigat

In [30]:
print(f"Downloaded {doc_set} files. Closing the browser...")


Downloaded 500 files. Closing the browser...
