In [None]:
import time

import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC



### Load page
- Connect to a browser driver to initialize a browser session
- Navigate to the ArcGIS table that needs scraping
- wait for all elements to load

In [None]:
# connect to firefox driver
# (raw string so the backslashes in the Windows path are not read as escapes)
# NOTE(review): `executable_path` is deprecated in newer Selenium releases —
# confirm the installed Selenium version still accepts it.
driver_dir = r"C:\Program Files (x86)\geckodriver.exe"
driver = webdriver.Firefox(executable_path = driver_dir)

# navigate to the site
url = 'https://www.arcgis.com/home/webmap/viewer.html?url=https%3A%2F%2Fgis2.ncdcr.gov%2Fdncrgis%2Frest%2Fservices%2FNCHHM_Public%2FNC_Highway_Historical_Markers%2FMapServer&source=sd'
driver.get(url)

wait = WebDriverWait(driver, 60)  # explicit-wait helper: wait.until(...) calls time out after 60 seconds

#### inspect and generate button element
- the table that needs to be scraped is not generated when the webpage is loaded
- dynamically loaded elements take time to appear, so a condition is set to scrape the elements only once all of them are loaded
- to generate the table, a tab in the navigation panel must be hovered over,
then a button must be pressed to generate the table
- upon manually inspecting elements to generate the markers the following was found:
1. panel element __id__ in source page: 'leftContentPanel'
2. dynamically generated panel tab (with dynamic id): 'NC_Highway_Historical_Markers_704'
    - __Note:__ shows a button when clicked or hovered over
    - __Note:__ 704 is a dynamically generated part of the dynamically generated id
3. dynamically generated button __id__: 'NC_Highway_Historical_Markers_X_tableTool'
    - __Note:__ X stands for the dynamically generated part of the id (704 in the example above)

__Note__: Selenium has methods for waiting for certain elements to load. However, there are no functions for waiting on dynamically generated elements (that I know of so far), so I used a loop that keeps checking for the element(s).

    

In [None]:
# static part of the dynamically generated element ids
element_static_string = 'NC_Highway_Historical_Markers'

# loop time exit:
start_time = time.time()

# wait for all elements to load: poll up to 24 times, 5 seconds apart,
# until at least 44 elements whose id contains the static string exist
panel = None
for _ in range(24):
    # query once per poll and reuse the result instead of hitting the DOM twice
    matches = driver.find_elements(By.CSS_SELECTOR, f"[id*= {element_static_string} ]")
    if len(matches) >= 44:
        panel = matches
        break
    time.sleep(5)

# measured after the loop so the name is always bound, even when the very
# first poll already succeeds
loading_time = time.time() - start_time
print('loading_time: ',loading_time)

# fail here with a clear message rather than with a NameError further down
if panel is None:
    raise TimeoutError(
        f"panel elements matching '{element_static_string}' did not finish "
        f"loading within {loading_time:.0f} seconds"
    )

    
# positions (within `panel`) of the tab to click first and of the button
# that generates the table
hover_index = 35
button_index = 37

# click the button element to generate the table
start_time = time.time()

for _ in range(10):
    # button_index must be a valid position, so the list has to be strictly
    # longer than it
    if len(panel) > button_index:
        hover_element = panel[hover_index]
        button_element = panel[button_index]
        print('hover id = ', hover_element.get_attribute("id"))
        print('button id = ', button_element.get_attribute("id"))
        hover_element.click()
        time.sleep(1)  # give the button time to appear after the tab is clicked
        button_element.click()
        break

    time.sleep(5)
    print('loading time =', time.time()-start_time)
    # re-query the page so a retry can actually observe newly loaded elements
    panel = driver.find_elements(By.CSS_SELECTOR, f"[id*= {element_static_string} ]")
else:
    # every attempt found too few elements; stop instead of continuing silently
    raise RuntimeError(
        f"expected more than {button_index} panel elements, found {len(panel)}"
    )
        


### inspect table element
After generating and inspecting the table
- the table had a unique class: 'dgrid-scroller'
- the rows container (the table without its header) had the class: 'dgrid-content ui-widget-content'
- each row had a dynamically generated id: 'dgrid_704-row-45'
    - __Note:__ 704 is a dynamically generated part of the id
    - __Note:__ 45 is the row number 

In [None]:
# wait for the table to load
time.sleep(10)

# table element: first element carrying the 'dgrid-scroller' class
table = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".dgrid-scroller")))
print('table id=', table.get_attribute('class'))

# rows element (table with no header)
rows_container = table.find_element(By.CSS_SELECTOR, ".dgrid-content")
print('row container id=', rows_container.get_attribute('class'))
print()

# row elements, matched by the static part of their id; wait until at least
# one row exists so the indexing below cannot hit an empty list
# (an empty list is falsy, so wait.until keeps polling until rows appear)
rows = wait.until(
    lambda _driver: rows_container.find_elements(By.CSS_SELECTOR, "[id*='dgrid_']")
)

print('first loaded row id =', rows[0].get_attribute('id'))
print('last loaded row id =', rows[-1].get_attribute('id'))

### generate all table rows
the page only loads 50 rows from the table. to load all rows the user must scroll down __in the table element__
- scrolling down a specific number of pixels only works when the row height is consistent
- by keeping only the id column visible, the row height is consistent and the scrolling rate stays in step with the table content generation rate
- since the other columns are hidden, the script doesn't parse their values
- by rerunning the parsing script a second time after selecting all columns, all the data is parsed, since it was already generated but hidden. (a future version of the code should handle this without a second pass)

In [None]:
# open the hider selector to show the column elements
# (keep the element itself: click() returns None, so chaining it into the
# assignment would leave the name bound to None)
column_hider_button = driver.find_element(By.CSS_SELECTOR, 'button.ui-icon')
column_hider_button.click()
print('opened column hider selector')

# find the column name elements in the column hider menu
hider_menu = driver.find_element(By.CSS_SELECTOR, "[id$='hider-menu']")
column_names_elements = hider_menu.find_elements(By.TAG_NAME, 'div')

# extract the names of the columns, skipping the first two <div> elements
# NOTE(review): presumably the first two entries are not data columns
# (e.g. the menu wrapper / "check all" entry) — confirm against the page
column_names = [column_name.text for column_name in column_names_elements[2:]]
print(column_names)

# the "check all" control of the menu, used to toggle every column at once
all_columns = driver.find_element(By.CSS_SELECTOR, "[id$='-hider-menu-check-all']")

def press_all_columns_button():
    """Click the column hider's "check all" control and pause briefly.

    Uses the module-level ``all_columns`` element. Each call toggles the
    control once, prints a progress message, then sleeps for two seconds.
    """
    settle_seconds = 2
    all_columns.click()
    print('all columns clicked')
    time.sleep(settle_seconds)

# toggle the "check all" control twice: the first click checks every column,
# the second one unchecks them all again
press_all_columns_button()
press_all_columns_button()

# check the marker id column only, for consistent row height
driver.find_element(By.CSS_SELECTOR, '.hider-menu-label-0').click()
print('show marker id column clicked')
time.sleep(2)



### scrolling loop to generate all the table's elements

In [None]:
# number of rows
num_rows = 1620

# scrolling variables
scroll_value = 100  # number of pixels to scroll after each row
table_element = driver.find_element(By.CSS_SELECTOR, ".dgrid-scroller")

# extracting data: collect plain lists and build the DataFrame once at the
# end, instead of concatenating a one-row DataFrame per iteration
scraped_rows = []
try:
    # NOTE(review): rows with id suffix 2..num_rows are read (num_rows - 1
    # rows in total) — confirm this matches the ids used by the page
    for i in range(num_rows - 1):  # iterate over all table rows
        start_time = time.time()

        # locate the row and its cells once, then read each column from them
        row_element = driver.find_element(By.CSS_SELECTOR, f"[id$='-row-{i+2}']")
        cells = row_element.find_elements(By.CLASS_NAME, "dgrid-cell")
        current_row_list = [
            cells[j].find_element(By.TAG_NAME, "div").text  # cell value of column j
            for j in range(len(column_names))
        ]
        print('row number:', i, 'time:', time.time()- start_time)
        scraped_rows.append(current_row_list)

        # scroll the table so that further rows get generated
        driver.execute_script(f"arguments[0].scrollTop += {scroll_value}", table_element)
finally:
    # built in `finally` so rows scraped before an error are still available
    df = pd.DataFrame(scraped_rows, columns=column_names)

    

In [None]:
### Parsing loop to generate the final table

In [None]:
# make every column visible again so each cell's text can be read
press_all_columns_button()

# number of rows
num_rows = 1620

# extracting data: collect plain lists and build the DataFrame once at the
# end, instead of concatenating a one-row DataFrame per iteration
scraped_rows = []
try:
    # NOTE(review): rows with id suffix 2..num_rows are read (num_rows - 1
    # rows in total) — confirm this matches the ids used by the page
    for i in range(num_rows - 1):  # iterate over all table rows
        start_time = time.time()

        # locate the row and its cells once, then read each column from them
        row_element = driver.find_element(By.CSS_SELECTOR, f"[id$='-row-{i+2}']")
        cells = row_element.find_elements(By.CLASS_NAME, "dgrid-cell")
        current_row_list = [
            cells[j].find_element(By.TAG_NAME, "div").text  # cell value of column j
            for j in range(len(column_names))
        ]
        print('row number:', i, 'time:', time.time()- start_time)
        scraped_rows.append(current_row_list)
finally:
    # built in `finally` so rows parsed before an error are still available
    df = pd.DataFrame(scraped_rows, columns=column_names)



In [None]:
# report the table dimensions and preview its first two rows
# NOTE(review): `display` is not imported in this file — presumably the
# notebook environment provides it; confirm before running as a plain script
print(df.shape)
display(df.head(2))

In [None]:
# write the scraped table to disk; create the output folder first so the
# export does not fail when "data/" does not exist yet
from pathlib import Path

output_path = Path("data/gis_markers_table.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)

In [None]:
# end the browser session started at the top of the notebook
driver.quit()