# Info
This scraper works for almost any site that sits behind Google/UMich/Duo authentication.

Requires a `.env` file, located in the directory the scraper is run from, with the following info:

uniqname = **uniqname**

password = **password**

If you don't need to authenticate with Google, set google_flow to **False**.
This webscraper runs headless by default, meaning you won't see what it's doing. If you want to see what it's doing, set headless to **False**.
If you don't want it to save the HTML of each page, set links_only to **True**.


In [None]:
# When True, the script walks through the Google -> UMich -> Duo sign-in
# pages before crawling; when False it goes straight to the crawl.
google_flow = False 
# When True, Chrome is started with the "--headless=new" argument.
headless = True
# When True, only the visited links are recorded; when False, the HTML of
# every visited page is also written under ./unparsed/.
links_only = True

# First page the browser loads and the root of the crawl.
start = 'https://sheets.wiki/'
# A link is followed only if its href contains this string; the script also
# waits for the browser URL to contain it before the crawl begins.
scope = 'https://sheets.wiki/'

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
from dotenv import load_dotenv
from selenium.webdriver.chrome.options import Options
# Always build the Options object so the driver can be created whether or not
# headless mode is requested. (Defining it only inside `if headless:` made
# `headless = False` fail with a NameError when the driver was constructed.)
chrome_options = Options()
if headless:
    chrome_options.add_argument("--headless=new")

# Credentials are read from a .env file in the working directory.
# load_dotenv reports failure through its boolean return value.
if not load_dotenv('.env'):
    print('Unable to load .env file.')
    # Exit with a non-zero status so callers/shells can detect the failure.
    raise SystemExit(1)

try:
    uniqname = os.environ['uniqname']
    password = os.environ['password']
except KeyError as missing:
    # Name the missing entry instead of surfacing a bare KeyError traceback.
    print(f'Missing {missing} entry in .env file.')
    raise SystemExit(1)
email = uniqname + '@umich.edu'

# Initialize the WebDriver (choose your browser)
driver = webdriver.Chrome(options=chrome_options)

# Every URL visited by the crawl. Kept at module level so it can be printed
# and written out once the crawl finishes.
unique_links = set()


def _wait_for(driver, locator, timeout=10):
    """Wait up to `timeout` seconds for `locator` to be present and return the element."""
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located(locator)
    )


def crawl_links(driver, url):
    """Visit `url` and every in-scope page reachable from it.

    Each visited URL is added to the module-level `unique_links` set. When
    `links_only` is False, the HTML of every visited page is also written to
    ./unparsed/<last path segment>.html.

    The traversal is depth-first in document order (the same order a
    recursive walk would produce) but uses an explicit stack, so a large site
    cannot exhaust Python's recursion limit.

    Args:
        driver: The Selenium WebDriver used to load pages.
        url: The URL at which to start crawling.
    """
    pending = [url]
    while pending:
        current = pending.pop()
        if current in unique_links:
            continue
        unique_links.add(current)

        # Navigate to the url
        driver.get(current)

        if not links_only:
            # Save the HTML of the page to /unparsed, creating it if needed.
            os.makedirs('./unparsed', exist_ok=True)
            # Strip a trailing slash first so URLs such as "https://site/"
            # do not all collapse into an empty file name (".html").
            page_name = current.rstrip('/').split('/')[-1] or 'index'
            with open('./unparsed/' + page_name + '.html', 'w', encoding='utf-8') as file:
                file.write(driver.page_source)

        # Collect every href up front: navigating away invalidates the
        # anchor elements found on the current page.
        in_scope = []
        for tag in driver.find_elements(By.TAG_NAME, 'a'):
            href = tag.get_attribute('href')
            # Keep hrefs that exist, contain `scope`, and are not yet visited.
            if href is not None and scope in href and href not in unique_links:
                in_scope.append(href)

        # Push in reverse so the first link on the page is visited next.
        pending.extend(reversed(in_scope))


try:
    driver.get(start)

    if google_flow:
        # Google Sign-In
        email_field = _wait_for(driver, (By.ID, 'identifierId'))
        email_field.send_keys(email)
        email_field.send_keys(Keys.RETURN)

        # UMICH Sign-In
        username_field = _wait_for(driver, (By.ID, 'username'))
        username_field.send_keys(uniqname)

        password_field = _wait_for(driver, (By.ID, 'password'))
        password_field.send_keys(password)
        password_field.send_keys(Keys.RETURN)

        ### THE USER NEEDS TO AUTHENTICATE WITH DUO. ###
        # Afterwards it will click through the standard prompts.
        # These waits are longer (30 s) than the form-field waits above.
        device_confirm = _wait_for(driver, (By.ID, 'dont-trust-browser-button'), timeout=30)
        device_confirm.click()

        account_confirm = _wait_for(driver, (By.CLASS_NAME, 'VfPpkd-vQzf8d'), timeout=30)
        account_confirm.click()

    # Wait until the URL contains the scope
    WebDriverWait(driver, 30).until(
        EC.url_contains(scope)
    )

    crawl_links(driver, start)
    print(unique_links)

    # Append all links in one pass; the file is opened once rather than once
    # per link, and the output directory is created if it does not exist.
    os.makedirs('./unparsed', exist_ok=True)
    with open('./unparsed/links.txt', 'a', encoding='utf-8') as file:
        file.writelines(link + '\n' for link in unique_links)

finally:
    # Close the browser window
    driver.quit()