In [11]:
import csv
import os
import random
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Base URL without page number
base_url = 'https://501c3lookup.org/state/GU?pagenum='

# Set up the WebDriver
driver = webdriver.Chrome()

# Dictionary to store data
data_dict = {
    'organization': [],
    'cities': [],
    'eins': [],
    'po_box_number': [],
    'postal_code': [],
    'map_url': [],
    'larger_map_link': [],
    'organization_code': [],
    'deductibility_code': [],
    'affiliation_code': [],
    'subsection_classification_codes': [],
    'activity_codes': [],
    'ntee_common_code': [],
    'ntee_code': [],
    'foundation_code': [],
    'exempt_organization_status_code': [],
    'tax_period': [],
    'accounting_period': [],
    'income_code': [],
    'income_amount': [],
    'form_990_revenue_amount': [],
    'ruling_date': [],
    'asset_code': [],
    'asset_amount': [],
    'filing_requirement_code': [],
    'pf_filing_requirement_code': []
}

# Output CSV file
output_file = 'data_incremental.csv'

# Check if the file exists
if not os.path.exists(output_file):
    # Create the CSV file and write headers if it doesn't exist
    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file)
        headers = list(data_dict.keys())
        writer.writerow(headers)

# Function to append data to CSV
def append_data_to_csv(data):
    with open(output_file, 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(data)

# Function to handle retries
def fetch_url_with_retries(url, retries=5, backoff_factor=1):
    for attempt in range(retries):
        try:
            driver.get(url)
            return
        except Exception as e:
            if "429" in str(e):
                wait_time = backoff_factor * (2 ** attempt) + random.uniform(0, 1)
                print(f"429 Too Many Requests. Retrying in {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            else:
                raise e

# Loop through the pages
for page in range(3, 10):  # Adjust range as needed
    url = f'{base_url}{page}'
    fetch_url_with_retries(url)

    # Print the URL to verify
    print(f'Fetching page: {driver.current_url}')

    # Wait for table rows to be present
    WebDriverWait(driver, 120).until(
        EC.presence_of_all_elements_located((By.XPATH, '//tbody/tr'))
    )

    # Get all table rows
    rows = driver.find_elements(By.XPATH, '//tbody/tr')

    page_data = []  # Temporary storage for the current page data

    for index in range(len(rows)):
        try:
            # Refresh the rows list and locate the specific row again
            rows = driver.find_elements(By.XPATH, '//tbody/tr')

            # Extract EIN
            ein = rows[index].find_element(By.XPATH, './td[3]').text

            # Extract Organization
            org_element = rows[index].find_element(By.XPATH, './td[4]/a')
            org_href = org_element.get_attribute('href')
            org_text = org_element.text

            # Extract City
            city_text = rows[index].find_element(By.XPATH, './td[5]').text

            # Debug print statements
            print(f'Organization: {org_text}')
            print(f'EIN: {ein}')
            print(f'City: {city_text}')
            print(f'URL: {org_href}')

            # Visit the detailed page
            fetch_url_with_retries(org_href)

            # Extract detailed information
            po_box_number = driver.find_element(By.XPATH, '//span[@itemprop="postOfficeBoxNumber"]').text.strip() if driver.find_elements(By.XPATH, '//span[@itemprop="postOfficeBoxNumber"]') else 'N/A'
            postal_code = driver.find_element(By.XPATH, '//span[@itemprop="postalCode"]').text.strip() if driver.find_elements(By.XPATH, '//span[@itemprop="postalCode"]') else 'N/A'
            map_url = driver.find_element(By.XPATH, '//iframe[contains(@src, "maps.google.com/maps")]').get_attribute('src') if driver.find_elements(By.XPATH, '//iframe[contains(@src, "maps.google.com/maps")]') else 'N/A'
            larger_map_link = driver.find_element(By.XPATH, '//small/a').get_attribute('href') if driver.find_elements(By.XPATH, '//small/a') else 'N/A'
            organization_code = driver.find_element(By.XPATH, '//h5[contains(text(), "ORGANIZATION CODE")]/span').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "ORGANIZATION CODE")]/span') else 'N/A'
            deductibility_code = driver.find_element(By.XPATH, '//h5[contains(text(), "DEDUCTIBILITY CODE")]/span').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "DEDUCTIBILITY CODE")]/span') else 'N/A'
            affiliation_code = driver.find_element(By.XPATH, '//h5[contains(text(), "AFFILIATION CODE")]/span').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "AFFILIATION CODE")]/span') else 'N/A'
            subsection_classification_codes = driver.find_element(By.XPATH, '//h5[contains(text(), "SUBSECTION/CLASSIFICATION CODES")]/span').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "SUBSECTION/CLASSIFICATION CODES")]/span') else 'N/A'
            activity_codes = driver.find_element(By.XPATH, '//h5[contains(text(), "ACTIVITY CODES")]/span').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "ACTIVITY CODES")]/span') else 'N/A'
            ntee_common_code = driver.find_element(By.XPATH, '//h5[contains(text(), "NTEE COMMON CODE")]/span/a').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "NTEE COMMON CODE")]/span/a') else 'N/A'
            ntee_code = driver.find_element(By.XPATH, '//h5[contains(text(), "NTEE CODE")]/span').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "NTEE CODE")]/span') else 'N/A'
            foundation_code = driver.find_element(By.XPATH, '//h5[contains(text(), "FOUNDATION CODE")]/span').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "FOUNDATION CODE")]/span') else 'N/A'
            exempt_organization_status_code = driver.find_element(By.XPATH, '//h5[contains(text(), "EXEMPT ORGANIZATION STATUS CODE")]/span').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "EXEMPT ORGANIZATION STATUS CODE")]/span') else 'N/A'
            tax_period = driver.find_element(By.XPATH, '//h5[contains(text(), "TAX PERIOD")]/span').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "TAX PERIOD")]/span') else 'N/A'
            accounting_period = driver.find_element(By.XPATH, '//h5[contains(text(), "ACCOUNTING PERIOD")]/span').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "ACCOUNTING PERIOD")]/span') else 'N/A'
            income_code = driver.find_element(By.XPATH, '//h5[contains(text(), "INCOME CODE")]/span').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "INCOME CODE")]/span') else 'N/A'
            income_amount = driver.find_element(By.XPATH, '//h5[contains(text(), "INCOME AMOUNT")]/span').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "INCOME AMOUNT")]/span') else 'N/A'
            form_990_revenue_amount = driver.find_element(By.XPATH, '//h5[contains(text(), "FORM 990 REVENUE AMOUNT")]/span').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "FORM 990 REVENUE AMOUNT")]/span') else 'N/A'
            ruling_date = driver.find_element(By.XPATH, '//h5[contains(text(), "RULING DATE")]/span').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "RULING DATE")]/span') else 'N/A'
            asset_code = driver.find_element(By.XPATH, '//h5[contains(text(), "ASSET CODE")]/span').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "ASSET CODE")]/span') else 'N/A'
            asset_amount = driver.find_element(By.XPATH, '//h5[contains(text(), "ASSET AMOUNT")]/span').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "ASSET AMOUNT")]/span') else 'N/A'
            filing_requirement_code = driver.find_element(By.XPATH, '//h5[contains(text(), "FILING REQUIREMENT CODE")]/span').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "FILING REQUIREMENT CODE")]/span') else 'N/A'
            pf_filing_requirement_code = driver.find_element(By.XPATH, '//h5[contains(text(), "PF FILING REQUIREMENT CODE")]/span').text.strip() if driver.find_elements(By.XPATH, '//h5[contains(text(), "PF FILING REQUIREMENT CODE")]/span') else 'N/A'

            # Append the data for the current row to the page_data list
            page_data.append([
                org_text, city_text, ein, po_box_number, postal_code, map_url, larger_map_link,
                organization_code, deductibility_code, affiliation_code, subsection_classification_codes,
                activity_codes, ntee_common_code, ntee_code, foundation_code,
                exempt_organization_status_code, tax_period, accounting_period,
                income_code, income_amount, form_990_revenue_amount, ruling_date,
                asset_code, asset_amount, filing_requirement_code, pf_filing_requirement_code
            ])

            # Return to the main page
            fetch_url_with_retries(url)
            WebDriverWait(driver, 120).until(
                EC.presence_of_all_elements_located((By.XPATH, '//tbody/tr'))
            )

        except Exception as e:
            print(f'Error processing row {index}: {e}')

    # Append the data for the current page to the CSV file
    append_data_to_csv(page_data)

    # Introduce a random delay to avoid rate-limiting
    time.sleep(random.uniform(1, 5))

# Quit the WebDriver
driver.quit()


Fetching page: https://501c3lookup.org/state/GU?pagenum=3
Organization: GUAM KOREAN BUDDHIST ASSOCIATION
EIN: 66-0571056
City: TAMUNING
URL: https://501c3lookup.org/guam-korean-buddhist-association_660571056
Organization: GUAM KOREAN BUDDHIST TEMPLE
EIN: 98-0213387
City: BARRIGADA
URL: https://501c3lookup.org/guam-korean-buddhist-temple_980213387
Organization: GUAM LEGAL SERVICES CORPORATION
EIN: 98-0046988
City: HAGATNA
URL: https://501c3lookup.org/guam-legal-services-corporation_980046988
Organization: GUAM MUSEUM FOUNDATION INC
EIN: 66-0670817
City: HAGATNA
URL: https://501c3lookup.org/guam-museum-foundation-inc_660670817
Organization: GUAM NATIONAL TENNIS FEDERATION INC
EIN: 66-0874945
City: HAGATNA
URL: https://501c3lookup.org/guam-national-tennis-federation-inc_660874945
Organization: GUAM NAVAL OFFICERS SPOUSES CONNECTION
EIN: 66-0809967
City: SANTA RITA
URL: https://501c3lookup.org/guam-naval-officers-spouses-connection_660809967
Organization: GUAM NAVAL OFFICERS SPOUSES CONNEC

Organization: KOREAN EVANGELICAL CHURCH OF GUAM
EIN: 66-0694561
City: TAMUNING
URL: https://501c3lookup.org/korean-evangelical-church-of-guam_660694561
Organization: KURASON I SENGSONG
EIN: 66-1019882
City: DEDEDO
URL: https://501c3lookup.org/kurason-i-sengsong_661019882
Organization: LA CASA DE LA PLENA TITO MATOS INC
EIN: 66-1004524
City: HAGATNA
URL: https://501c3lookup.org/la-casa-de-la-plena-tito-matos-inc_661004524
Organization: LEGION OF MARY GUAM INC GUAM COMITIUM
EIN: 66-0907591
City: HAGATNA
URL: https://501c3lookup.org/legion-of-mary-guam-inc-guam-comitium_660907591
Organization: LEND-A-HAND INC PACIFIC DAILY NEWS
EIN: 98-0085071
City: AGANA
URL: https://501c3lookup.org/lend-a-hand-inc-pacific-daily-news_980085071
Fetching page: https://501c3lookup.org/state/GU?pagenum=5
Organization: LETS MOVE GUAM
EIN: 66-0785778
City: SINAJANA
URL: https://501c3lookup.org/lets-move-guam_660785778
Organization: MAINTENANCE PROFESSIONAL OF THE YEAR AWARDS COMMITTEE 36 MXG MPOY
EIN: 88-39417

Organization: RAINAN I LANGET FOUNDATION INC
EIN: 66-0889729
City: TAMUNING
URL: https://501c3lookup.org/rainan-i-langet-foundation-inc_660889729
Organization: ROTARY CLUB OF GUAM
EIN: 66-0505436
City: HAGATNA
URL: https://501c3lookup.org/rotary-club-of-guam_660505436
Organization: ROTARY INTERNATIONAL
EIN: 36-4012456
City: TAMUNING
URL: https://501c3lookup.org/rotary-international_364012456
Organization: SANCTUARY INC
EIN: 96-0002543
City: CHALAN PAGO
URL: https://501c3lookup.org/sanctuary-inc_960002543
Organization: SAVING GRACE CHURCH INC
EIN: 66-0833448
City: HAGATNA
URL: https://501c3lookup.org/saving-grace-church-inc_660833448
Organization: SIR AKOMA MEMORIAL FOUNDATION
EIN: 47-5272996
City: TAMUNING
URL: https://501c3lookup.org/sir-akoma-memorial-foundation_475272996
Fetching page: https://501c3lookup.org/state/GU?pagenum=7
Organization: SOCIETY OF AMERICAN MILITARY ENGINEERS S A M E GUAM POST
EIN: 66-0456419
City: HAGATNA
URL: https://501c3lookup.org/society-of-american-militar

TimeoutException: Message: 
