# Hart Island Web-scraper
----

**Author:** Simon Aytes

**Contact:** simon.aytes@lc.cuny.edu

**Version:** 1.2

**Date:** August 23, 2022

## 0. Environment Configuration
----

### 0.1 Import Libraries

In [1]:
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import pandas as pd
import time
from tqdm import tqdm
from time import sleep
from random import randint
from datetime import datetime, timedelta
from selenium.common.exceptions import NoSuchElementException      

### 0.2 Utility Functions

In [2]:
def check_exists_by_xpath(xpath, driver=None):
    """Report whether an element matching an XPath expression can be found.

    The lookup must run against a driver *instance*. The previous version
    called ``find_element`` on the imported ``webdriver`` module, which has
    no such function, so every call failed with ``AttributeError``.

    Args:
        xpath: XPath expression to look up.
        driver: Object exposing ``find_element(by, value)``. When omitted,
            the module-level ``driver`` variable is used if it exists, so
            the single-argument call form keeps working.

    Returns:
        True if the lookup succeeds, False if it raises
        ``NoSuchElementException``.

    Raises:
        RuntimeError: If no driver is passed and no module-level ``driver``
            has been created yet.
    """
    if driver is None:
        # Fall back to the notebook-wide driver, if one has been created.
        driver = globals().get("driver")
    if driver is None:
        raise RuntimeError(
            "No WebDriver available: pass `driver` or create one first."
        )

    try:
        driver.find_element(By.XPATH, xpath)
    except NoSuchElementException:
        return False
    return True

In [3]:
def get_date_obj(user_date):
    """Convert a slash-separated ``MM/DD/YYYY`` string into a datetime.

    Args:
        user_date: Text whose first three "/"-separated fields are the
            month, the day and the year, in that order.

    Returns:
        A ``datetime`` for that calendar date with the time left at midnight.
    """
    fields = user_date.split("/")
    # The year is the third field, but datetime() expects it first.
    year, month, day = int(fields[2]), int(fields[0]), int(fields[1])
    return datetime(year, month, day)

In [4]:
def fill_date_form(driver, date_str):
    """Type the same date into both ends of the date-of-death range.

    Using one value for "from" and "to" restricts the search to a single day.

    Args:
        driver: WebDriver currently showing the search form.
        date_str: Date text to enter into both inputs.
    """
    range_input_xpaths = (
        "//*[@id='home_form:date_death_from_input']",
        "//*[@id='home_form:date_death_to_input']",
    )
    for input_xpath in range_input_xpaths:
        date_input = driver.find_element(By.XPATH, input_xpath)
        # Empty the field first so the new date does not append to old text.
        date_input.clear()
        date_input.send_keys(date_str)

In [5]:
def select_gender_option(driver, gender_index):
    """Choose an entry of the gender dropdown by its position.

    Args:
        driver: WebDriver currently showing the search form.
        gender_index: Index of the option to select, as understood by
            ``Select.select_by_index``.
    """
    dropdown_element = driver.find_element(By.XPATH, "//*[@id='home_form:gender_input']")
    # Select wraps the located element and exposes option-picking helpers.
    Select(dropdown_element).select_by_index(gender_index)

In [6]:
def get_row_contents(driver, n_cols, row_index=None):
    """Read the text of every cell in one row of the search results table.

    Args:
        driver: WebDriver currently showing the results page.
        n_cols: Number of cells to read, starting at the first column.
        row_index: 1-based position of the row inside the table body. When
            omitted, the global variable ``r`` is used, which is how the
            function previously obtained the row; existing two-argument
            calls made inside a ``for r in ...`` loop keep working.

    Returns:
        A list with the text of columns 1..n_cols of the row, in order.
        The list is empty when ``n_cols`` is less than 1.
    """
    if row_index is None:
        # Legacy call form: rely on the caller's global loop variable.
        row_index = r

    # XPath positions are 1-based, hence range(1, n_cols + 1).
    return [
        driver.find_element(
            By.XPATH,
            "//*[@id='search_result_table']/tbody/tr[" + str(row_index) + "]/td[" + str(col) + "]",
        ).text
        for col in range(1, n_cols + 1)
    ]

In [7]:
def create_webdriver():
    """Start a Chrome session using the chromedriver binary bundled locally.

    Returns:
        The new ``webdriver.Chrome`` instance, created with default options.
    """
    # Default Chrome options; nothing is customised here.
    chrome_options = webdriver.ChromeOptions()
    # The chromedriver executable is expected at this path relative to the working directory.
    chromedriver_service = Service("./chromedriver/chromedriver")
    return webdriver.Chrome(service=chromedriver_service, options=chrome_options)

In [8]:
def decrement_date(date, days_to_subtract):
    """Return the date that lies ``days_to_subtract`` days before ``date``.

    Args:
        date: Starting date/datetime.
        days_to_subtract: Number of days to move back.

    Returns:
        A new value shifted back by the given number of days.
    """
    offset = timedelta(days=days_to_subtract)
    return date - offset

In [9]:
def get_date_string(date):
    """Format a date as zero-padded ``MM/DD/YYYY`` text.

    Args:
        date: Object providing ``strftime`` (e.g. a ``datetime``).

    Returns:
        The formatted date string.
    """
    date_format = "%m/%d/%Y"
    return date.strftime(date_format)

In [10]:
def submit_form(driver):
    """Click the search link of the form so the results page loads.

    Args:
        driver: WebDriver currently showing the filled-in search form.
    """
    search_link = driver.find_element(By.XPATH,"//*[@id='home_form:search_lk']")
    search_link.click()

In [11]:
def get_num_columns(driver):
    """Count the cells in the first body row of the search results table.

    Args:
        driver: WebDriver currently showing the results page.

    Returns:
        Number of ``td`` elements found in the first row; 0 if none match.
    """
    first_row_cells = driver.find_elements(By.XPATH, "//*[@id='search_result_table']/tbody/tr[1]/td")
    return len(first_row_cells)

In [12]:
def get_num_rows(driver):
    """Return one more than the number of body rows in the results table.

    The extra 1 makes the value usable directly as the exclusive upper bound
    of ``range(1, rows)`` when walking 1-based row positions; the actual row
    count is the returned value minus one.

    Args:
        driver: WebDriver currently showing the results page.

    Returns:
        ``len(rows) + 1``, i.e. 1 when no rows match.
    """
    table_rows = driver.find_elements(By.XPATH, "//*[@id='search_result_table']/tbody/tr")
    return len(table_rows) + 1

## 1. Scrape Data
----

In [13]:
# Start timer
tick = datetime.now()

# Features: one list per output column. Entry k of every list belongs to
# scraped record k, so the lists must always stay the same length.
f_names = []
l_names = []
ages = []
sexes = []
date_of_deaths = []
place_of_deaths = []
plot_no = []
medical_examiner_nos = []
urls = []
date_scraped = []
jurisdiction = []

# Seconds to pause after each page load and each form submission
sleep_seconds = 2.5

# Address of the search page; it is reloaded before every query
url = "https://a073-hartisland-web.nyc.gov/hartisland/pages/home/home.jsf"

# Dropdown options as dict (Format -- {Label: option index in the dropdown})
gender_opt_list = {'MALE':1, 'FEMALE':2, 'UNKNOWN':3}

# Dates whose query raised an error, per gender. These are treated as
# "no records", but any failure during the query lands here too.
female_no_record_dates = []
male_no_record_dates = []
unknown_no_record_dates = []
no_record_dates = {
    'MALE': male_no_record_dates,
    'FEMALE': female_no_record_dates,
    'UNKNOWN': unknown_no_record_dates,
}

# Create list of dates with 30 or more records
thirty_or_more_record_dates = []

# Get input for from- and to- dates
from_date = get_date_obj(input("Date From (older date, MM/DD/YYYY): "))
to_date = get_date_obj(input("Date To (most-recent date, MM/DD/YYYY): "))
curr_date = to_date + timedelta(days=1) # Add one to the start date because the loop subtracts a day before using it
date_delta = to_date - from_date

# Create the webdriver
driver = create_webdriver()

# Everything that uses the browser runs inside try/finally so the Chrome
# window is closed even if the run is interrupted or fails unexpectedly.
try:
    # Make the window larger
    driver.maximize_window()

    # Loop through each day (most recent first) and track with a progress bar.
    for _ in tqdm(range(date_delta.days+1), desc="Scraping Data...", ascii=False, ncols=75):
        # Move back one day
        curr_date = decrement_date(curr_date, 1)

        # Get the date string to be input into the box on the page
        curr_date_str = get_date_string(curr_date)

        # Loop through all dropdown values (MALE, FEMALE, UNKNOWN)
        for gender in gender_opt_list:
            # Try to gather the data. If any errors are thrown, see 'except' block below.
            try:
                # Open website
                driver.get(url)

                # Wait for 'n' seconds
                sleep(sleep_seconds)

                # Fill form with a one-day date range
                fill_date_form(driver, curr_date_str)

                # Select the gender option by its index in the dropdown
                select_gender_option(driver, gender_opt_list[gender])

                # Submit the form and load new page
                submit_form(driver)
                sleep(sleep_seconds)

                # get_num_rows returns the row count plus one
                rows = get_num_rows(driver)
                cols = get_num_columns(driver)

                # Check if there are thirty or more records on a given date. If so, flag it
                if rows - 1 >= 30:
                    thirty_or_more_record_dates.append(curr_date_str)

                # Gather the contents of table row-by-row.
                # NOTE: the loop variable must stay named `r`; get_row_contents
                # reads it as a global when no row index is passed.
                for r in range(1, rows):
                    # Get the contents of the row
                    row_contents = get_row_contents(driver, cols)

                    # Unpack before appending: a row with fewer than seven
                    # cells raises here, before any list has been touched,
                    # so the column lists can never get out of step.
                    l_name, f_name, age, dod, pod, plot, case_no = row_contents[:7]

                    # Append values to lists
                    f_names.append(f_name)
                    l_names.append(l_name)
                    ages.append(age)
                    sexes.append(gender)
                    date_of_deaths.append(dod)
                    place_of_deaths.append(pod)
                    plot_no.append(plot)
                    medical_examiner_nos.append(case_no)
                    urls.append(url)
                    date_scraped.append(datetime.now().strftime("%m/%d/%Y"))
                    jurisdiction.append("Hart Island")

                # Return to search
                driver.find_element(By.XPATH,"//*[@id='home_form:j_id_9b']").click()

            # Best effort: an error is taken to mean there were no records on
            # that date for that gender. Log the date and move on to the next query.
            except Exception:
                no_record_dates[gender].append(curr_date_str)
finally:
    # Close the ChromeDriver window, whether or not the run completed
    driver.quit()

# When the data is gathered, log message.
print("Done collecting data!")

# Stop timer
tock = datetime.now() - tick

Date From (older date, MM/DD/YYYY):  08/03/1978
Date To (most-recent date, MM/DD/YYYY):  03/02/1980


Scraping Data...: 100%|████████████████| 578/578 [2:57:16<00:00, 18.40s/it]


Done collecting data!


## 2. Output Data to CSV
----

In [14]:
# Create output dataframe with results and write it to a timestamped CSV file
from pathlib import Path

# Column order in the CSV follows the order of the keys below
data = pd.DataFrame({
    'FName': f_names,
    'LName': l_names,
    'Age': ages,
    'Sex': sexes,
    'DOD': date_of_deaths,
    'POD': place_of_deaths,
    'PlotNo': plot_no,
    'CaseNo': medical_examiner_nos,
    'Jurisdiction': jurisdiction,
    'DateScraped': date_scraped,
    'SourceURL': urls,
})

# The time part uses '-' rather than ':' because ':' is not allowed in
# Windows file names.
file_name = './data/scraped/DOC_' + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.csv'

# Create the output folder if it is missing, so a long scraping run is not
# lost to a "directory does not exist" error at the very last step.
Path(file_name).parent.mkdir(parents=True, exist_ok=True)

data.to_csv(file_name, index = False)

In [None]:
# Show the assembled DataFrame as this cell's output for a quick visual check
data