# Hart Island Web-scraper
----

**Author:** Simon Aytes

**Contact:** simon.aytes@lc.cuny.edu

**Version:** 1.0

**Date:** August 21, 2022

## 0. Environment Configuration
----

### 0.1 Import Libraries

In [5]:
import time
from datetime import datetime, timedelta
from pathlib import Path
from random import randint
from time import sleep

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

### 0.2 Utility Functions

In [6]:
def check_exists_by_xpath(xpath, driver=None):
    """Return True if at least one element matching ``xpath`` is on the page.

    Parameters
    ----------
    xpath : str
        XPath expression to look up.
    driver : selenium WebDriver, optional
        Browser session to query. When omitted, the notebook-level ``driver``
        variable is used.

    Returns
    -------
    bool
        True when one or more elements match, False otherwise.

    Raises
    ------
    RuntimeError
        If no ``driver`` is passed and no notebook-level ``driver`` exists.
    """
    if driver is None:
        # Element lookups must go through a live browser session; the
        # ``selenium.webdriver`` module itself has no lookup methods.
        driver = globals().get("driver")
        if driver is None:
            raise RuntimeError(
                "check_exists_by_xpath needs a WebDriver: pass driver=... "
                "or create the notebook-level `driver` first."
            )

    # find_elements returns an empty list (instead of raising) when nothing
    # matches, so no exception handling is required here.
    return len(driver.find_elements(By.XPATH, xpath)) > 0

In [11]:
def get_date_obj(user_date):
    """Parse a ``MM/DD/YYYY`` string into a ``datetime`` (time set to midnight).

    Parameters
    ----------
    user_date : str
        Date text such as ``"05/10/2020"``. Surrounding whitespace is ignored
        and single-digit months/days (``"5/1/2020"``) are accepted.

    Returns
    -------
    datetime.datetime

    Raises
    ------
    ValueError
        If the text is not a valid ``MM/DD/YYYY`` date. A four-digit year is
        required, so ``"05/13/20"`` is rejected rather than being read as the
        year 20.
    """
    return datetime.strptime(user_date.strip(), "%m/%d/%Y")

## 1. Scrape Data
----

In [44]:
# Features (one list per output column; all lists are appended to together,
# one entry per scraped record, so they always have the same length)
f_names = []
l_names = []
ages = []
date_of_deaths = []
place_of_deaths = []
plot_no = []
medical_examiner_nos = []
urls = []
date_scraped = []
jurisdiction = []

# Dates for which no records were collected (empty result or scrape failure)
no_record_dates = []

# Dates with 30 or more records
thirty_or_more_record_dates = []

# Get input for from- and to- dates
from_date = get_date_obj(input("Date From (older date, MM/DD/YYYY): "))
to_date = get_date_obj(input("Date To (most-recent date, MM/DD/YYYY): "))

# Number of days in the inclusive range [from_date, to_date]
num_days = (to_date - from_date).days + 1
if num_days <= 0:
    print("Nothing to scrape: the 'from' date is after the 'to' date.")

print("Scraping data...")

# Establish the website's address
url = "https://a073-hartisland-web.nyc.gov/hartisland/pages/home/home.jsf"

# Locator for the result rows, and the number of cells a data row must have
# (last name, first name, age, date of death, place of death, plot, ME number)
result_rows_xpath = "//*[@id='search_result_table']/tbody/tr"
expected_cols = 7

# Create the webdriver
driver = webdriver.Chrome(service=Service("./chromedriver/chromedriver"))

try:
    driver.maximize_window() # Make the window larger

    # Walk backwards one day at a time, from to_date down to from_date inclusive
    for day_offset in range(max(num_days, 0)):
        curr_date = to_date - timedelta(days=day_offset)

        # Get the date string to be input into the box on the page
        curr_date_str = curr_date.strftime("%m/%d/%Y")

        print(f"\t\t> {curr_date_str}")

        # Rows are staged here and only committed once the whole date has been
        # read, so a failure part-way through cannot leave partial data behind.
        day_records = []

        try:
            # Open website
            driver.get(url)

            # Wait for 4 seconds
            sleep(4)

            # Fill form with date range (same date in both boxes = single day)
            dateDeathFrom = driver.find_element(By.XPATH, "//*[@id='home_form:date_death_from_input']")
            dateDeathFrom.clear()
            dateDeathFrom.send_keys(curr_date_str)
            dateDeathTo = driver.find_element(By.XPATH, "//*[@id='home_form:date_death_to_input']")
            dateDeathTo.clear()
            dateDeathTo.send_keys(curr_date_str)

            # Submit the form and load new page
            driver.find_element(By.XPATH, "//*[@id='home_form:search_lk']").click()
            sleep(3)

            # Gather the contents of the table row-by-row
            for table_row in driver.find_elements(By.XPATH, result_rows_xpath):
                row_contents = [cell.text for cell in table_row.find_elements(By.XPATH, "./td")]
                # Rows with too few cells (e.g. a placeholder/message row)
                # are not records and are skipped.
                if len(row_contents) >= expected_cols:
                    day_records.append(row_contents)

            # Return to search. Best-effort only: the next iteration reloads the
            # search page with driver.get(url), so a failure here must not
            # discard the records that were just read.
            try:
                driver.find_element(By.XPATH, "//*[@id='home_form:j_id_9b']").click()
            except Exception:
                pass
        except Exception as e:
            # Keep going with the next date, but report what went wrong instead
            # of silently treating every failure as "no records".
            print(f"\t\t\t! Could not scrape {curr_date_str}: {type(e).__name__}: {e}")
            no_record_dates.append(curr_date_str)
            continue

        if not day_records:
            no_record_dates.append(curr_date_str)
            continue

        # Check if there are thirty or more records on a given date. If so, flag it
        if len(day_records) >= 30:
            thirty_or_more_record_dates.append(curr_date_str)

        # Append values to lists
        for row_contents in day_records:
            f_names.append(row_contents[1])
            l_names.append(row_contents[0])
            ages.append(row_contents[2])
            date_of_deaths.append(row_contents[3])
            place_of_deaths.append(row_contents[4])
            plot_no.append(row_contents[5])
            medical_examiner_nos.append(row_contents[6])
            urls.append(url)
            date_scraped.append(datetime.now().strftime("%m/%d/%Y"))
            jurisdiction.append("Hart Island")
finally:
    driver.quit() # Close the ChromeDriver window, even if the run is interrupted

print("Done collecting data!")

Date From (older date, MM/DD/YYYY):  05/10/2020
Date To (most-recent date, MM/DD/YYYY):  06/10/2020


Scraping data...
	> Loop started...
		> 06/10/2020
	> Loop started...
		> 06/09/2020
	> Loop started...
		> 06/08/2020
	> Loop started...
		> 06/07/2020
	> Loop started...
		> 06/06/2020
	> Loop started...
		> 06/05/2020
	> Loop started...
		> 06/04/2020
	> Loop started...
		> 06/03/2020
	> Loop started...
		> 06/02/2020
	> Loop started...
		> 06/01/2020
	> Loop started...
		> 05/31/2020
	> Loop started...
		> 05/30/2020
	> Loop started...
		> 05/29/2020
	> Loop started...
		> 05/28/2020
	> Loop started...
		> 05/27/2020
	> Loop started...
		> 05/26/2020
	> Loop started...
		> 05/25/2020
	> Loop started...
		> 05/24/2020
	> Loop started...
		> 05/23/2020
	> Loop started...
		> 05/22/2020
	> Loop started...
		> 05/21/2020
	> Loop started...
		> 05/20/2020
	> Loop started...
		> 05/19/2020
	> Loop started...
		> 05/18/2020
	> Loop started...
		> 05/17/2020
	> Loop started...
		> 05/16/2020
	> Loop started...
		> 05/15/2020
	> Loop started...
		> 05/14/2020
	> Loop started...
		> 05/13/20

## 2. Output Data to CSV
----

In [47]:
# Create output dataframe with results (one column per scraped feature list)
data = pd.DataFrame({
    'FName': f_names,
    'LName': l_names,
    'Age': ages,
    'DOD': date_of_deaths,
    'POD': place_of_deaths,
    'PlotNo': plot_no,
    'CaseNo': medical_examiner_nos,
    'Jurisdiction': jurisdiction,
    'DateScraped': date_scraped,
    'SourceURL': urls,
})

# Make sure the output folder exists so a long scrape is not lost at the very
# last step because of a missing directory.
output_dir = Path('./data')
output_dir.mkdir(parents=True, exist_ok=True)

# Time-stamp the file name; '-' is used between the time fields because ':'
# is not allowed in file names on Windows.
file_name = str(output_dir / ('DOC_' + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.csv'))
data.to_csv(file_name)