# Final Draft Scraper Team 2

## Setup
<ol>
    <li>Import necessary packages</li>
    <li>Set up and initialize Chrome Webdriver (You do have to manually install your own chrome webdriver)</li>
    <li>Query ProxyScrape API for proxy ips</li>
    <li>Rotate through to find a viable IP and open masscourts.org</li>
</ol>

In [None]:
from selenium import webdriver
import time
import random
import numpy as np
import pandas as pd
from PIL import Image
import io
from selenium.common.exceptions import WebDriverException, TimeoutException, StaleElementReferenceException, NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.by import By
import requests
import deathbycaptcha
import sys
import datetime
import re

In [None]:
# ProxyScrape returns a plain-text list of currently active proxies, one per line.
# The query asks for U.S. HTTP proxies and passes timeout=1000 to the API, for speed.
proxy_url = "https://api.proxyscrape.com/v2/?request=getproxies&protocol=http&timeout=1000&country=US&ssl=yes&anonymity=all&simplified=true"
r = requests.get(proxy_url, allow_redirects=True, timeout=30)
r.raise_for_status()
# Split the decoded body on line endings instead of slicing the repr of the raw bytes,
# and drop blank lines so an empty entry is never picked as a proxy.
proxies = [line.strip() for line in r.text.splitlines() if line.strip()]
print(len(proxies))
if not proxies:
    raise RuntimeError("ProxyScrape returned no proxies; cannot choose one for the webdriver")
err = True

# Try random proxies until one loads the page within the 10 second page-load timeout without raising.
# This is done to ensure the proxy we use is reasonably fast and stable.
while err:
    PROXY = random.choice(proxies)
    print(PROXY)

    options = webdriver.ChromeOptions()
    options.add_argument('--proxy-server=%s' % PROXY)
    options.add_argument('--enable-automation')
    options.add_argument("--dns-prefetch-disable")
    driver = webdriver.Chrome(options=options)
    driver.set_window_size(1920, 1080)
    driver.set_page_load_timeout(10)

    try:
        driver.get('https://www.masscourts.org/')
        err = False
    except (WebDriverException, TimeoutException):
        # Close the browser of the rejected proxy; otherwise every failed attempt
        # leaves a Chrome process running.
        driver.quit()
        continue


## Go through Captcha
This function solves the CAPTCHA through the Death By Captcha service. For now, do not run it; solve the CAPTCHA manually instead. We still have to check whether the wrong-answer case works.

In [None]:
def do_dbc(driver):
    """
    Solve the CAPTCHA shown in the driver using the Death By Captcha (DBC) service

    The DBC username and password are read from the first two lines of 'DBCLogin.txt'
    (MUST BE MANUALLY MADE). Each attempt saves a screenshot of the CAPTCHA to
    'captchaImg.png', sends it to DBC, types the answer and clicks the button. The attempt
    counts as solved once the court department selector becomes visible; otherwise the
    answer is reported to DBC as wrong and a new attempt is made.

    :param driver: Chrome Webdriver showing the CAPTCHA page
    :return: None (exits the program if DBC denies access)
    """
    # Wait for page --- Add explicit wait here
    time.sleep(3)

    # Get Death By Captcha username and password from text file (MUST BE MANUALLY MADE)
    with open('DBCLogin.txt') as f:
        login = [i.strip() for i in f.readlines()]
    username = login[0]
    password = login[1]

    solved = False
    while not solved:
        # Take Picture of Captcha and save it as "captchaImg.png".
        # Done on every attempt so a retry never re-submits an image from an earlier attempt.
        # NOTE(review): assumes the page may show a different CAPTCHA after a wrong answer - confirm.
        img = driver.find_element(By.CLASS_NAME, "captchaImg").screenshot_as_png
        img = Image.open(io.BytesIO(img))
        img.save("captchaImg.png")

        # Use deathbycaptcha.HttpClient for HTTP API.
        client = deathbycaptcha.SocketClient(username, password)
        # Stays None until the input box is located, so the timeout handler below
        # can tell whether there is anything to clear.
        captcha_input = None
        try:
            # Result is not needed; the call is kept from the original flow.
            client.get_balance()

            # CAPTCHA file name and solving timeout (in seconds)
            captcha = client.decode("captchaImg.png", 60)
            if captcha:
                # The CAPTCHA was solved; captcha["captcha"] item holds its
                # numeric ID, and captcha["text"] item its text.
                print ("CAPTCHA %s solved: %s" % (captcha["captcha"], captcha["text"]))

                try:
                    # Input Captcha Solution
                    captcha_input = WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME, 'captchaTxt')))
                    captcha_input.send_keys(captcha['text'])

                    # Press Click Here Button
                    WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.CLASS_NAME,
                    "anchorButton"))).click()

                    # If the court department selector is not visible after 15 seconds, report wrong
                    WebDriverWait(driver, 15).until(EC.visibility_of_element_located((By.NAME, "sdeptCd")))
                    solved = True

                except TimeoutException:
                    if captcha_input is not None:
                        captcha_input.clear()
                    client.report(captcha["captcha"])
                    print('Wrong Answer Reported, Trying again')

        except deathbycaptcha.AccessDeniedException:
            # Access to DBC API denied, check your credentials and/or balance
            sys.exit('Access to DBC API denied, check your credentials and/or balance')



## Helpful Functions

In [None]:
def sleep_rand():
    """
    Pause execution for a random duration of at least 1 and less than 3 seconds

    :return: None
    """
    whole_seconds = random.randint(1, 2)
    fraction = random.random()
    time.sleep(whole_seconds + fraction)

In [None]:
def ChooseCourtDivision(driver, court_division: str):
    """
    Pick 'Housing Court', the given court division, and the page-size option "2" on the search form

    :param driver: Chrome Webdriver
    :param court_division: option value of the desired court division in the division dropdown
    :return: None
    """
    def _dropdown(field_name, timeout):
        # Wait until the named <select> is present and wrap it for option selection
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.NAME, field_name)))
        return Select(element)

    # Court department
    _dropdown("sdeptCd", 15).select_by_visible_text('Housing Court')

    # Short pause before the division dropdown is read
    time.sleep(0.5)
    _dropdown("sdivCd", 20).select_by_value(court_division)

    # Number of results per page (value "2" is the 75-results option)
    _dropdown("pageSize", 15).select_by_value("2")

In [None]:
def type_dates(driver, startdate, enddate=None):
    """
    Type the search date range into the two date inputs

    :param driver: Chrome Webdriver
    :param startdate: text typed into the begin-date input
    :param enddate: text typed into the second date input; when omitted, startdate is typed there too
    :return: None
    """
    begin_field = WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.XPATH, '//input[@data="dateInputBegin0"]')))
    begin_field.send_keys(startdate)

    # A single-day search uses the same date for both inputs
    end_text = startdate if enddate is None else enddate
    end_field = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//div[@id='caseTypeSearchFieldset']/div[2]/input")))
    end_field.send_keys(end_text)

# Clear the typed dates
def clear_dates(driver):
    """
    Empty both date inputs of the search form

    :param driver: Chrome Webdriver
    :return: None
    """
    begin_locator = (By.XPATH, '//input[@data="dateInputBegin0"]')
    end_locator = (By.XPATH, "//div[@id='caseTypeSearchFieldset']/div[2]/input")

    begin_field = WebDriverWait(driver, 15).until(EC.presence_of_element_located(begin_locator))
    begin_field.clear()

    end_field = WebDriverWait(driver, 15).until(EC.element_to_be_clickable(end_locator))
    end_field.clear()


# Select the Summary Process Category
def select_Summary_Process_and_Plantiff(driver, selection):
    """
    Choose the given case type and the plaintiff party type on the search form

    :param driver: Chrome Webdriver
    :param selection: option value to select in the case type dropdown
    :return: None
    """
    # Case type dropdown
    case_type_element = WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.NAME, "caseCd")))
    Select(case_type_element).select_by_value(selection)

    # Pause before touching the party type list
    time.sleep(2)

    # Party type list: select the plaintiff entry, then deselect the ' ' entry
    party_type_element = WebDriverWait(driver, 15).until(
        EC.element_to_be_clickable((By.NAME, "ptyCd")))
    party_types = Select(party_type_element)
    party_types.select_by_value('PLNTF                         ')
    party_types.deselect_by_value(' ')


## Extraction Helper Functions

In [None]:
def getEventFields():
    """
    Grabs fields to represent the event section of the case document

    Reads from the module-level `driver`, which must be showing a case document.

    :return: list of nine fields: event dates, sessions, locations and types (each a string
        with one entry per event separated by '||'), the type and date of the last listed
        event, and booleans for whether a status conference, mediation or trial event
        occurred. The six text fields are 'N/A' when no events are listed.
    """
    # (4)
    # Grab the elements for each listed event
    try:
        # find_element raises when the event section is absent
        driver.find_element(By.ID, "eventInfo")
        events = driver.find_elements(By.XPATH, "//div[@id='eventInfo']/table/tbody/tr")
    except NoSuchElementException:
        events = []

    if not events:
        return ['N/A'] * 6 + [False, False, False]

    # Collect each column of the events table, one entry per row
    event_dates = []
    event_sessions = []
    event_locations = []
    event_types = []
    for event in events:
        event_dates.append(event.find_element(By.XPATH, "./td[1]/span").text)
        event_sessions.append(event.find_element(By.XPATH, "./td[2]/span").text)
        event_locations.append(event.find_element(By.XPATH, "./td[4]/span").text)
        event_types.append(event.find_element(By.XPATH, "./td[5]/span").text)

    # The "next" event is the last row of the table
    next_event_type = event_types[-1]
    next_event_date = event_dates[-1]

    status_conference_types = ("Housing Specialist Status Conference", "Status Hearing")
    had_status_conference = any(event_type in status_conference_types for event_type in event_types)
    had_mediation = "Housing Specialist Mediation" in event_types
    had_trial = "Summary Process Trial" in event_types

    return ['||'.join(event_dates), '||'.join(event_sessions), '||'.join(event_locations), '||'.join(event_types),
            next_event_type, next_event_date, had_status_conference, had_mediation, had_trial]

def getAttorneyFields(party):
    """
    Read the details of the first attorney listed for a party

    :param party: Web element for a party in the case, or None when there is no such party
    :return: list of attorney name, code, address and phone number; every entry is 'N/A'
        when party is None or an expected element is missing
    """
    if party is None:
        return ['N/A', 'N/A', 'N/A', 'N/A']

    try:
        attorney_box = party.find_element(By.XPATH, "./div[@class='box ptyAtty']/div/div/div")
        name = attorney_box.find_element(By.XPATH, "./ul[1]/li[2]").text
        code = attorney_box.find_element(By.XPATH, "./ul[2]/li[2]").text

        # The address is spread over child elements; skip empty pieces and lone commas
        address_parts = []
        for piece in attorney_box.find_elements(By.XPATH, "./ul[3]/li[2]/*"):
            if piece.text != ',' and piece.text != '':
                address_parts.append(piece.text)
        address = ' '.join(address_parts)

        phone = attorney_box.find_element(By.XPATH, "./ul[4]/li[2]").text
    except NoSuchElementException:
        return ['N/A', 'N/A', 'N/A', 'N/A']

    return [name, code, address, phone]

def getPartyFields():
    """
    Grabs and encodes the list of plaintiff and defendant names,
    as well as the fields for the first attorney representing each

    Reads from the module-level `driver`, which must be showing a case document.

    :return: list of ten fields: plaintiff names and defendant names (each separated with '||'),
        then the four attorney fields of the first plaintiff and the four attorney fields of
        the first defendant
    """
    # (6)
    try:
        parties = driver.find_elements(By.XPATH, "//div[@id='ptyContainer']/div")
    except NoSuchElementException:
        # Return here: continuing would use `parties` before it is assigned.
        return ['N/A', 'N/A'] + getAttorneyFields(None) + getAttorneyFields(None)

    party_names = [party.find_element(By.XPATH, './div[1]/h5/div[1]').text for party in parties]
    party_types = [party.find_element(By.XPATH, './div[1]/h5/div[2]').text for party in parties]

    # Find the index in the party list where it switches from plaintiffs to defendants so we know where to split
    # the names list into plaintiff and defendant fields
    try:
        split = party_types.index('- Defendant')
    except ValueError:
        split = len(parties)

    plaintiffs = '||'.join(party_names[:split])
    defendants = '||'.join(party_names[split:])

    # This assumes all plaintiffs and defendants have the same attorney
    first_plaintiff = None if plaintiffs == '' else parties[0]
    plaintiff_attorney_fields = getAttorneyFields(first_plaintiff)

    first_defendant = None if defendants == '' else parties[split]
    defendant_attorney_fields = getAttorneyFields(first_defendant)

    return [plaintiffs, defendants] + plaintiff_attorney_fields + defendant_attorney_fields

## Main Extraction Function from Document
Go through each Case, Extract:
<ol>
    <li>Case number</li>
    <li>Case Type and Status</li>
    <li>Filing Date and the Filing Type</li>
    <li>Event information</li>
    <li>Address Information of Plaintiff</li>
    <li>Plaintiff and Defendant names, and Attorney information</li>
    <li>Result Type and Date</li>
    <li>Full Docket text</li>
    <li>Judge name if case went to court</li>
    <li>Amount owed if court ruled for the landlord</li>
    <li>Reason for court dismissal, if applicable</li>
    <li>Date of defendant answer, if applicable</li>
</ol>

In [None]:
def _docket_judge(docket_text):
    """Return the text after the last 'Presiding: ' marker up to the end of its line, or 'N/A' if absent."""
    marker = "Presiding: "
    marker_pos = docket_text.rfind(marker)
    if marker_pos == -1:
        return 'N/A'
    start = marker_pos + len(marker)
    end = docket_text.find("\n", start)
    if end == -1:
        end = len(docket_text)
    # NOTE(review): the original skipped one extra character after the marker; kept as-is - confirm.
    return docket_text[start+1:end]


def _docket_amount(docket_text):
    """Return the last dollar amount (with cents) found in the docket text as a float, or 'N/A' if none."""
    money_pattern = r"\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(\.[0-9][0-9])"
    last_match = None
    for last_match in re.finditer(money_pattern, docket_text):
        pass
    if last_match is None:
        return 'N/A'
    # Build the number from the digit groups only, so an optional leading '$' never reaches float()
    return float((last_match.group(1) + last_match.group(3)).replace(",", ""))


def _docket_dismissal_reason(docket_text):
    """Return the text after the last dismissal-reason marker up to the next '||', or 'N/A' if absent."""
    marker = "the above entitled matter is dismissed for the following reason(s): * "
    marker_pos = docket_text.rfind(marker)
    if marker_pos == -1:
        return 'N/A'
    start = marker_pos + len(marker)
    end = docket_text.find("||", start)
    if end == -1:
        end = len(docket_text)
    return docket_text[start:end]


def _docket_answer_date(docket_text):
    """Return the text that precedes the first 'Answer and Counterclaim of' within its docket row, or 'N/A'."""
    entry_pos = docket_text.find("Answer and Counterclaim of")
    if entry_pos == -1:
        return 'N/A'
    separator_pos = docket_text.rfind('||', 0, entry_pos)
    # When the entry is in the first docket row there is no '||' before it: start at the beginning
    start = 0 if separator_pos == -1 else separator_pos + 2
    # Drop the '_' that joins the two columns of a docket row
    end = max(entry_pos - 1, start)
    return docket_text[start:end]


def extract_data_from_doc(driver, court_department, court_division):
    """
    Extract Desired Information from Case Document

    :param driver: Chrome Webdriver showing a case document
    :param court_department: court department name, copied into the output row
    :param court_division: court division name, copied into the output row
    :return: List of all fields: case number, department, division, case type, case status,
        file date, file type, address, result type, result date, the party fields, the event
        fields, judge, amount, case dismissal reason, answer date and repr() of the docket text
    """
    # (1)
    case = WebDriverWait(driver, 15).until(EC.visibility_of_element_located((By.XPATH, "//div[@id='titleBar']/h1/ul/li"))).text
    # The case number is everything before the first space (the whole title if there is no space)
    case_num = case.partition(' ')[0]

    caseHeader = driver.find_element(By.ID, "caseHeader")

    # (2)
    case_type = caseHeader.find_element(By.XPATH, "./div[1]/div[1]/ul/li[2]").text
    case_status = caseHeader.find_element(By.XPATH, "./div[1]/div[2]/ul/li[2]").text

    # (3)
    file_date = caseHeader.find_element(By.XPATH, "./div[1]/div[3]/ul/li[2]").text
    file_type = caseHeader.find_element(By.XPATH, "./div[2]/div[1]/ul/li[2]").text

    # (4)
    event_fields = getEventFields()

    # (5)
    try:
        addressInfo = driver.find_element(By.ID, 'addressInfo')
        address = ' '.join([elem.text for elem in addressInfo.find_elements(By.XPATH, './div/span') if elem.text != ''])
    except NoSuchElementException:
        address = 'N/A'

    # (6)
    party_fields = getPartyFields()

    # (7)
    disposition = driver.find_element(By.ID, 'dispositionInfo')
    result = disposition.find_element(By.XPATH, './table/tbody/tr')
    result_type = result.find_element(By.XPATH, './td[1]').text
    result_date = result.find_element(By.XPATH, './td[2]').text if result_type != "Pending" else "N/A"

    # (8)
    # Each docket row becomes "<column 1>_<column 2>"; rows are joined with '||'
    docket_table = driver.find_element(By.XPATH, "//div[@id='docketInfo']/table")
    docket_text = "||".join(
        table_row.find_element(By.XPATH, "./td[1]").text + "_" + table_row.find_element(By.XPATH, "./td[2]").text
        for table_row in docket_table.find_elements(By.XPATH, './tbody/tr'))

    judgment_results = ('Judgment in SP by Default', 'Judgment in SP case entered')

    # (9)
    judge = 'N/A'
    if result_type in judgment_results or case_status == 'Disposed for Statistical Purposes':
        judge = _docket_judge(docket_text)

    # (10)
    amount = 'N/A'
    if result_type in judgment_results:
        amount = _docket_amount(docket_text)

    # (11)
    case_dismissal_reason = 'N/A'
    if result_type == 'Dismissal':
        case_dismissal_reason = _docket_dismissal_reason(docket_text)

    # (12)
    answer_date = _docket_answer_date(docket_text)

    # repr() keeps the docket on one line by escaping its newlines
    docket_text = repr(docket_text)

    return [case_num, court_department, court_division, case_type, case_status, file_date, file_type,
            address, result_type, result_date] + party_fields + event_fields + [
            judge, amount, case_dismissal_reason, answer_date, docket_text]

## Site Navigation Helper Functions

In [None]:
def get_date_list(start_datetime, end_datetime=None):
    """
    Get a list of dates to input from a start date to end date (end date default to current date)

    Dates are formatted as month/day/year; a leading zero of the day is removed while the
    month keeps its leading zero (e.g. '02/1/2020').

    :param start_datetime: python date object to be start of search
    :param end_datetime: python date object to be end of search (inclusive); None means today,
        determined at call time
    :return: numpy array of date strings that can be inputted into date fields
    """
    # Resolve "today" on every call; a default evaluated in the signature is fixed when the
    # function is defined and goes stale in a long-running session.
    if end_datetime is None:
        end_datetime = datetime.datetime.today().date()

    delta = datetime.timedelta(days=1)
    dates = []
    current = start_datetime
    while current <= end_datetime:
        dates.append(current.strftime('%m/%d/%Y').replace("/0", "/"))
        current += delta
    return np.array(dates)
    
# Get each Case from page
def getCasesList(driver):
    """
    Collect the case numbers and document links shown on the current results page

    :param driver: Chrome WebDriver
    :return: tuple of (list of case numbers, list of links to case documents) for this results page
    """
    table_locator = (By.XPATH, "//table[@class='tableResults']/tbody")
    table_body = WebDriverWait(driver, 15).until(EC.visibility_of_element_located(table_locator))

    # The third cell of every row holds the link to the case document
    anchors = table_body.find_elements(By.XPATH, "./tr/td[3]/span/a")

    case_nums = []
    for anchor in anchors:
        label = anchor.find_element(By.XPATH, "./span")
        case_nums.append(label.text)

    links = []
    for anchor in anchors:
        links.append(anchor.get_attribute('href'))

    return case_nums, links

def get_case_links(driver):
    """
    Get links to all case results for this query, iterating through result pages.

    :param driver: Chrome WebDriver showing the search results
    :return: List of all links to case documents for the query
    :raises ValueError: if the result notice does not start with "Displaying all" (not every
        match is displayed) or contains no match count
    """
    results_text = WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.ID, "srchResultNotice"))).text

    if not results_text.startswith("Displaying all"):
        # Report the notice itself: the query parameters are not available inside this function
        raise ValueError("Not all matches displayed for this query; result notice was: %r" % results_text)

    count_match = re.search(r"\d+", results_text)
    if count_match is None:
        raise ValueError("Could not read the number of matches from result notice: %r" % results_text)
    num_results = int(count_match.group())

    _, links = getCasesList(driver)

    # A page shows at most 75 results, so anything beyond that is on the second page
    if num_results > 75:
        # Go to second page
        WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH,
        "//table[@class='tableResults']/tfoot/tr/td/div[@class='navigator']/span/a[1]"))).click()

        _, second_page_links = getCasesList(driver)
        links += second_page_links
        driver.back()
    return links

In [None]:
def save_dataset(dataset, startDate, endDate, court=''):
    """
    Save the extracted rows as a csv file in the ./data directory

    The file is named '<court><start>-<end>.csv' with dates formatted as month.day.year.

    :param dataset: list of rows as returned by extract_data_from_doc (34 fields each)
    :param startDate: python date object for the start of the scraped range
    :param endDate: python date object for the end of the scraped range
    :param court: prefix for the file name, e.g. 'H' or 'D'; defaults to no prefix so calls
        that omit it still save
    :return: None
    """
    import os

    columns = ["caseNum", "courtDepartment", "courtDivision", "caseType", "caseStatus", "fileDate", "fileType",
               "address", "result", "resultDate", "plaintiffs", "defendants",
               "plaintiffAttorney", "plaintiffAttorneyCode", "plaintiffAttorneyAddress", "plaintiffAttorneyPhone",
               "defendantAttorney", "defendantAttorneyCode", "defendantAttorneyAddress", "defendantAttorneyPhone",
               "eventDates", "eventSessions", "eventLocations", "eventTypes",
               "nextEventType", "nextEventDate", "hadStatusConference", "hadMediation", "hadTrial",
               "judge", "amount", "caseDismissalReason", "answerDate", "docketText"]

    startDateString = startDate.strftime('%m.%d.%Y')
    endDateString = endDate.strftime('%m.%d.%Y')

    # Make sure the output directory exists so scraped data is not lost to a missing folder
    os.makedirs('./data', exist_ok=True)
    csvString = './data/' + court + startDateString + '-' + endDateString + '.csv'
    pd.DataFrame(dataset, columns=columns).to_csv(csvString)
    

# MAIN RUNNER

In [None]:
# Run death_by_captcha
#do_dbc(driver)

# Option values of the Housing Court divisions and, in the same order, their display names
court_Divisions = ['HCWOR_DIV ', 'HCBOS_DIV ', 'HCMETROS_D', 'NEHC      ', 'SEHC_DIV  ', 'HCSPR_DIV ']
division_Names = ['Central Housing', 'Eastern Housing', 'Metro South Housing', 'Northeast Housing', 'Southeast Housing', 'Western Housing']
startDate = datetime.date(2020, 1, 13)
endDate = datetime.date(2020, 1, 13)
dates = get_date_list(startDate, endDate)
dataset = []
# Link in the tab section that is clicked before every query
search_tab_xpath = "//div[@class='tabSection']/div/div[1]/ul/li[2]/a"

# Go through Each Court Division
for i, court_D in enumerate(court_Divisions):
    driver.refresh()
    # Choose Court Division and 75 elements per page
    try:
        ChooseCourtDivision(driver, court_D)
        sleep_rand()
        WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, search_tab_xpath))).click()
    except WebDriverException:
        # Recovery: click the first link on the page, refresh, pass the start page button, then retry
        WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, "//a"))).click()
        for x in range(5):
            driver.refresh()
        time.sleep(5)
        WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.CLASS_NAME, "anchorButton"))).click()
        ChooseCourtDivision(driver, court_D)
        sleep_rand()
        WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, search_tab_xpath))).click()

    # Go Through each Date
    for date in dates:
        try:
            ChooseCourtDivision(driver, court_D)
            sleep_rand()
            try:
                WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, search_tab_xpath))).click()
            except TimeoutException:
                driver.refresh()
                time.sleep(5)
                WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, search_tab_xpath))).click()
            # Type input dates
            type_dates(driver, date)
            # Select Summary Process
            select_Summary_Process_and_Plantiff(driver, "SP                            ")

        except (TimeoutException, StaleElementReferenceException, WebDriverException):
            # Kicked to main begin page, so click button
            try:
                WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.CLASS_NAME,
                    "anchorButton"))).click()
            except WebDriverException:
                # Best effort: the button may not be there, continue with the retry below
                pass
            ChooseCourtDivision(driver, court_D)
            sleep_rand()
            WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, search_tab_xpath))).click()
            # Type input dates
            type_dates(driver, date)
            # Select Summary Process
            select_Summary_Process_and_Plantiff(driver, "SP                            ")

        # Submit Link
        try:
            WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.NAME,
            "submitLink"))).click()
        except TimeoutException:
            # WebDriverWait raises Selenium's TimeoutException, not the builtin TimeoutError
            driver.refresh()
            time.sleep(5)
            WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.NAME,
            "submitLink"))).click()
        # Wait a bit to slow down requests per time
        sleep_rand()

        # CURRENTLY IN CASE NUMBER TABLE or NO MATCHES
        # No matches Case: the notice element exists and there is nothing to extract
        try:
            driver.find_element(By.ID, 'srchResultNoticeNomatch')
        # Matches: Extract Info
        except NoSuchElementException:
            case_links = get_case_links(driver)
            for case_link in case_links:
                try:
                    driver.get(case_link)
                except WebDriverException:
                    print('Error, got until: ', date)
                    # Save what was collected so far. EndDate is wrong so can be manually changed
                    save_dataset(dataset, startDate, datetime.date(2030, 1, 1), court='H')
                    driver.refresh()
                    time.sleep(5)
                    driver.get(case_link)

                # First Check if there is a defendant before extracting
                if 'Defendant' in driver.page_source:
                    dataset.append(extract_data_from_doc(driver, 'HousingCourt', division_Names[i]))
                #driver.back() # Exit Document

        # Go back to select dates page; loading it again gives the next date a fresh form
        try:
            driver.get('https://www.masscourts.org/eservices/search.page.210?x=')
        except TimeoutException:
            print('Error, got until: ', court_D)
            # Save what was collected so far. EndDate is wrong so can be manually changed
            save_dataset(dataset, startDate, datetime.date(2030, 1, 1), court='H')
            driver.refresh()
            time.sleep(5)
            driver.get('https://www.masscourts.org/eservices/search.page.210?x=')


save_dataset(dataset, startDate, endDate, 'H')

# District Court Side
Follow similar process, except we must find the result status in the case document if the case was disposed internally.

In [None]:
def ChooseCourtDivisionDistrict(driver, court_division: str):
    """
    Pick 'District Court', the given court division, and the page-size option "2" on the search form

    :param driver: Chrome Webdriver
    :param court_division: option value of the desired court division in the division dropdown
    :return: None
    """
    def _dropdown(field_name, timeout):
        # Wait until the named <select> is present and wrap it for option selection
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.NAME, field_name)))
        return Select(element)

    # Court department: District Court
    _dropdown("sdeptCd", 15).select_by_visible_text('District Court')

    # Short pause before the division dropdown is read
    time.sleep(0.5)
    _dropdown("sdivCd", 20).select_by_value(court_division)

    # Number of results per page (value "2" is the 75-results option)
    _dropdown("pageSize", 15).select_by_value("2")

In [None]:
def find_status(text: str):
    """
    Derive a result status from the docket text of a case

    The phrase groups are checked in order and the status of the first group with a phrase
    contained in the text is returned.

    :param text: docket text of the case
    :return: status label, or "Undetermined" when no known phrase occurs in the text
    """
    plaintiff_judgment = "Judgment for Plaintiff for Possession and Rent    , "
    defendant_judgment = "Judgment for Defendant for Possession and Rent    , "

    # (phrases, status) pairs in priority order
    status_phrases = [
        (("Judgment of dismissal after bench trial",), "JudgmentDismissalTrial"),
        ((plaintiff_judgment + "after trial by a judge",), "JudgmentPlaintiffTrial"),
        (("Judgment of Dismissal per Plaintiff's request",), "JudgmentDismissalPlaintiffRequest"),
        ((plaintiff_judgment + "by agreement of the parties",), "JudgmentPlaintiffAgreement"),
        ((plaintiff_judgment + "after default",
          plaintiff_judgment + "after defendant(s) failed to appear"), "JudgmentPlaintiffDefendantNoAppear"),
        ((defendant_judgment + "after plaintiff(s) failed to appear",), "JudgmentDefendantPlaintiffNoAppear"),
        ((defendant_judgment + "after trial by a judge",), "JudgmentDefendantTrial"),
    ]

    for phrases, status in status_phrases:
        # Every phrase is tested against the text itself; a bare non-empty string is always truthy
        if any(phrase in text for phrase in phrases):
            return status
    return "Undetermined"

## District Court Main Runner

In [None]:
# Display names of the District Court divisions and, in the same order, their option values
court_D_names = ["Springfield", "Brockton", "New Bedford", "Worcester", "Lowell"]
court_Divisions = ['DC59_DIV  ', 'DC61_DIV  ', 'DC41_DIV  ', 'DC60_DIV  ', 'DC16_DIV  ']
startDate = datetime.date(2020, 1, 1)
endDate = datetime.date(2020, 1, 31)
dates_range = get_date_list(startDate, endDate)

# Split the range into consecutive (first day, last day) windows of at most 29 days each.
# The next window starts on the day after the previous one ends, so no date is skipped.
dates = []
i = 0
while i < len(dates_range):
    window_end = min(i + 28, len(dates_range) - 1)
    dates.append((dates_range[i], dates_range[window_end]))
    i = window_end + 1

dataset = []

# Link in the tab section that is clicked before every query
search_tab_xpath = "//div[@class='tabSection']/div/div[1]/ul/li[2]/a"

# Go through Each Court Division
for court_name, court_D in zip(court_D_names, court_Divisions):
    # Choose Court Division and 75 elements per page
    ChooseCourtDivisionDistrict(driver, court_D)
    sleep_rand()
    WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, search_tab_xpath))).click()
    # Go Through each Date window
    for date_tuple in dates:
        ChooseCourtDivisionDistrict(driver, court_D)
        sleep_rand()
        try:
            WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, search_tab_xpath))).click()
        except TimeoutException:
            driver.refresh()
            time.sleep(5)
            WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, search_tab_xpath))).click()

        # Type input dates
        type_dates(driver, date_tuple[0], date_tuple[1])
        # Select Summary Process
        select_Summary_Process_and_Plantiff(driver, "SU                            ")

        # Search Results
        # Submit Link
        try:
            WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.NAME,
            "submitLink"))).click()
        except TimeoutException:
            # WebDriverWait raises Selenium's TimeoutException, not the builtin TimeoutError
            driver.refresh()
            time.sleep(5)
            WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.NAME,
            "submitLink"))).click()
        # Wait a bit to slow down requests per time
        sleep_rand()

        # CURRENTLY IN CASE NUMBER TABLE or NO MATCHES
        # No matches Case: the notice element exists and there is nothing to extract
        try:
            driver.find_element(By.ID, 'srchResultNoticeNomatch')
        # Matches: Extract Info
        except NoSuchElementException:
            case_links = get_case_links(driver)
            for case_link in case_links:
                try:
                    driver.get(case_link)
                except WebDriverException:
                    print('Error, got until: ', date_tuple)
                    # Save what was collected so far. EndDate is wrong so can be manually changed
                    save_dataset(dataset, startDate, datetime.date(2030, 1, 1), 'D')
                    driver.refresh()
                    time.sleep(5)
                    driver.get(case_link)

                # First Check if there is a defendant before extracting
                if 'Defendant' in driver.page_source:
                    dataset.append(extract_data_from_doc(driver, 'DistrictCourt', court_name))
                #driver.back() # Exit Document

        # Go back to select dates page
        try:
            driver.get('https://www.masscourts.org/eservices/search.page.1?x=')
        except TimeoutException:
            print('Error, got until: ', court_D)
            # Save what was collected so far. EndDate is wrong so can be manually changed
            save_dataset(dataset, startDate, datetime.date(2030, 1, 1), 'D')
            driver.refresh()
            time.sleep(5)
            driver.get('https://www.masscourts.org/eservices/search.page.1?x=')

# For cases disposed for statistical purposes, replace the result (field 8) with a status
# derived from the docket text (last field); case status is field 4.
for record in dataset:
    if record[4] != "Disposed for Statistical Purposes":
        continue
    docket = record[-1]
    record[8] = find_status(docket) if "Judgment Entered" in docket else 'Dismissed'


save_dataset(dataset, startDate, endDate, 'D')