In [15]:
import pdfplumber
import os
import csv

"""
Extractor for format "C" of election results
Format C is the second most common format in OpenElections data for Indiana
This format is characterized by grey rectangles in the header, a blue bar indicating the precinct, and green boxes marking questions.
"""

'\nExtractor for format "C" of election results\nFormat C is the second most common format in OpenElections data for Indiana\nThis format is characterized by grey rectangles in the header, a blue bar indicating the precinct, and green boxes marking questions.\n'

In [16]:
# Notebook configuration: only these three values should need editing when switching counties.
file = "../data-raw/Cass_24.pdf"               # Filepath for the target PDF
# County name as it will appear in the CSV's county column and (lower-cased) in the output filename.
# strip() drops accidental surrounding whitespace; title() capitalizes every word, so a multi-word
# name such as "st. joseph" becomes "St. Joseph" (capitalize() would have produced "St. joseph").
county = input("County Name: ").strip().title()
parties = ["REP", "DEM", "LIB", "WTP", "NP"]   # Party abbreviations used in question choice lines. Detection is difficult, so must be added manually

County Name:  Cass


In [17]:
class Precinct():
    """Container for everything extracted for a single precinct.

    Replaces keeping a separate dict per attribute: each precinct owns its
    raw pages plus the progressively refined views derived from them.
    """

    def __init__(self, name):
        """Create an empty precinct record.

        name: the precinct name as parsed from the page header.
        """
        self.name = name         # Precinct name (previously the argument was discarded and "" was stored)
        self.pages = []          # Extracted text lines for each page belonging to the precinct
        self.header_lines = []   # Text of the header lines
        self.lines = []          # Text of the question lines only (header removed)
        self.questions = []      # Nested list: self.questions[0] is the list of lines for the first question

In [18]:
##### Precinct Separation
precincts = {}   # Precinct name -> Precinct object. No page has data for more than one precinct.

# Opening the file
with pdfplumber.open(file) as pdf:
    for page in pdf.pages:
        lines = page.extract_text_lines()
        # The precinct header is read from the 11th text line (index 10). A page that is too short to
        # contain it (e.g. a blank page) cannot be attributed to a precinct, so report it and move on
        # rather than aborting the whole run with an IndexError.
        if len(lines) <= 10:
            print("Skipping page", page.page_number, "- only", len(lines), "text lines, no precinct header")
            continue

        # Identify the precinct the page belongs to.
        # Header lines look like "Precinct Name # of # registered voters = x%"
        header_words = lines[10]['text'].split(" ")
        name_words = []
        for word in header_words:   # Collect words until the first number. May need adjusted if another county has names like "Precinct 01"
            if word.replace(",", "").isnumeric():   # Replace method accounts for commas in large numbers
                break   # Stop at the first number instead of skipping numbers
            name_words.append(word)
        # Always strip, so a header that contains no number still yields a clean key
        # (previously the strip only happened when a number was found).
        precinct_name = " ".join(name_words).strip()

        # Assign the page to its precinct object, creating the entry on first sight
        if precinct_name not in precincts:
            precincts[precinct_name] = Precinct(precinct_name)
            print(precinct_name)   # For testing purposes, lists all of the precincts as they are sorted
        precincts[precinct_name].pages.append(lines)
    print("Precincts separated")   # Outputs a progress update when all the data is loaded, since it takes a while

Adams
Bethlehem
Boone
Clay City
Clay
Clinton
Deer Creek East
Deer Creek West
Eel East
Eel North
Eel West
Galveston
Harrison
Jackson
Jefferson
Logansport
Miami - Logan
Miami - Lewis Cass
Noble City
Noble North
Noble - Logan
Noble - Pioneer
Tipton I
Tipton II
Washington - Logan
Washington - Lewis Cass
Washington South
Washington City - Logan
Washington City - Lewis Cass
Federal Only
President Only
Precincts separated


In [23]:
##### Line Extraction

def extract_lines(precinct, header_line_count=11):
    """Split each stored page of a precinct into header text and question text.

    precinct: a precinct object (e.g. precincts['Adams']) whose `pages` attribute is a
        list of pages, each page being a list of dicts with a 'text' entry.
    header_line_count: how many leading lines of every page form the header
        (defaults to 11, the value this format has used so far).

    Stores the header text in precinct.header_lines and everything after it in
    precinct.lines. Both lists are rebuilt from precinct.pages on every call, so
    re-running the cell does not duplicate lines.
    """
    header_lines = []
    question_lines = []
    for page in precinct.pages:
        # Since the data can be separated using text alone, only the text is grabbed.
        # Slicing tolerates a page shorter than the header instead of raising IndexError.
        header_lines.extend(entry['text'] for entry in page[:header_line_count])
        question_lines.extend(entry['text'] for entry in page[header_line_count:])
    precinct.header_lines = header_lines
    precinct.lines = question_lines

In [24]:
# Populate header_lines and lines on every precinct collected during separation
for precinct in precincts.keys():
    extract_lines(precincts[precinct])

In [25]:
##### Question Separation

def separate_questions(target):
    """Split a precinct's question text into one list of lines per question.

    target: a precinct name (e.g. 'Adams') used as a key into the module-level
        `precincts` dict.

    The second stored line is taken as the column-heading line; every later
    occurrence of that exact text marks the second line of a question, so each
    question starts one line before it and runs up to the next question (or the
    end of the text). The result is stored in precinct.questions, which is
    rebuilt on every call so re-running the cell does not duplicate questions.
    """
    precinct = precincts[target]   # Layer that makes the dictionary call less ugly in the code below
    lines = precinct.lines
    precinct.questions = []

    # Without at least a title line and a column-heading line there is nothing to separate
    if len(lines) < 2:
        return

    # Question identification
    column_headings = lines[1]   # Second line of every question has the same column headings, providing a reference point for questions.
    # Scanning starts at index 1: a heading at index 0 has no title line before it,
    # and i - 1 would otherwise become -1 and wrap around to the end of the list.
    question_pos = [i - 1 for i in range(1, len(lines)) if lines[i] == column_headings]
    question_pos.append(len(lines))   # Pads question_pos so that the final question can be grabbed below.

    # Question isolation
    precinct.questions = [
        lines[start:end] for start, end in zip(question_pos, question_pos[1:])
    ]

In [26]:
# Break the stored question text of every precinct into individual questions
for precinct in precincts.keys():
    separate_questions(precinct)

In [27]:
'''
Format: county,precinct,office,district,party,candidate,election_day,absentee,early_voting,votes

District does not seem to appear in this format, and the EMS format does not always include it

Question layout:
office - optional instruction
candidate party absentee early_voting election_day votes
Each "candidate" lists the above data, with the votes listed in raw numbers and percentages
Question total votes, per type
Undervotes, per type
Overvotes, per type
Invalid votes, per type
'''

'\nFormat: county,precinct,office,district,party,candidate,election_day,absentee,early_voting,votes\n\nDistrict does not seem to appear in this format, and the EMS format does not always include it\n\nQuestion layout:\noffice - optional instruction\ncandidate party absentee early_voting election_day votes\nEach "candidate" lists the above data, with the votes listed in raw numbers and percentages\nQuestion total votes, per type\nUndervotes, per type\nOvervotes, per type\nInvalid votes, per type\n'

In [28]:
def extract_ballot_data(target):
    """Build the "Ballots Cast" and "Registered Voters" rows for one precinct.

    target: a precinct name used as a key into the module-level `precincts` dict.

    Reads the last stored header line, collects every purely numeric word in it
    (commas removed), and uses the first number as ballots cast and the second
    as registered voters. Returns the two output rows as (ballots, voters).
    """
    summary_line = precincts[target].header_lines[-1]
    counts = [
        int(token.replace(",", ""))
        for token in summary_line.split(" ")
        if token.replace(",", "").isnumeric()
    ]
    blank_columns = [''] * 6   # district, party, candidate, election_day, absentee, early_voting
    ballots = [county, target, "Ballots Cast", *blank_columns, counts[0]]
    voters = [county, target, "Registered Voters", *blank_columns, counts[1]]
    return ballots, voters

In [29]:
##### Data Extraction, per question

def extract_data(target, question_num):
    """Turn one question of one precinct into a list of output rows.

    target: a precinct name (e.g. 'Adams') used as a key into the module-level
        `precincts` dict.
    question_num: zero-indexed position of the question in precinct.questions.

    Returns a list of rows shaped as
    [county, precinct, office, district, party, candidate, election_day, absentee, early_voting, votes],
    one per choice line. Vote counts are kept as strings with commas removed.

    Raises IndexError if a choice line carries fewer than four whole-number
    counts; the caller is expected to report it.
    """
    results = []   # A list of the output data for each line of the question
    precinct = precincts[target]   # Shorthand for calls to the precinct object
    question = precinct.questions[question_num]   # Shorthand for the question the method applies to

    # office
    # Removes any instructions (e.g. Vote for one (1) only) not part of the question name,
    # and then removes the trailing space left behind
    office = question[0].split("-")[0].strip()
    district = ''
    office_words = office.split(" ")
    if "DISTRICT" in office_words:   # Checks if the office mentions a district
        district = office_words[-1]   # The last word is taken as the district identifier
        # Everything before the word DISTRICT is the office name
        office = " ".join(office_words[:office_words.index("DISTRICT")]).strip()

    # Per-line data
    for text in question[2:]:   # Starts at the third line to ignore the "office" and the data header
        if "Cast Votes" in text:   # Cuts off the non-response data that we do not need for the desired output
            break

        candidate = ""
        party = ""   # The party abbreviation found on the line, if any (one of the entries in `parties`)
        numbers = []
        for word in text.split(" "):
            if word in parties:
                # Detect party abbreviations from the configured list
                party = word
            elif word.replace("%", "").replace(".", "").replace(",", "").isnumeric():
                # Number-like token: whole-number counts are kept for later sorting,
                # percentages (which fail the stricter check below) are dropped
                if word.replace(",", "").isnumeric():
                    numbers.append(word.replace(",", ""))
            else:
                # Reconstructs the split candidate name from individual pieces;
                # stripping each time discards empty pieces caused by repeated spaces
                candidate = (candidate + " " + word).strip()

        # A line without counts is the tail of a name that rolled onto a second line.
        # It belongs to the most recently added row: indexing by line position breaks as
        # soon as one earlier line in the question was itself a continuation.
        if not numbers:
            if results and candidate:
                results[-1][5] += " " + candidate
            continue   # Skips the code that adds the results

        # Compile the results according to the output format and add to a list for the overall question.
        # The counts are read as absentee, early voting, election day, total.
        result = [county, target, office, district, party, candidate, numbers[2], numbers[0], numbers[1], numbers[3]]
        results.append(result)
    return results

In [30]:
# Assemble every CSV row: the column header first, then for each precinct its two
# summary rows followed by the rows of each of its questions.
output_lines = [['county','precinct','office','district','party','candidate','election_day','absentee','early_voting','votes']]

for precinct in precincts:
    ballots, voters = extract_ballot_data(precinct)
    output_lines.append(ballots)
    output_lines.append(voters)
    for q in range(len(precincts[precinct].questions)):
        try:
            results = extract_data(precinct, q)
        except Exception as e:
            # Report the question that could not be parsed and move on. Without the
            # `continue`, the rows still held in `results` from the previous question
            # would be written a second time (or a NameError raised on the very first question).
            print(precinct, q, " - ", e)
            continue
        output_lines.extend(results)

In [31]:
##### Output to CSV
def csv_writer():
    """Write every row of output_lines to the county's precinct-level CSV.

    The destination is ../data/20241105__in__general__<county in lower case>__precinct.csv.
    Reads the module-level `county` and `output_lines`; returns nothing.
    """
    out_path = "../data/20241105__in__general__"+county.lower()+"__precinct.csv"
    # newline="" is the setting the csv module documents for files passed to csv.writer; the
    # bytes written are unchanged, since the writer emits its own line terminators either way.
    # An explicit encoding keeps the output identical across platforms when names contain
    # non-ASCII characters.
    with open(out_path, "w", newline="", encoding="utf-8") as csvfile:
        csv.writer(csvfile).writerows(output_lines)

csv_writer()