In [1]:
import pdfplumber
import os

# Relative path of the election-results PDF that the later cells open with pdfplumber.
file = "../data-raw/Cass_24.pdf"
# Description of what this notebook handles. This is a bare string expression, not a comment.
"""
Extractor for format "C" of election results
Format C is the second most common format in OpenElections data for Indiana
This format is characterized by grey rectangles in the header, a blue bar indicating the precinct, and green boxes marking questions.
"""

In [2]:
class Precinct:
    """Holds everything extracted for one precinct.

    Using one object per precinct keeps the related data together instead of
    maintaining a separate dictionary for each attribute.

    Parameters
    ----------
    name : str
        The precinct's name as read from the page header.
    """

    def __init__(self, name):
        self.name = name      # The precinct name passed in by the caller
        self.pages = []       # One entry per page belonging to the precinct (the page's extracted text lines)
        self.lines = []       # Text of the question lines only, across all of the precinct's pages
        self.questions = []   # Nested list: self.questions[0] is the list of lines for the first question

In [3]:
##### Precinct Separation
precincts = {}   # Maps precinct name -> Precinct. Each page is filed under exactly one precinct since no page has data for more than one.

# Position of the precinct header within a page's extracted text lines.
HEADER_LINE_INDEX = 10

# Open the PDF and assign every page to its precinct
with pdfplumber.open(file) as pdf:
    for page_number, page in enumerate(pdf.pages, start=1):
        lines = page.extract_text_lines()
        if len(lines) <= HEADER_LINE_INDEX:
            # A blank or unusually short page has no header line, so it cannot be matched to a precinct.
            print("Skipping page", page_number, "- too few text lines to contain a precinct header")
            continue

        # Identify the precinct the page belongs to.
        # Header lines look like "Precinct Name # of # registered voters = x%"
        header_words = lines[HEADER_LINE_INDEX]['text'].split(" ")
        name_words = []
        for word in header_words:
            # The name ends at the first number; commas are removed so values like "1,234" count as numbers.
            # May need to be adjusted if another county has names like "Precinct 01".
            if word.replace(",", "").isnumeric():
                break
            name_words.append(word)
        # Strip unconditionally so the key is clean even when the header contains no number at all.
        precinct_name = " ".join(name_words).strip()

        # Assign the page to its precinct object, creating the object the first time the name is seen
        if precinct_name not in precincts:
            precincts[precinct_name] = Precinct(precinct_name)
            print(precinct_name)   # For testing purposes, lists all of the precincts as they are sorted
        precincts[precinct_name].pages.append(lines)
    print("Precincts separated")

Adams
Bethlehem
Boone
Clay City
Clay
Clinton
Deer Creek East
Deer Creek West
Eel East
Eel North
Eel West
Galveston
Harrison
Jackson
Jefferson
Logansport
Miami - Logan
Miami - Lewis Cass
Noble City
Noble North
Noble - Logan
Noble - Pioneer
Tipton I
Tipton II
Washington - Logan
Washington - Lewis Cass
Washington South
Washington City - Logan
Washington City - Lewis Cass
Federal Only
President Only
Precincts separated


In [4]:
##### Line Extraction
def extract_lines(precinct, start=11):
    """Fill ``precinct.lines`` with the text of the question lines from every page.

    The list is rebuilt from scratch on each call, so re-running the cell does
    not duplicate lines that were already collected.

    Parameters
    ----------
    precinct : object with ``pages`` and ``lines`` attributes
        ``pages`` is a list of pages; each page is a list of dicts that carry
        the line's text under the ``'text'`` key.
    start : int, optional
        Index of the first line on each page that belongs to the questions.
        Everything before it is skipped. Defaults to 11.
    """
    collected = []
    for page in precinct.pages:
        # Since the data can be separated using text alone, only the text is kept
        collected.extend(entry['text'] for entry in page[start:])
    precinct.lines = collected

# Run the line extraction for every entry in precincts; `precinct` is the dictionary key, used to look up the object.
for precinct in precincts:
    extract_lines(precincts[precinct])


In [12]:
# Spot check: show the second extracted line for the 'Adams' precinct.
print(precincts['Adams'].lines[1])

Choice Party Absentee Voting Early Voting Election Day Voting Total


In [30]:
##### Question Separation

def separate_questions(target):
    """Split a precinct's extracted lines into one list of lines per question.

    The second line of every question holds the same column headings, so each
    occurrence of that text marks a question whose title is the line just
    before it. The result is stored in ``precinct.questions`` and replaces any
    earlier result, so calling this again does not duplicate questions.

    Parameters
    ----------
    target : str
        Key of the precinct in the module-level ``precincts`` dictionary.

    Raises
    ------
    KeyError
        If ``target`` is not a key of ``precincts``.
    """
    precinct = precincts[target]   # Layer that makes the dictionary call less ugly in the code below
    lines = precinct.lines

    # With fewer than two lines there is no column-heading line to use as a reference point.
    if len(lines) < 2:
        precinct.questions = []
        return

    # Question identification
    column_headings = lines[1]
    # Each question starts one line before its column headings. The start is
    # clamped at 0 so a heading on the very first line cannot wrap around to the end of the list.
    starts = [max(i - 1, 0) for i, text in enumerate(lines) if text == column_headings]
    bounds = starts + [len(lines)]   # Padding so that the final question runs to the end of the lines

    # Question isolation
    precinct.questions = [lines[begin:end] for begin, end in zip(bounds, bounds[1:])]

    # For testing, to see if questions properly separated / view the raw question data.
    # Only printed when a third question exists.
    if len(precinct.questions) > 2:
        print(precinct.questions[2])
    

In [31]:
# Try the question separation on a single precinct ('Adams') and inspect what it prints.
separate_questions('Adams')

['PRESIDENT & VICE PRESIDENT OF THE UNITED STATES - Vote for one (1) only', 'Choice Party Absentee Voting Early Voting Election Day Voting Total', 'DONALD J. TRUMP / JD VANCE REP 11 73.33% 196 79.67% 113 86.92% 320 81.84%', 'KAMALA D. HARRIS / TIM WALZ DEM 4 26.67% 48 19.51% 16 12.31% 68 17.39%', 'CHASE OLIVER / MIKE TER MAAT LIB 0 0.00% 2 0.81% 1 0.77% 3 0.77%', 'ROBERT F. KENNEDY JR. / NICOLE WTP 0 0.00% 0 0.00% 0 0.00% 0 0.00%', 'SHANAHAN', 'ANDRE RAMON MCNEIL SR. (W) 0 0.00% 0 0.00% 0 0.00% 0 0.00%', 'CAROL ASHER (W) 0 0.00% 0 0.00% 0 0.00% 0 0.00%', 'CHERUNDA LYNN FOX (W) 0 0.00% 0 0.00% 0 0.00% 0 0.00%', 'CLAUDIA DE LA CRUZ (W) 0 0.00% 0 0.00% 0 0.00% 0 0.00%', 'CORNEL WEST / MELINA ABDULLAH (W) 0 0.00% 0 0.00% 0 0.00% 0 0.00%', 'DOUG JENKINS (W) 0 0.00% 0 0.00% 0 0.00% 0 0.00%', 'NALA BAOZUN SCOTT JOHNSON (W) 0 0.00% 0 0.00% 0 0.00% 0 0.00%', 'PAIJ BORING (W) 0 0.00% 0 0.00% 0 0.00% 0 0.00%', 'PETER SONSKI (W) 0 0.00% 0 0.00% 0 0.00% 0 0.00%', 'SHONDRA IRVING (W) 0 0.00% 0 0.00%