Turnout data source: https://earlyvoting.texas-election.com/Elections/

In [None]:
from urllib import request, parse
import os
import sys
from datetime import datetime
from functools import partial
from collections import defaultdict
import time
from datetime import datetime
import re
import zipfile
# from multiprocessing import Pool

import pandas as pd
import matplotlib as mpl
import numpy as np
from bs4 import BeautifulSoup

# Show every column when a DataFrame is displayed instead of truncating.
pd.options.display.max_columns = None

# Run from the first entry of sys.path so the relative 'data/...' paths used
# below resolve against that directory.
os.chdir(sys.path[0])

In [None]:
# Create the output root used by every download below. Built with
# os.path.join so it matches how all other paths in this notebook are formed.
os.makedirs(os.path.join('data', 'results'), exist_ok=True)

In [None]:
# Shared HTTP client for every request in this notebook.
# NOTE(review): URLopener belongs to urllib's legacy interface, which the
# Python docs mark as deprecated -- confirm it exists on the target Python.
opener = request.URLopener()

# Present a desktop-browser User-Agent on every request.
opener.addheader(
    'User-Agent',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/104.0.5112.79 Safari/537.36',
)

In [None]:
# Download the election-selection page and parse it; the <select> holding the
# list of elections is read from this document in the next cell.
main_page = BeautifulSoup(
    opener.open(
        'https://earlyvoting.texas-election.com/Elections/getElectionDetails.do'
    ).read().decode('utf-8'),
    'html.parser',
)

In [None]:
# All <option> tags of the election drop-down on the main page.
ev_options = main_page.find(id='idElection').find_all('option')

# Map election id (the option's value) -> election name (its stripped label),
# skipping options whose value is empty.
elections = {
    option['value']: option.contents[0].strip()
    for option in ev_options
    if option['value']
}

In [None]:
def retrieve_dates(id):
    """Fetch the selectable early-voting and election-day dates of an election.

    Args:
        id: Election identifier, sent as the ``idElection`` query parameter.

    Returns:
        A dict with two lists of option values taken from the returned page:
        ``'early_voting_dates'`` (element with id ``selectedDate``) and
        ``'election_dates'`` (element with id ``electionDate``). Options with
        an empty value are left out.
    """
    print(f'Downloading election dates (id={id})')
    url = f'https://earlyvoting.texas-election.com/Elections/getElectionEVDates.do?idElection={id}&results=&elecDateName=&cdElectionType='

    html = opener.open(url).read().decode('utf-8')
    dates_page = BeautifulSoup(html, 'html.parser')

    def option_values(select_id):
        # Non-empty `value` attributes of the <option>s under the given element.
        values = []
        for option in dates_page.find(id=select_id).find_all('option'):
            value = option['value']
            if value:
                values.append(value)
        return values

    return {
        'early_voting_dates': option_values('selectedDate'),
        'election_dates': option_values('electionDate'),
    }

In [None]:
def retrieve_ev_details(id, date, output_dir):
    """Download the early-voting voter report for one election and one day.

    The report is saved as ``data/results/<output_dir>/ev_<id>_<YYYY-MM-DD>.csv``.

    Args:
        id: Election identifier, sent as the ``idElection`` query parameter.
        date: ISO-format date/time string, sent unchanged as ``selectedDate``;
            only its date part is used in the file name.
        output_dir: Sub-directory of ``data/results`` to write into (also used
            in the progress message).
    """
    print(f'Downloading early voting details for {output_dir} on {date}')

    # Reduce the timestamp to its calendar date for the file name.
    day = datetime.fromisoformat(date).date().isoformat()
    destination = os.path.join('data', 'results', output_dir, f'ev_{id}_{day}.csv')

    url = f'https://earlyvoting.texas-election.com/Elections/downloadVoterInfoReport.do?idElection={id}&selectedDate={date}&electionDate=&earlyVoteFlag=true&downloadElectionFileCSVFlag=true&idTown='
    opener.retrieve(url, destination)

In [None]:
# Election-day downloads that failed, recorded by retrieve_ed_details() as
# (election id, date, output directory) tuples.
ed_failures = []

In [None]:
def _ed_content_type(url):
    """Return the Content-Type header served for *url*, closing the response."""
    response = opener.open(url)
    try:
        return response.info()['Content-Type']
    finally:
        response.close()


def _tag_ed_frame(df, id, date, output_dir):
    """Add the date / election_id / election_name columns to *df* in place."""
    df['date'] = date
    df['election_id'] = id
    df['election_name'] = output_dir


def retrieve_ed_details(id, date, output_dir):
    """Download election-day participation data for one election and save it as CSV.

    The server is first asked for a CSV report. If it answers with
    ``application/csv`` the file is stored as ``<filename>.csv``, re-read, tagged
    with ``date``, ``election_id`` and ``election_name`` columns and written
    back. Otherwise the ZIP variant is requested, the archive is processed into
    ``ed.csv`` and the archive is deleted afterwards.

    Any failure is recorded in the module-level ``ed_failures`` list and
    reported on stdout; nothing is raised to the caller.

    Args:
        id: Election identifier, sent as the ``idElection`` query parameter.
        date: ISO-format election date string, or ``''`` when the election has
            no selectable election date.
        output_dir: Sub-directory of ``data/results`` to write into; it is also
            stored in the ``election_name`` column.
    """
    from io import StringIO

    results_dir = os.path.join('data', 'results', output_dir)

    try:
        # Defaults for elections without a selectable election date.
        filename = f'ed_{id}'
        url = f'https://earlyvoting.texas-election.com/Elections/downloadParticipationCountReport.do?selectedDate=&pollPlaceIdtown=&electionDayIdTown=1&idElection={id}&idGroupedElection=&electionDate=&webPageSyncDate=true&downloadElectionFileCSVFlag=true'

        if len(date) > 0:
            filename = f'ed_{id}_{str(datetime.fromisoformat(date).date())}'
            url = f'https://earlyvoting.texas-election.com/Elections/downloadParticipationCountReport.do?idElection={id}&selectedDate={date}&electionDate={date}&earlyVoteFlag=false&downloadElectionFileCSVFlag=true&idTown='

        print(f'Downloading election day details for {output_dir} on {date}')

        if _ed_content_type(url) == 'application/csv':
            path = os.path.join(results_dir, f'{filename}.csv')
            opener.retrieve(url, path)

            df = pd.read_csv(path, low_memory = False)
            _tag_ed_frame(df, id, date, output_dir)

            # Without a COUNTY column the row index is written out under that
            # name so later cells can still select a 'COUNTY' column.
            if 'COUNTY' not in df.columns:
                df.index.rename('COUNTY', inplace = True)

            df.to_csv(path)
            return

        # No CSV available: ask for the ZIP variant of the same report.
        url = f'https://earlyvoting.texas-election.com/Elections/downloadParticipationCountReport.do?idElection={id}&selectedDate={date}&electionDate={date}&earlyVoteFlag=false&downloadElectionFileCSVFlag=false&idTown='

        if _ed_content_type(url) != 'application/zip':
            # Neither format is available; record it like every other failure.
            ed_failures.append((id, date, output_dir))
            print(f'Error retrieving election day details for {output_dir} on {date}')
            return

        print(url)
        zip_path = os.path.join(results_dir, f'{filename}.zip')
        opener.retrieve(url, zip_path)

        print('Extracting election day archive')

        # NOTE(review): the output name is fixed, so a second election date for
        # the same election overwrites the first -- confirm this is intended.
        out_path = os.path.join(results_dir, 'ed.csv')

        with zipfile.ZipFile(zip_path, 'r') as z:
            with z.open(f'{id}_STATE.csv') as f:
                poll_places_df = pd.read_csv(StringIO(f.read().decode('utf-8')))

            if 'COUNTY' in poll_places_df.columns:
                with z.open(f'{id}VOTER_STATE.csv') as f:
                    df = pd.read_csv(StringIO(f.read().decode('utf-8')))

                # Translate each row's poll place id into its county; ids that
                # have no entry in the mapping are left unchanged by replace().
                mapping = dict(zip(poll_places_df['POLL PLACE ID'], poll_places_df['COUNTY']))
                df['COUNTY'] = df['POLL PLACE ID'].replace(mapping)

                df = df.drop(columns = ['Date'])
                _tag_ed_frame(df, id, date, output_dir)

                df.to_csv(out_path, index = False)
            else:
                poll_places_df.index.rename('COUNTY', inplace = True)
                _tag_ed_frame(poll_places_df, id, date, output_dir)

                poll_places_df.to_csv(out_path)

        # Remove exactly the archive that was downloaded. Rebuilding the name
        # from `date` would raise for an empty date and leave the file behind.
        os.remove(zip_path)
    except Exception:
        # Best effort: remember the failure and carry on with the next download.
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        ed_failures.append((id, date, output_dir))
        print(f'Error retrieving election day details for {output_dir} on {date}')

In [None]:
# Download every early-voting day and every election day for each election
# listed on the main page, one sub-directory per election name.
for election_id, election_name in elections.items():
    print(f'Scraping election {election_name}')
    os.makedirs(os.path.join('data', 'results', election_name), exist_ok=True)

    election_dates = retrieve_dates(election_id)
    ev_dates = election_dates['early_voting_dates']
    ed_dates = election_dates['election_dates']

    for ev_date in ev_dates:
        retrieve_ev_details(election_id, ev_date, election_name)

    # With no selectable election date, make a single request with an empty
    # date instead.
    for ed_date in (ed_dates or ['']):
        retrieve_ed_details(election_id, ed_date, election_name)

Missing data:
* 2024 SPECIAL ELECTION SENATE DISTRICT 15 on 2024-05-04
* 2024 SPECIAL RUNOFF ELECTION HOUSE DISTRICT 2 on 2024-01-30
* 2023 MAY 6TH LOCAL ELECTIONS
* 2020 SPECIAL RUNOFF ELECTION SENATE DISTRICT 14
* 2020 SPECIAL RUNOFF ELECTION HOUSE DISTRICT 148
* 2019 MARCH 5TH SPECIAL RUNOFF ELECTION

In [None]:
# Delete downloads that are actually HTML pages rather than data: any non-zip
# file whose text contains '<!doctype html>' is removed and its path printed.
for election_name in elections.values():
    election_dir = os.path.join('data', 'results', election_name)

    for file_name in os.listdir(election_dir):
        if os.path.splitext(file_name)[1] == '.zip':
            continue

        file_path = os.path.join(election_dir, file_name)

        with open(file_path) as f:
            is_html_page = '<!doctype html>' in f.read()

        # Remove only after the file handle has been closed by the with-block.
        if is_html_page:
            os.remove(file_path)
            print(file_path)

In [None]:
# Matches a YYYY-MM-DD date (ASCII digits only); used below to recover the
# voting date from early-voting file names.
date_re = re.compile(r'[0-9][0-9][0-9][0-9]\-[0-9][0-9]\-[0-9][0-9]')
# Matches a run of five ASCII digits. NOTE(review): not referenced anywhere in
# the visible part of this notebook -- presumably meant for election ids.
id_re = re.compile(r'[0-9][0-9][0-9][0-9][0-9]')

In [None]:
# Collect the downloaded files of every election into two lists:
#   ev_dfs - one frame per election holding all of its early-voting rows
#   ed_dfs - one frame per election-day file with a 'VOTER COUNT' per group
ev_dfs = []
ed_dfs = []

ed_key_columns = ['COUNTY', 'PRECINCT', 'date', 'election_id', 'election_name']

for election_id, election_name in elections.items():
    election_dir = os.path.join('data', 'results', election_name)

    sub_ev_dfs = []

    for file_name in os.listdir(election_dir):
        # Only CSV files can be parsed; skip anything else in the directory
        # (for example an archive left over from an interrupted download).
        if os.path.splitext(file_name)[1].lower() != '.csv':
            continue

        file_path = os.path.join(election_dir, file_name)
        df = pd.read_csv(file_path, low_memory = False)

        if 'PRECINCT' not in df.columns:
            print(f'No PRECINCT for {file_path}')
            df['PRECINCT'] = pd.NA

        if file_name.startswith('ed'):
            if 'ID_VOTER' in df.columns:
                # One row per voter: count rows per key combination.
                # dropna=False keeps groups whose key is missing (such as the
                # PRECINCT filled in above) instead of silently dropping them.
                group_df = df[ed_key_columns] \
                    .groupby(ed_key_columns, dropna = False) \
                    .size().reset_index().rename(columns = {0: 'VOTER COUNT'})

                ed_dfs.append(group_df)
            else:
                # Already aggregated: the file carries its own 'VOTER COUNT'.
                ed_dfs.append(df[ed_key_columns + ['VOTER COUNT']])
        else:
            # Early-voting file: the voting date is taken from the file name.
            date = date_re.search(file_name)

            if date is not None:
                df['date'] = date.group(0)
            else:
                df['date'] = pd.NA

            sub_ev_dfs.append(df)

    if len(sub_ev_dfs) > 0:
        election_df = pd.concat(sub_ev_dfs)

        election_df['election_id'] = election_id
        election_df['election_name'] = election_name

        ev_dfs.append(election_df)

In [None]:
# Key columns that identify one early-voting turnout group.
c = ['COUNTY', 'VOTING_METHOD', 'PRECINCT', 'date', 'election_id', 'election_name']

# Count early-voting rows per key combination. dropna=False keeps groups with
# a missing key value (e.g. a PRECINCT set to NA for files that lack that
# column); the default would drop those rows from the totals without warning.
ev_df = (
    pd.concat(ev_dfs)[c]
    .groupby(c, dropna=False)
    .size()
    .reset_index()
    .rename(columns={0: 'VOTER COUNT'})
)

In [None]:
# Stack the per-file election-day frames into a single table.
ed_df = pd.concat(ed_dfs)
# Preview the first rows (shown as the notebook cell's output).
ed_df.head()

In [None]:
# Write both aggregated tables next to the per-election folders, without the
# row index.
results_root = os.path.join('data', 'results')
ev_df.to_csv(os.path.join(results_root, 'early_voting.csv'), index=False)
ed_df.to_csv(os.path.join(results_root, 'election_day.csv'), index=False)