Turnout data source: https://earlyvoting.texas-election.com/Elections/

In [None]:
from urllib import request, parse
import os
import sys
from datetime import datetime
from functools import partial
from collections import defaultdict
import time
from datetime import datetime
import re
import zipfile
# from multiprocessing import Pool

import pandas as pd
import matplotlib as mpl
import numpy as np
from bs4 import BeautifulSoup

# Do not truncate wide DataFrames when they are displayed.
pd.set_option('display.max_columns', None)
# Switch the working directory to the first sys.path entry so the relative
# 'data/...' paths used later resolve from there.
# NOTE(review): presumably this is the notebook's own directory - confirm.
os.chdir(sys.path[0])

In [None]:
# Create the output root used by the download and aggregation steps below;
# exist_ok keeps this from failing when the directory is already there.
os.makedirs('data/results', exist_ok = True)

In [None]:
# Single opener object shared by every page fetch and file download below.
# NOTE(review): URLopener is a legacy urllib class - confirm it is still
# available in the Python version this notebook is run with.
opener = request.URLopener()

# Send a desktop-browser User-Agent with each request.
# NOTE(review): presumably the site rejects urllib's default agent - confirm.
opener.addheader('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36')

In [None]:
# Download the election-details landing page and parse it; the election
# dropdown is read out of `main_page` afterwards.
main_page_url = 'https://earlyvoting.texas-election.com/Elections/getElectionDetails.do'
main_page_html = opener.open(main_page_url).read().decode('utf-8')
main_page = BeautifulSoup(main_page_html, 'html.parser')

In [None]:
# All <option> tags found under the element whose id is 'idElection'.
ev_options = main_page.find(id = 'idElection').find_all('option')

# Map each option's value (used later as the election id) to its stripped
# first child (used later as the election name). Options with an empty value
# are skipped.
elections = {
    option['value']: option.contents[0].strip()
    for option in ev_options
    if len(option['value']) > 0
}

In [None]:
def retrieve_dates(id):
    """Fetch the dates offered for one election.

    Args:
        id: Election identifier, interpolated into the ``idElection`` query
            parameter of the request URL.

    Returns:
        A dict with two lists of non-empty ``<option>`` values:
        ``'early_voting_dates'`` (options under the element with id
        ``selectedDate``) and ``'election_dates'`` (options under the element
        with id ``electionDate``).
    """
    print(f'Downloading election dates (id={id})')
    url = f'https://earlyvoting.texas-election.com/Elections/getElectionEVDates.do?idElection={id}&results=&elecDateName=&cdElectionType='

    response_text = opener.open(url).read().decode('utf-8')
    dates_page = BeautifulSoup(response_text, 'html.parser')

    def option_values(element_id):
        # Collect the non-empty value attributes of the <option> tags found
        # under the element with the given id.
        values = []
        for option in dates_page.find(id = element_id).find_all('option'):
            if len(option['value']) > 0:
                values.append(option['value'])
        return values

    return {
        'early_voting_dates': option_values('selectedDate'),
        'election_dates': option_values('electionDate'),
    }

In [None]:
def retrieve_ev_details(id, date, output_dir):
    """Download the early-voting report for one election and date.

    The response is saved as ``data/results/<output_dir>/ev_<id>_<day>.csv``,
    where ``<day>`` is the calendar-date part of ``date``.

    Args:
        id: Election identifier sent as ``idElection``.
        date: Date string sent as ``selectedDate``; it must also be parseable
            by ``datetime.fromisoformat`` because it is used to build the
            file name.
        output_dir: Sub-directory of ``data/results`` to write into.
    """
    print(f'Downloading early voting details for {output_dir} on {date}')
    url = f'https://earlyvoting.texas-election.com/Elections/downloadVoterInfoReport.do?idElection={id}&selectedDate={date}&electionDate=&earlyVoteFlag=true&downloadElectionFileCSVFlag=false&idTown='

    # Keep only the date portion so any time component stays out of the file name.
    day = str(datetime.fromisoformat(date).date())
    destination = os.path.join('data', 'results', output_dir, f'ev_{id}_{day}.csv')

    opener.retrieve(url, destination)

In [None]:
def retrieve_ed_details(id, date, output_dir):
    """Download the election-day participation report for one election and date.

    The response is saved as ``data/results/<output_dir>/ed_<id>_<day>.zip``,
    where ``<day>`` is the calendar-date part of ``date``.

    Args:
        id: Election identifier sent as ``idElection``.
        date: Date string sent as both ``selectedDate`` and ``electionDate``;
            it must also be parseable by ``datetime.fromisoformat`` because it
            is used to build the file name.
        output_dir: Sub-directory of ``data/results`` to write into.
    """
    print(f'Downloading election day details for {output_dir} on {date}')
    url = f'https://earlyvoting.texas-election.com/Elections/downloadParticipationCountReport.do?idElection={id}&selectedDate={date}&electionDate={date}&earlyVoteFlag=false&downloadElectionFileCSVFlag=false&idTown='

    # Keep only the date portion so any time component stays out of the file name.
    day = str(datetime.fromisoformat(date).date())
    destination = os.path.join('data', 'results', output_dir, f'ed_{id}_{day}.zip')

    opener.retrieve(url, destination)

In [None]:
# Download every early-voting and election-day report for each election in
# `elections`, writing into one directory per election name.
for election_id, election_name in elections.items():
    print(f'Scraping election {election_name}')

    election_dir = os.path.join('data', 'results', election_name)
    os.makedirs(election_dir, exist_ok = True)

    election_dates = retrieve_dates(election_id)
    ev_dates, ed_dates = election_dates['early_voting_dates'], election_dates['election_dates']

    # Early-voting days are fetched first, then the election day(s).
    for ev_date in ev_dates:
        retrieve_ev_details(election_id, ev_date, election_name)

    for ed_date in ed_dates:
        retrieve_ed_details(election_id, ed_date, election_name)

In [None]:
# Delete every non-zip file whose text contains '<!doctype html>'
# (presumably an HTML page the server returned instead of a report - confirm).
for election_id, election_name in elections.items():
    election_dir = os.path.join('data', 'results', election_name)
    file_names = os.listdir(election_dir)

    for file_name in file_names:
        path, ext = os.path.splitext(file_name)

        # Zip archives are never inspected here.
        if ext == '.zip':
            continue

        file_path = os.path.join(election_dir, file_name)

        with open(file_path) as f:
            is_html = '<!doctype html>' in f.read()

        # The handle is closed by the time we get here, so the file can be removed.
        if is_html:
            os.remove(file_path)

In [None]:
# Build two combined tables from the files under data/results/<election name>/:
#   * ev_df - rows of every non-zip file whose name contains a YYYY-MM-DD date
#   * ed_df - rows of '<id>VOTER_STATE.csv' inside every dated zip archive
# Each row is tagged with 'date', 'election_id' and 'election_name'. Per-election
# copies are also written into the election's directory as 'ev.csv' / 'ed.csv'.
date_re = re.compile(r'[0-9][0-9][0-9][0-9]\-[0-9][0-9]\-[0-9][0-9]')
id_re = re.compile(r'[0-9][0-9][0-9][0-9][0-9]')

ev_dfs = []
ed_dfs = []

for election_id, election_name in elections.items():
    election_dir = os.path.join('data', 'results', election_name)
    file_names = os.listdir(election_dir)

    sub_ev_dfs = []

    for file_name in file_names:
        ext = os.path.splitext(file_name)[1]

        # Only files with a date in their name are processed. This also skips
        # the 'ev.csv' / 'ed.csv' outputs that a previous run of this cell
        # left in the same directory.
        date_match = date_re.search(file_name)
        if date_match is None:
            continue
        date = date_match.group(0)

        if ext != '.zip':
            df = pd.read_csv(os.path.join(election_dir, file_name))
            df['date'] = date
            sub_ev_dfs.append(df)
            continue

        # The archive member is named after the 5-digit id embedded in the
        # file name. Report and skip the file when that id is missing rather
        # than crashing on a None match.
        id_match = id_re.search(file_name)
        if id_match is None:
            print(f'Error reading {file_name} for {election_name}: no election id in file name')
            continue
        file_id = id_match.group(0)

        try:
            with zipfile.ZipFile(os.path.join(election_dir, file_name), 'r') as z:
                with z.open(f'{file_id}VOTER_STATE.csv') as f:
                    # read_csv decodes the binary member itself, so no
                    # intermediate in-memory copy of the text is needed.
                    df = pd.read_csv(f, encoding = 'utf-8')

            df['date'] = date
            df['election_id'] = election_id
            df['election_name'] = election_name

            # NOTE(review): unlike ev.csv this is written with the index column,
            # and a later zip in the same directory overwrites it - confirm
            # that is intended.
            df.to_csv(os.path.join(election_dir, 'ed.csv'))
            ed_dfs.append(df)
        except Exception as exc:
            # Best effort: one unreadable archive must not stop the whole run.
            # Catching Exception rather than using a bare except lets Ctrl-C
            # through, and the cause is reported.
            print(f'Error reading {file_name} for {election_name}: {exc!r}')

    if sub_ev_dfs:
        election_df = pd.concat(sub_ev_dfs)

        election_df['election_id'] = election_id
        election_df['election_name'] = election_name

        ev_dfs.append(election_df)

        election_df.to_csv(os.path.join(election_dir, 'ev.csv'), index = False)

# pd.concat raises ValueError when given an empty list, so fall back to an
# empty frame when nothing could be read.
ev_df = pd.concat(ev_dfs) if ev_dfs else pd.DataFrame()
ed_df = pd.concat(ed_dfs) if ed_dfs else pd.DataFrame()

In [None]:
# Write the combined early-voting and election-day tables to data/results.
for frame, out_name in ((ev_df, 'early_voting.csv'), (ed_df, 'election_day.csv')):
    frame.to_csv(os.path.join('data', 'results', out_name))