In [10]:
import itertools
import pathlib
import pickle
import string
import time

import numpy as np
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from IPython.display import clear_output

In [2]:
# Address of the page the browser is sent to before any zip-code lookup.
url = 'https://www.unitedstateszipcodes.org/'

In [3]:
# Start a Firefox WebDriver session; this one `driver` object is reused by
# every lookup in the notebook.
driver = webdriver.Firefox()
# Load the page at `url` so the search form is available.
driver.get(url)

In [4]:
def search_zip_code(driver, zip_code):
    """Type ``zip_code`` into the page's search box and submit the search.

    Parameters
    ----------
    driver : selenium WebDriver
        Session whose current page contains the search field (element id
        ``'q'``) and the submit button (class ``'btn-danger'``).
    zip_code : int or str
        Zip code to look up. It is converted to text and left-padded with
        zeros to five characters, so an integer such as ``1001`` is typed
        as ``'01001'``.

    Returns
    -------
    None
        The function only drives the browser; results are read from the
        page by the caller.
    """
    # The plain locator strings 'id' and 'class name' are the values of
    # selenium's By.ID and By.CLASS_NAME. find_element(by, value) is used
    # because the find_element_by_* shortcuts no longer exist in Selenium 4.3+.
    text_field = driver.find_element('id', 'q')
    text_field.click()
    # clear() empties the field on every platform; selecting the old text
    # with Command+A only works on macOS.
    text_field.clear()
    text_field.send_keys(str(zip_code).zfill(5))

    button = driver.find_element('class name', 'btn-danger')
    button.click()

In [5]:
# Characters stripped from both ends of a table row so that only the number
# is left: letters (row label / unit suffix), spaces and the dollar sign.
IGNORE = string.ascii_letters + ' $'


def _parse_stat(text):
    """Return the number contained in one table row, or NaN if it is not numeric."""
    try:
        return float(text.strip(IGNORE).replace(',', ''))
    except ValueError:
        # e.g. an empty row or a placeholder such as 'N/A'
        return np.nan


def get_info_by_zip_code(driver, zip_code):
    """Search for ``zip_code`` and collect the statistics shown on the result page.

    The second and third ``<table>`` elements of the page (indexes 1 and 2)
    are read line by line. Rows are assumed to appear in the order of the key
    lists below; each row is reduced to its numeric value.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session passed on to ``search_zip_code``.
    zip_code : int or str
        Zip code to look up; stored unchanged under ``'Zip Code'``.

    Returns
    -------
    dict
        ``'Zip Code'`` plus the eight statistic keys. A statistic is
        ``numpy.nan`` when its table is absent, its row is absent, or its
        row holds no parseable number, so every result has the same keys.
    """
    search_zip_code(driver, zip_code)
    # 'tag name' is the value of selenium's By.TAG_NAME; the
    # find_elements_by_* shortcuts no longer exist in Selenium 4.3+.
    tables = driver.find_elements('tag name', 'table')

    dct = {'Zip Code': zip_code}
    keys1 = ['Population', 'Population Density', 'Housing Units', 'Median Home Value']
    keys2 = ['Land Area', 'Water Area', 'Occupied Housing Units', 'Median Household Income']

    for i, keys in enumerate([keys1, keys2], start=1):
        try:
            rows = tables[i].text.split('\n')
        except IndexError:
            # Page has no such table (e.g. unknown zip code).
            rows = []
        parsed = (_parse_stat(row) for row in rows)
        # Pad with NaN so a short table still fills every key.
        vals = itertools.chain(parsed, itertools.repeat(np.nan))
        dct.update(zip(keys, vals))

    return dct

In [8]:
# Input CSV and pickle cache both live under ./data relative to the
# directory the notebook is run from.
cwd = pathlib.Path.cwd()
data_path = cwd / 'data'
csv_path = data_path / 'csv'

zip_code_path = csv_path / 'zip_codes.csv'
# genfromtxt's default dtype is float, so the zip codes are loaded as floats.
zip_code_arr = np.genfromtxt(zip_code_path, delimiter=',')

pkl_path = data_path / 'pkl'
zip_code_pkl = pkl_path / 'zip_codes.pkl'
# Test for the pickle file itself: the directory can exist without the file,
# in which case opening it would raise FileNotFoundError.
if zip_code_pkl.exists():
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that was written by this notebook.
    with open(zip_code_pkl, 'rb') as fp:
        zip_code_dct = pickle.load(fp)
else:
    # No cache yet: start with an empty result mapping.
    zip_code_dct = {}

In [11]:
# Index window of zip_code_arr to process; stop=None runs to the last entry.
start = 1441
stop = None
pending = zip_code_arr[start:stop]
for i, zip_code in enumerate(pending, start=start):
    info = get_info_by_zip_code(driver, int(zip_code))
    # The result carries its own 'Zip Code' entry; lift it out and use it as
    # the key of the overall mapping.
    zip_code = info.pop('Zip Code')
    zip_code_dct[zip_code] = info

    # Randomised pause between lookups: 3 s base + uniform draw + Poisson draw.
    jitter = np.random.rand()
    burst = np.random.poisson()
    delay = 3 + jitter + burst
    print(f'{i:>5} - Delay: {delay:.4}')
    time.sleep(delay)
    # Replace the progress line in place once the next one is printed.
    clear_output(wait=True)
print('Done', '*' * 50, sep='')

Done**************************************************


In [12]:
# Make sure the cache directory exists, then serialise the collected results.
pkl_path.mkdir(exist_ok=True)
with zip_code_pkl.open('wb') as pkl_file:
    pickle.dump(zip_code_dct, pkl_file)

In [13]:
# Sanity check: number of zip codes with a stored result vs. number of
# entries read from the CSV.
len(zip_code_dct), len(zip_code_arr)

(1493, 1494)

In [18]:
# Shut down the WebDriver session started earlier in the notebook.
driver.quit()