In [None]:
import geopandas as gpd
import pandas as pd

# Template: join ZIP-code level data onto a ZIP-code shapefile and export the result.
# Fill in every TODO placeholder below before running this cell.
SHAPEFILE_PATH = 'path/to/zipcode_shapefile.shp'  # TODO: file path of shapefile
DATA_PATH = 'path/to/zipcode_data.csv'  # TODO: file path of zipcode data
JOIN_KEY = 'zipcode'  # TODO: unique identifier present in both tables

# Read shapefile
zipcode_shapefile = gpd.read_file(SHAPEFILE_PATH)

# Read data file; the 'zipcode' column is forced to text so it is not parsed as a number
zipcode_data = pd.read_csv(DATA_PATH, dtype={'zipcode': 'str'})

# Filter the zipcode data file down to one year, or loop through the years one by one
# to generate a different shapefile for each year

# Left join: every shape is kept even when it has no matching data row
zipcode_shapefile = zipcode_shapefile.merge(zipcode_data, how='left', on=JOIN_KEY)

# Write new shapefile (the variable name was previously misspelled here, raising NameError)
zipcode_shapefile.to_file('folder/EITC_year.shp')

# Zip the folder that the outputs are saved to and then upload it to ArcGIS Online


In [1]:
import os
import re
from urllib.parse import urljoin

import geopandas as gpd
import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from tqdm import tqdm

# Silence the warning emitted for every request made with verify=False
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

In [2]:
#Function to pull shapefile data

def get_zip(url, verify=False):
    """Return the absolute URLs of the zip archives linked from a directory page.

    Parameters
    ----------
    url : str
        Address of the HTML directory listing to scan.
    verify : bool or str, default False
        Forwarded to ``requests.get``. The default keeps the previous behaviour.
        NOTE(review): ``False`` disables TLS certificate verification; pass
        ``True`` or a CA bundle path wherever the network allows it.

    Returns
    -------
    list of str
        One absolute URL per ``<a>`` tag whose ``href`` contains ``zip``,
        in document order.

    Raises
    ------
    requests.HTTPError
        If the listing page answers with a 4xx/5xx status.
    """
    front_page = requests.get(url, verify=verify)
    # Fail loudly on an error page instead of parsing it as a listing
    front_page.raise_for_status()
    soup = BeautifulSoup(front_page.content, 'html.parser')
    anchors = soup.find_all("a", href=re.compile(r"zip"))
    # urljoin resolves relative and absolute hrefs with URL semantics (os.path.join
    # uses the platform's path rules). The trailing slash makes urljoin treat the
    # last path segment as a directory rather than replacing it.
    base = url if url.endswith('/') else url + '/'
    return [urljoin(base, anchor['href']) for anchor in anchors]

def get_geo_table(geography, url):
    """Download every zip archive listed at ``url`` and stack them into one table.

    Each archive is read with ``gpd.read_file`` and the resulting tables are
    concatenated with a fresh integer index. This might take a long time for
    layers that are split into many files.

    Parameters
    ----------
    geography : str
        Label shown on the progress bar.
    url : str
        Directory listing page passed to ``get_zip``.

    Returns
    -------
    geopandas.GeoDataFrame
        All rows from all archives, in listing order.

    Raises
    ------
    ValueError
        If no zip links are found at ``url``.
    """
    files = get_zip(url)
    if not files:
        # pd.concat would raise ValueError on an empty list anyway; say why instead
        raise ValueError(f"No zip files found at {url}")
    pages = [
        gpd.read_file(zfile)
        for zfile in tqdm(files, desc=f"{geography}", total=len(files))
    ]
    return gpd.GeoDataFrame(pd.concat(pages, ignore_index=True))

In [3]:
# Download the three layers used to build the CTV map.
# NOTE(review): COUSUB is read from the TIGER2020 vintage here while COUNTY and
# PLACE use TIGER2022 — confirm the mix of vintages is intentional.
county = get_geo_table(geography='COUNTY', url='https://www2.census.gov/geo/tiger/TIGER2022/COUNTY/')
cousub = get_geo_table(geography='COUSUB', url='https://www2.census.gov/geo/tiger/TIGER2020/COUSUB/')
place = get_geo_table(geography='PLACE', url='https://www2.census.gov/geo/tiger/TIGER2022/PLACE/')

COUNTY: 100%|████████████████████████████████████████████████████████████████████████████| 1/1 [00:48<00:00, 48.46s/it]
COUSUB: 100%|██████████████████████████████████████████████████████████████████████████| 56/56 [07:36<00:00,  8.16s/it]
PLACE: 100%|███████████████████████████████████████████████████████████████████████████| 56/56 [03:20<00:00,  3.59s/it]


In [5]:
# Keep one identifying column (chosen by position) plus the last three columns
# of each layer. Presumably these are GEOID followed by the internal-point
# latitude/longitude and the geometry — TODO confirm against the downloaded schema.
# NOTE(review): this cell is not idempotent; re-running it slices the already
# trimmed frames by the same positions.
tail_cols = [-3, -2, -1]
county = county.iloc[:, [3] + tail_cols]
cousub = cousub.iloc[:, [4] + tail_cols]
place = place.iloc[:, [3] + tail_cols]

In [24]:
# Update Gulfport, IL; Cahokia Heights, IL; and Pecos, TX to match the CTV GEOIDs.
# The mapping is applied to every cell of the frame whose value equals a key.
geoid_fixes = {'1731991': '1731992', '1710373': '1710370', '4873493': '4856516'}
place = place.replace(geoid_fixes)

In [8]:
# Load the CTV table; GEOID is read as text so it can be joined to the layer GEOIDs.
ctv = pd.read_csv(
    'Output/CTV.csv',
    dtype={'GEOID': 'string'},
)

In [81]:
# Stack the three layers, keep only the rows whose GEOID appears in the CTV
# table (inner join), then keep the first four columns of the joined result.
ctv_map = (
    pd.concat([place, county, cousub], ignore_index=True)
    .merge(ctv, on='GEOID')
    .iloc[:, [0, 1, 2, 3]]
)

In [33]:
# Export the CTV map twice: once as a shapefile, once as GeoJSON.
for out_path, write_options in (
    ('ShapeFiles/CTV/ctv.shp', {}),
    ('ShapeFiles/GeoJSON/ctv.geojson', {'driver': 'GeoJSON'}),
):
    ctv_map.to_file(out_path, **write_options)

In [3]:
# Load the CTV base table (selected columns only, GEOID kept as text) ...
CTV = pd.read_csv(
    'Output/CTV_622update.csv',
    usecols=[
        'GEOID',
        'GEO_UNIT',
        'FULL_NAME',
        'TYPE',
        'CODE_state',
        'member_flag',
        'POPULATION',
        'POP_YEAR',
    ],
    dtype={'GEOID': 'str'},
)

# ... and the crosswalks, with every column read as text.
CTV_x_COUNTY = pd.read_csv('Output/CTV_x_COUNTY.csv', dtype='str')
CTV_x_BLOCK = pd.read_csv('Output/CTV_x_BLOCK.csv', dtype='str')
CTV_x_CD = pd.read_csv('Output/CTV_x_CD.csv', dtype='str')

In [4]:
# STATE: download the layer, attach the state attributes, decode the REGION and
# DIVISION codes, and export.
states = get_geo_table('STATE', 'https://www2.census.gov/geo/tiger/TIGER2020/STATE/')

# Attribute rows are limited to the state codes referenced by the CTV table.
states_data = pd.read_csv('Output/STATE.csv', dtype='str')
states_data = states_data.loc[states_data['CODE'].isin(CTV['CODE_state'].unique().tolist())]

# Inner join on the state FIPS code, then select and reorder columns by position.
states = (
    states
    .merge(states_data, left_on='GEOID', right_on='FIPS', suffixes=('', '_y'))
    .iloc[:, [15, 16, 6, 0, 1, 10, 11, 12, 13, 14]]
)

# Codes missing from these mappings become NaN.
states['REGION'] = states['REGION'].map({
    '1': 'Northeast',
    '2': 'Midwest',
    '3': 'South',
    '4': 'West',
    '9': 'N/A',
})
states['DIVISION'] = states['DIVISION'].map({
    '1': 'New England',
    '2': 'Middle Atlantic',
    '3': 'East North Central',
    '4': 'West North Central',
    '5': 'South Atlantic',
    '6': 'East South Central',
    '7': 'West South Central',
    '8': 'Mountain',
    '9': 'Pacific',
    '0': 'N/A',
})

states.to_file('ShapeFiles/STATE/state.shp')
states.to_file('ShapeFiles/GeoJSON/state.geojson', driver='GeoJSON')

STATE: 100%|█████████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.66s/it]


In [5]:
# CSA: download the layer, attach attributes for the CSAs referenced by the
# county crosswalk, and export.
csa = get_geo_table('CSA', 'https://www2.census.gov/geo/tiger/TIGER2020/CSA/')

# First two columns of the attribute file, limited to crosswalk CSAs.
csa_data = pd.read_csv('Output/CSA.csv', dtype='str').iloc[:, :2]
csa_data = csa_data.loc[csa_data['FIPS'].isin(CTV_x_COUNTY['FIPS_csa'].unique().tolist())]

# Inner join, then select and reorder columns by position.
csa = (
    csa
    .merge(csa_data, left_on='GEOID', right_on='FIPS', suffixes=('', '_y'))
    .iloc[:, [11, 2, 6, 7, 8, 9, 10]]
)

csa.to_file('ShapeFiles/CSA/csa.shp')
csa.to_file('ShapeFiles/GeoJSON/csa.geojson', driver='GeoJSON')

CSA: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:19<00:00, 19.75s/it]


In [6]:
# CBSA: download the layer, attach attributes for the CBSAs referenced by the
# county crosswalk, and export.
cbsa = get_geo_table('CBSA', 'https://www2.census.gov/geo/tiger/TIGER2020/CBSA/')

# First three columns of the attribute file, limited to crosswalk CBSAs.
cbsa_data = pd.read_csv('Output/CBSA.csv', dtype='str').iloc[:, :3]
cbsa_data = cbsa_data.loc[cbsa_data['FIPS'].isin(CTV_x_COUNTY['FIPS_cbsa'].unique().tolist())]

# Inner join, then select and reorder columns by position.
cbsa = (
    cbsa
    .merge(cbsa_data, left_on='GEOID', right_on='FIPS', suffixes=('', '_y'))
    .iloc[:, [13, 3, 15, 8, 9, 10, 11, 12]]
)

cbsa.to_file('ShapeFiles/CBSA/cbsa.shp')
cbsa.to_file('ShapeFiles/GeoJSON/cbsa.geojson', driver='GeoJSON')

CBSA: 100%|██████████████████████████████████████████████████████████████████████████████| 1/1 [00:28<00:00, 28.90s/it]


In [7]:
# METDIV: download the layer, attach attributes for the metropolitan divisions
# referenced by the county crosswalk, and export.
metdiv = get_geo_table('METDIV', 'https://www2.census.gov/geo/tiger/TIGER2020/METDIV/')

# First two columns of the attribute file, limited to crosswalk divisions.
metdiv_data = pd.read_csv('Output/METDIV.csv', dtype='str').iloc[:, :2]
metdiv_data = metdiv_data.loc[metdiv_data['FIPS'].isin(CTV_x_COUNTY['FIPS_metdiv'].unique().tolist())]

# This layer is joined on METDIVFP, not GEOID; then columns are selected by position.
metdiv = (
    metdiv
    .merge(metdiv_data, left_on='METDIVFP', right_on='FIPS', suffixes=('', '_y'))
    .iloc[:, [13, 4, 8, 9, 10, 11, 12]]
)

metdiv.to_file('ShapeFiles/METDIV/metdiv.shp')
metdiv.to_file('ShapeFiles/GeoJSON/metdiv.geojson', driver='GeoJSON')

METDIV: 100%|████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.77s/it]


In [8]:
# COUNTY: download the layer, attach attributes for the counties referenced by
# the county crosswalk, and export.
county = get_geo_table('COUNTY', 'https://www2.census.gov/geo/tiger/TIGER2022/COUNTY/')

# Columns 0, 1 and 3 of the attribute file; CENTRAL_OUTLYING is renamed to CENT_OUT.
county_data = (
    pd.read_csv('Output/COUNTY.csv', dtype='str')
    .iloc[:, [0, 1, 3]]
    .rename(columns={"CENTRAL_OUTLYING": "CENT_OUT"})
)
county_data = county_data.loc[county_data['GEOID'].isin(CTV_x_COUNTY['GEOID_county'].unique().tolist())]

# Inner join on GEOID, then select and reorder columns by position.
county = (
    county
    .merge(county_data, on='GEOID', suffixes=('', '_y'))
    .iloc[:, [3, 4, 19, 13, 14, 15, 16, 17]]
)

county.to_file('ShapeFiles/COUNTY/county.shp')
county.to_file('ShapeFiles/GeoJSON/county.geojson', driver='GeoJSON')

COUNTY: 100%|████████████████████████████████████████████████████████████████████████████| 1/1 [00:58<00:00, 58.28s/it]


In [20]:
# COUSUB: download the layer, attach attributes for the county subdivisions
# referenced by the block crosswalk, and export.
cousub = get_geo_table('COUSUB', 'https://www2.census.gov/geo/tiger/TIGER2022/COUSUB/')

# First three columns of the attribute file, limited to crosswalk subdivisions.
cousub_data = pd.read_csv('Output/COUSUB.csv', dtype='str').iloc[:, :3]
cousub_data = cousub_data.loc[cousub_data['GEOID'].isin(CTV_x_BLOCK['GEOID_cousub'].unique().tolist())]

# Inner join on GEOID, then select and reorder columns by position.
cousub = (
    cousub
    .merge(cousub_data, on='GEOID', suffixes=('', '_y'))
    .iloc[:, [4, 5, 20, 14, 15, 16, 17, 18]]
)

cousub.to_file('ShapeFiles/COUSUB/cousub.shp')
cousub.to_file('ShapeFiles/GeoJSON/cousub.geojson', driver='GeoJSON')

COUSUB:   7%|█████▎                                                                     | 4/56 [00:18<03:54,  4.51s/it]


KeyboardInterrupt: 

In [10]:
# TRACT: download the layer, attach attributes for the tracts referenced by the
# block crosswalk, and export.
tract = get_geo_table('TRACT', 'https://www2.census.gov/geo/tiger/TIGER2020/TRACT/')

# First two columns of the attribute file, limited to crosswalk tracts.
tract_data = pd.read_csv('Output/TRACT.csv', dtype='str').iloc[:, :2]
tract_data = tract_data.loc[tract_data['GEOID'].isin(CTV_x_BLOCK['GEOID_tract'].unique().tolist())]

# Inner join on GEOID, then select and reorder columns by position.
tract = (
    tract
    .merge(tract_data, on='GEOID', suffixes=('', '_y'))
    .iloc[:, [3, 5, 8, 9, 10, 11, 12]]
)

tract.to_file('ShapeFiles/TRACT/tract.shp')
tract.to_file('ShapeFiles/GeoJSON/tract.geojson', driver='GeoJSON')

TRACT: 100%|███████████████████████████████████████████████████████████████████████████| 56/56 [06:57<00:00,  7.45s/it]


In [11]:
# BLOCKGROUP: download the layer, attach attributes for the block groups
# referenced by the block crosswalk, and export.
blockgroup = get_geo_table('BLOCKGROUP', 'https://www2.census.gov/geo/tiger/TIGER2020/BG/')

# First two columns of the attribute file, limited to crosswalk block groups.
blockgroup_data = pd.read_csv('Output/BLOCKGROUP.csv', dtype='str').iloc[:, :2]
blockgroup_data = blockgroup_data.loc[
    blockgroup_data['GEOID'].isin(CTV_x_BLOCK['GEOID_blockgroup'].unique().tolist())
]

# Inner join on GEOID, then select and reorder columns by position.
blockgroup = (
    blockgroup
    .merge(blockgroup_data, on='GEOID', suffixes=('', '_y'))
    .iloc[:, [4, 13, 8, 9, 10, 11, 12]]
)

blockgroup.to_file('ShapeFiles/BLOCKGROUP/blockgroup.shp')
blockgroup.to_file('ShapeFiles/GeoJSON/blockgroup.geojson', driver='GeoJSON')

BLOCKGROUP: 100%|██████████████████████████████████████████████████████████████████████| 56/56 [13:03<00:00, 14.00s/it]


In [12]:
# PLACE: download the layer and attach attributes for the places that appear in
# the CTV table. Nothing is exported here; `place` is consumed by the CTV map cell.
# Fixed: get_geo_table requires the progress-bar label as its first argument;
# passing only the URL raised TypeError.
place = get_geo_table('PLACE', 'https://www2.census.gov/geo/tiger/TIGER2022/PLACE/')

# First two columns of the attribute file, limited to GEOIDs present in CTV.
place_data = pd.read_csv('Output/PLACE.csv', dtype = 'str').iloc[:,:2]
place_data = place_data[place_data.GEOID.isin(CTV.GEOID.unique().tolist())]

# Inner join on GEOID, then select columns by position.
place = place.merge(place_data, on='GEOID',suffixes=('','_y'))
place = place.iloc[:,[3,4,12,13,14,15,16]]

PLACE: 100%|███████████████████████████████████████████████████████████████████████████| 56/56 [02:29<00:00,  2.66s/it]


In [13]:
# CTV map: stack the place, county and county-subdivision layers, drop the last
# two columns of the stacked frame, inner-join the CTV attributes on GEOID,
# reorder columns by position, shorten member_flag to mem_flag, and export.
# NOTE(review): relies on `place`, `county` and `cousub` as left in the kernel
# by their own cells — confirm all three ran to completion before this one.
ctv_map = (
    pd.concat([place, county, cousub], ignore_index=True)
    .iloc[:, :-2]
    .merge(CTV, on='GEOID')
    .iloc[:, [0, 7, 8, 9, 10, 11, 12, 13, 2, 3, 4, 5, 6]]
    .rename(columns={"member_flag": "mem_flag"})
)

ctv_map.to_file('ShapeFiles/CTV/ctv.shp')
ctv_map.to_file('ShapeFiles/GeoJSON/ctv.geojson', driver='GeoJSON')

In [14]:
# CD: download the layer, attach attributes for the districts referenced by the
# CD crosswalk, and export.
cd = get_geo_table('CD', 'https://www2.census.gov/geo/tiger/TIGER2020/CD/CD118/')

# First two columns of the attribute file, limited to crosswalk districts.
cd_data = pd.read_csv('Output/CD.csv', dtype='str').iloc[:, :2]
cd_data = cd_data.loc[cd_data['GEOID'].isin(CTV_x_CD['GEOID_cd'].unique().tolist())]

# The layer's key is GEOID20, the attribute file's is GEOID; then columns are
# selected and reordered by position.
cd = (
    cd
    .merge(cd_data, left_on='GEOID20', right_on='GEOID', suffixes=('', '_y'))
    .iloc[:, [13, 14, 8, 9, 10, 11, 12]]
)

cd.to_file('ShapeFiles/CD/cd.shp')
cd.to_file('ShapeFiles/GeoJSON/cd.geojson', driver='GeoJSON')

CD: 100%|██████████████████████████████████████████████████████████████████████████████| 56/56 [01:02<00:00,  1.11s/it]


In [5]:
# BLOCK: download the layer, attach attributes for the blocks referenced by the
# block crosswalk, and export.
block = get_geo_table('BLOCK', 'https://www2.census.gov/geo/tiger/TIGER2020/TABBLOCK20/')

# Columns 0, 1, 2, 7 and 8 of the attribute file, limited to crosswalk blocks.
block_data = pd.read_csv('Output/BLOCK.csv', dtype='str').iloc[:, [0, 1, 2, 7, 8]]
block_data = block_data.loc[block_data['GEOID'].isin(CTV_x_BLOCK['GEOID_block'].unique().tolist())]

# The layer's key is GEOID20, the attribute file's is GEOID; then columns are
# selected and reordered by position.
block = (
    block
    .merge(block_data, left_on='GEOID20', right_on='GEOID', suffixes=('', '_y'))
    .iloc[:, [18, 19, 20, 21, 22, 11, 12, 13, 14, 17]]
)

block.to_file('ShapeFiles/BLOCK/block.shp')
block.to_file('ShapeFiles/GeoJSON/block.geojson', driver='GeoJSON')