### Retrieve Census Block Centroid Coordinate Data

In [1]:
from ftplib import FTP
import pandas as pd
import geopandas as gpd
import zipfile
from pathlib import Path
import timeit
from IPython.display import clear_output
from shutil import unpack_archive
import numpy as np

#### Create directory path for temp storage of downloads

In [2]:
ftpTmp = Path('../data/ftp-temp/')
ftpTmp.mkdir(exist_ok=True, parents=True)

#### Function - Load the shapefile information into GeoPandas

In [3]:
def get_data(filename):
    gdf = gpd.read_file(filename, dtype={'GEOID10': 'object', 'INTPTLAT10': 'float', 'INTPTLON10': 'float'})
    return gdf

#### Extract files from compressed file (zip)

In [4]:
def extract_files(zip_path_file):
    unpack_archive(str(zip_path_file), extract_dir=str(ftpTmp))
    fh.close()

#### Function - Write the tract dataframe to list

In [5]:
def append_list(df_gdf, count_records):
    #keep only the rows that are not water
    df_gdf = df_gdf.drop(df_gdf[(df_gdf['AWATER10'] > 0) & (df_gdf['ALAND10'] == 0)].index)
    
    #exclude Puerto Rico and island areas
    if str(df_gdf['STATEFP10'].unique()[0]) in st_list:
        df_list.append(df_gdf)
        count_records = count_records+len(df_gdf)
        
    # drop the columns that are not needed
    cols=['STATEFP10','COUNTYFP10','TRACTCE10','BLOCKCE10','NAME10','MTFCC10','UR10','UACE10','UATYPE','FUNCSTAT10','ALAND10','AWATER10','geometry']
    df_gdf.drop(cols, axis=1, inplace=True)
    df_gdf.rename(columns={'GEOID10': 'block_geoid', 'INTPTLAT10': 'lat', 'INTPTLON10': 'lon'}, inplace=True)
    return count_records

#### Function - Convert decimal time to minutes and seconds

In [6]:
def get_time(timer):
    minutes, seconds = int(np.floor(timer)), round(np.asscalar(timer % 1)*60)
    return [minutes, seconds]

#### Setup FTP connection with Census Bureau

In [7]:
ftp = FTP('ftp2.census.gov')
ftp.login()
ftp.cwd('geo/tiger/TIGER2018/TABBLOCK/')
file_list = ftp.nlst()
# print a list of all files in the FTP folder
print(file_list)

['tl_2018_01_tabblock10.zip', 'tl_2018_02_tabblock10.zip', 'tl_2018_04_tabblock10.zip', 'tl_2018_05_tabblock10.zip', 'tl_2018_06_tabblock10.zip', 'tl_2018_08_tabblock10.zip', 'tl_2018_09_tabblock10.zip', 'tl_2018_10_tabblock10.zip', 'tl_2018_11_tabblock10.zip', 'tl_2018_12_tabblock10.zip', 'tl_2018_13_tabblock10.zip', 'tl_2018_15_tabblock10.zip', 'tl_2018_16_tabblock10.zip', 'tl_2018_17_tabblock10.zip', 'tl_2018_18_tabblock10.zip', 'tl_2018_19_tabblock10.zip', 'tl_2018_20_tabblock10.zip', 'tl_2018_21_tabblock10.zip', 'tl_2018_22_tabblock10.zip', 'tl_2018_23_tabblock10.zip', 'tl_2018_24_tabblock10.zip', 'tl_2018_25_tabblock10.zip', 'tl_2018_26_tabblock10.zip', 'tl_2018_27_tabblock10.zip', 'tl_2018_28_tabblock10.zip', 'tl_2018_29_tabblock10.zip', 'tl_2018_30_tabblock10.zip', 'tl_2018_31_tabblock10.zip', 'tl_2018_32_tabblock10.zip', 'tl_2018_33_tabblock10.zip', 'tl_2018_34_tabblock10.zip', 'tl_2018_35_tabblock10.zip', 'tl_2018_36_tabblock10.zip', 'tl_2018_37_tabblock10.zip', 'tl_2018_38_t

#### Loop through the ftp files, download, extract, load, and save

In [None]:
# set various counting variables
counter = 0
decimals = 0   
# start a timer
start = timeit.default_timer()

# create a blank list to accumulate the dataframes
df_list = []
count_records = 0
# a list of state fips to be included (eliminates Puerto Rico, Samoa,Guam,Mariana Islans, Virgin Island, and Island Areas) 
# see https://www.census.gov/geo/maps-data/data/tallies/dfblock.html
st_list = ['01','04','05','06','08','09','10','11','12','13','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31','32','33','34','35','36','37','38','39','40','41','42','44','45','46','47','48','49','50','51','53','54','55','56']


for index, zip_filename in enumerate(file_list):
    
    # create a file path for the download
    zip_path_file = ftpTmp.joinpath(file_list[index])
    
    # write download file to storage area
    fh = open (zip_path_file, "wb")
    print ("STARTED download for file: " + zip_filename)
    ftp.retrbinary("RETR " + zip_filename, fh.write)
    
    # open downloaded file for exdf
    fh = open(zip_path_file, "rb")
    zp = zipfile.ZipFile(fh)
    
    # call df function
    extract_files(zip_path_file)
    
    # assign file handles to each of the files exdfed
    cpg, dbf, prj, shp, xml1, xml2, shx = [filename for filename in zp.namelist()]
    
    fh.close
    zp.close
    print(shp)
    
    # add a path to the shapefile
    shape_file = ftpTmp.joinpath(shp)
    
    # call function to create a geodataframe
    df_gdf = get_data(shape_file)
    
    # call function to append the geodataframe to a list
    count_records = append_list(df_gdf, count_records)
    
    #removes exdfed files and zip file
    zip_path_file.unlink()
    shape_file.unlink()
    file_to_rem = ftpTmp.joinpath(cpg)
    file_to_rem.unlink()    
    file_to_rem = ftpTmp.joinpath(dbf)
    file_to_rem.unlink()    
    file_to_rem = ftpTmp.joinpath(xml1)
    file_to_rem.unlink()    
    file_to_rem = ftpTmp.joinpath(xml2)
    file_to_rem.unlink()    
    file_to_rem = ftpTmp.joinpath(shx)
    file_to_rem.unlink()    
    file_to_rem = ftpTmp.joinpath(prj)
    file_to_rem.unlink()

    # get the current time on timer
    stop = timeit.default_timer()
    
    #create a numpy array to calculate time
    timer = np.array([(stop-start)/60])
    # call function to calculate minutes and seconds
    min_sec = get_time(timer)
    minutes, seconds = min_sec[0], min_sec[1]
    
    # clear the output below this cell
    clear_output(wait=True)
    
    # print a few lines for progress monitoring
    print('Processed df file:',zip_filename)
    print('Timer:', minutes, 'minutes', seconds, 'seconds')
    print('Current record count:', "{:,}".format(count_records),'\n')
    counter+=1
    
# clear the output below this cell
clear_output(wait=True)
print('\n\nProcessing Complete\nTotal time:', minutes, 'minutes', seconds, 'seconds')
print('\nTotal count:', "{:,}".format(count_records))

Processed df file: tl_2018_11_tabblock10.zip
Timer: 40 minutes 17 seconds
Current record count: 1,628,815 

STARTED download for file: tl_2018_12_tabblock10.zip


#### Remove the ftp temp folder

In [9]:
ftpTmp.rmdir()

#### Create a Pandas dataframe containing all tract records

In [10]:
%time df_out = pd.concat(df_list)

Wall time: 71.6 ms


#### Create directory path for output file

In [36]:
outputPath = Path('../data/blocks/')
outputPath.mkdir(exist_ok=True, parents=True)

#### Name zip file

In [37]:
outputZip = 'block_centroids.csv.gz'

#### Create full path with zip file

In [38]:
out_Zip = outputPath.joinpath(outputZip)

#### Write dataframe to compressed CSV

In [39]:
# start a timer
start = timeit.default_timer()
print ('Compressing dataframe. Please be patient.')
df_out.to_csv(out_Zip, compression='gzip', index=None)
clear_output(wait=True)
# get the current time on timer
stop = timeit.default_timer()
timer = np.array([(stop-start)/60])
min_sec = get_time(timer)
minutes, seconds = min_sec[0], min_sec[1]
print('\nData compression complete.\nTotal time:', minutes, 'minutes', seconds, 'seconds')


Data compression complete.
Total time: 2 minutes 30 seconds


In [12]:
print('Number of tract records written to file:',"{:,}".format(len(df_out)))

Number of tract records written to file: 236,319


In [13]:
df_out.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 236319 entries, 0 to 252265
Data columns (total 16 columns):
STATEFP10      236319 non-null object
COUNTYFP10     236319 non-null object
TRACTCE10      236319 non-null object
BLOCKCE10      236319 non-null object
block_geoid    236319 non-null object
NAME10         236319 non-null object
MTFCC10        236319 non-null object
UR10           236319 non-null object
UACE10         99729 non-null object
UATYPE         99729 non-null object
FUNCSTAT10     236319 non-null object
ALAND10        236319 non-null int64
AWATER10       236319 non-null int64
lat            236319 non-null object
lon            236319 non-null object
geometry       236319 non-null object
dtypes: int64(2), object(14)
memory usage: 30.7+ MB
