### Retrieve Census Tract Centroid Coordinate Data

In [2]:
from ftplib import FTP
import pandas as pd
import geopandas as gpd
import zipfile
from pathlib import Path
import timeit
from IPython.display import clear_output
from shutil import unpack_archive
import numpy as np

#### Create directory path for temp storage of downloads

In [3]:
# Scratch directory that receives the downloaded archives and their extracted
# members; created here (including any missing parents) if it does not exist.
ftpTmp = Path('../data/ftp-temp/')
ftpTmp.mkdir(exist_ok=True, parents=True)

#### Function - Load the shapefile information into GeoPandas

In [4]:
def get_data(filename):
    """Load a shapefile into a GeoDataFrame with typed id/centroid columns.

    Parameters
    ----------
    filename : str or path-like
        Path of the shapefile (.shp) to read.

    Returns
    -------
    GeoDataFrame
        The file contents, with GEOID kept as object and INTPTLAT / INTPTLON
        cast to float whenever those columns are present.
    """
    # `dtype` is not a documented read_file parameter, so the conversion is
    # applied explicitly after loading instead of being passed to the reader.
    column_dtypes = {'GEOID': 'object', 'INTPTLAT': 'float', 'INTPTLON': 'float'}
    gdf = gpd.read_file(filename)
    # Only cast columns that exist so other shapefiles can still be loaded.
    present = {col: kind for col, kind in column_dtypes.items() if col in gdf.columns}
    return gdf.astype(present) if present else gdf

#### Extract files from compressed file (zip)

In [5]:
def extract_files(zip_path_file, extract_dir=None):
    """Unpack an archive into the temp download directory.

    Parameters
    ----------
    zip_path_file : str or path-like
        Path of the archive to unpack.
    extract_dir : str or path-like, optional
        Destination directory. Defaults to the notebook-level ``ftpTmp``.

    Notes
    -----
    The download loop leaves its archive handle in a notebook-level variable
    named ``fh`` and relies on this function to close it. That behaviour is
    kept, but the lookup is guarded so the function no longer raises
    NameError when it is called before any ``fh`` exists.
    """
    target_dir = ftpTmp if extract_dir is None else extract_dir
    unpack_archive(str(zip_path_file), extract_dir=str(target_dir))

    # Close the caller's open handle, if there is one (closing twice is a no-op).
    open_handle = globals().get('fh')
    if open_handle is not None:
        open_handle.close()

#### Function - Write the tract dataframe to list

In [6]:
def append_list(tract_gdf, count_records):
    """Collect a tract frame in ``tract_list`` and trim it to the output columns.

    The frame is appended to the notebook-level ``tract_list`` only when the
    first distinct STATEFP value it contains is listed in ``st_list``; in that
    case its row count is added to the running total. The column drop and
    rename below are done in place and for every frame, so a frame that was
    appended is trimmed as well (the list holds the same object).

    Parameters
    ----------
    tract_gdf : dataframe
        Tract records for one downloaded file; modified in place.
    count_records : int
        Running number of records collected so far.

    Returns
    -------
    int
        The updated running total.
    """
    # keep only the states listed in st_list
    state_fips = str(tract_gdf['STATEFP'].unique()[0])
    if state_fips in st_list:
        tract_list.append(tract_gdf)
        count_records += len(tract_gdf)

    # discard everything except the tract id and the centroid coordinates
    unneeded_columns = [
        'NAME', 'NAMELSAD', 'MTFCC', 'FUNCSTAT', 'ALAND', 'AWATER',
        'geometry', 'STATEFP', 'COUNTYFP', 'TRACTCE',
    ]
    tract_gdf.drop(columns=unneeded_columns, inplace=True)

    # switch to the output column names
    output_names = {'GEOID': 'tract_geoid', 'INTPTLAT': 'lat', 'INTPTLON': 'lon'}
    tract_gdf.rename(columns=output_names, inplace=True)
    return count_records

#### Function - Convert elapsed time in decimal minutes to whole minutes and seconds

In [7]:
def get_time(timer):
    """Split an elapsed time in decimal minutes into whole minutes and seconds.

    Parameters
    ----------
    timer : float or array-like
        Elapsed time in minutes. The notebook passes a one-element numpy
        array; only the first element is used.

    Returns
    -------
    list of int
        ``[minutes, seconds]`` with ``0 <= seconds < 60``.
    """
    # np.asscalar has been removed from NumPy, so take the value as a float.
    total_minutes = float(np.asarray(timer, dtype=float).ravel()[0])
    # Round once on total seconds so a value such as 1.9999 minutes becomes
    # 2 minutes 0 seconds rather than 1 minute 60 seconds.
    total_seconds = int(round(total_minutes * 60))
    minutes, seconds = divmod(total_seconds, 60)
    return [minutes, seconds]

#### Setup FTP connection with Census Bureau

In [8]:
# Connect to the Census Bureau FTP server, log in without credentials, and
# move to the 2018 TIGER/Line tract folder.
census_host = 'ftp2.census.gov'
tract_folder = 'geo/tiger/TIGER2018/TRACT/'

ftp = FTP(census_host)
ftp.login()
ftp.cwd(tract_folder)

# names of every entry in the FTP folder, printed for reference
file_list = ftp.nlst()
print(file_list)

['tl_2018_01_tract.zip', 'tl_2018_02_tract.zip', 'tl_2018_04_tract.zip', 'tl_2018_05_tract.zip', 'tl_2018_06_tract.zip', 'tl_2018_08_tract.zip', 'tl_2018_09_tract.zip', 'tl_2018_10_tract.zip', 'tl_2018_11_tract.zip', 'tl_2018_12_tract.zip', 'tl_2018_13_tract.zip', 'tl_2018_15_tract.zip', 'tl_2018_16_tract.zip', 'tl_2018_17_tract.zip', 'tl_2018_18_tract.zip', 'tl_2018_19_tract.zip', 'tl_2018_20_tract.zip', 'tl_2018_21_tract.zip', 'tl_2018_22_tract.zip', 'tl_2018_23_tract.zip', 'tl_2018_24_tract.zip', 'tl_2018_25_tract.zip', 'tl_2018_26_tract.zip', 'tl_2018_27_tract.zip', 'tl_2018_28_tract.zip', 'tl_2018_29_tract.zip', 'tl_2018_30_tract.zip', 'tl_2018_31_tract.zip', 'tl_2018_32_tract.zip', 'tl_2018_33_tract.zip', 'tl_2018_34_tract.zip', 'tl_2018_35_tract.zip', 'tl_2018_36_tract.zip', 'tl_2018_37_tract.zip', 'tl_2018_38_tract.zip', 'tl_2018_39_tract.zip', 'tl_2018_40_tract.zip', 'tl_2018_41_tract.zip', 'tl_2018_42_tract.zip', 'tl_2018_44_tract.zip', 'tl_2018_45_tract.zip', 'tl_2018_46_tra

#### Loop through the ftp files, download, extract, load, and save

In [9]:
# Download every archive listed on the FTP server, extract it, load its
# shapefile, collect the trimmed frame, and delete the temporary files.

# number of archives processed so far
counter = 0
# start a timer
start = timeit.default_timer()
# defaults so the final summary still prints when file_list is empty
minutes, seconds = 0, 0

# create a blank list to accumulate the dataframes
tract_list = []
count_records = 0
# a list of state fips to be included (eliminates Puerto Rico, Samoa, Guam, Mariana Islands, Virgin Islands, and Island Areas)
# see https://www.census.gov/geo/maps-data/data/tallies/tractblock.html
# NOTE(review): '02' and '15' (Alaska and Hawaii) are also absent from this list even though
# their archives are downloaded, so those tracts are excluded too - confirm this is intended.
st_list = ['01','04','05','06','08','09','10','11','12','13','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31','32','33','34','35','36','37','38','39','40','41','42','44','45','46','47','48','49','50','51','53','54','55','56']


for zip_filename in file_list:

    # create a file path for the download
    zip_path_file = ftpTmp.joinpath(zip_filename)

    # write download file to storage area; the with-block guarantees the data
    # is flushed and the handle closed before the archive is read back
    print ("STARTED download for file: " + zip_filename)
    with open(zip_path_file, "wb") as fh:
        ftp.retrbinary("RETR " + zip_filename, fh.write)

    # record the names of the archive members (needed for load and clean-up)
    with zipfile.ZipFile(str(zip_path_file)) as zp:
        member_names = zp.namelist()

    # call extract function
    extract_files(zip_path_file)

    # locate the shapefile by extension instead of by position in the archive
    shp_members = [name for name in member_names if name.lower().endswith('.shp')]
    if len(shp_members) != 1:
        raise ValueError('Expected exactly one .shp member in ' + zip_filename
                         + ', found ' + str(len(shp_members)))
    shp = shp_members[0]
    print(shp)

    # add a path to the shapefile
    shape_file = ftpTmp.joinpath(shp)

    # call function to create a geodataframe
    tract_gdf = get_data(shape_file)

    # call function to append the geodataframe to a list
    count_records = append_list(tract_gdf, count_records)

    # removes extracted files and zip file
    # (assumes the archive is flat, i.e. contains no directory entries)
    zip_path_file.unlink()
    for member_name in member_names:
        ftpTmp.joinpath(member_name).unlink()

    # get the current time on timer
    stop = timeit.default_timer()

    # create a numpy array to calculate time
    timer = np.array([(stop-start)/60])
    # call function to calculate minutes and seconds
    minutes, seconds = get_time(timer)

    # clear the output below this cell
    clear_output(wait=True)

    # print a few lines for progress monitoring
    print('Processed tract file:',zip_filename)
    print('Timer:', minutes, 'minutes', seconds, 'seconds')
    print('Current record count:', "{:,}".format(count_records),'\n')
    counter+=1

# clear the output below this cell
clear_output(wait=True)
print('\n\nProcessing Complete\nTotal time:', minutes, 'minutes', seconds, 'seconds')
print('\nTotal count:', "{:,}".format(count_records))



Processing Complete
Total time: 16 minutes 28 seconds

Total count: 72,538


#### Remove the ftp temp folder

In [10]:
# Path.rmdir() only removes an empty directory, so this relies on the download
# loop having already deleted every file it placed in the temp folder.
ftpTmp.rmdir()

#### Create a Pandas dataframe containing all tract records

In [11]:
# Stack the per-file frames accumulated in tract_list into a single dataframe;
# the %time magic reports how long the concatenation takes.
%time df_out = pd.concat(tract_list)

CPU times: user 19 ms, sys: 3.35 ms, total: 22.3 ms
Wall time: 20.5 ms


#### Create directory path for output file

In [12]:
# Directory that receives the final output file; created here (including any
# missing parents) if it does not exist.
outputPath = Path('../data/tracts/')
outputPath.mkdir(exist_ok=True, parents=True)

#### Name the gzip-compressed output file

In [13]:
# File name of the output CSV; despite the variable name it is written with
# gzip compression (see the to_csv call), not as a zip archive.
outputZip = 'tract_centroids.csv.gz'

#### Create the full path to the gzip-compressed output file

In [14]:
# Full path of the output file: output directory joined with the file name.
out_Zip = outputPath.joinpath(outputZip)

#### Write dataframe to compressed CSV

In [15]:
# Write the combined dataframe to a gzip-compressed CSV (index column omitted)
# and report how long the write took.
start = timeit.default_timer()
print ('Compressing dataframe. Please be patient.')

df_out.to_csv(out_Zip, compression='gzip', index=None)

# replace the "please be patient" message with the summary below
clear_output(wait=True)

# elapsed time in decimal minutes, wrapped in an array as get_time is called elsewhere
stop = timeit.default_timer()
elapsed_minutes = (stop - start) / 60
timer = np.array([elapsed_minutes])
minutes, seconds = get_time(timer)

print('\nData compression complete.\nTotal time:', minutes, 'minutes', seconds, 'seconds')


Data compression complete.
Total time: 0 minutes 1 seconds


In [16]:
# Report how many rows the combined dataframe holds, with thousands separators.
written_count = "{:,}".format(len(df_out))
print('Number of tract records written to file:', written_count)

Number of tract records written to file: 72,538
