In [14]:
import pathlib
from pathlib import Path
import collections
import geopandas as gpd
import pandas as pd
import zipfile
import timeit
from IPython.display import clear_output
from shutil import unpack_archive
import numpy as np

In [2]:
pathlib.Path.cwd()

PosixPath('/Users/mark/Documents/mapping/map698/698-project/python-workbooks')

#### Create directory path for temp storage of downloads

In [3]:
ftpTmp = Path('../data/ftp-temp/')
ftpTmp.mkdir(exist_ok=True, parents=True)

#### Function - Load the shapefile information into GeoPandas

In [4]:
def get_data(filename):
    gdf = gpd.read_file(filename, dtype={'GEOID10': 'object', 'INTPTLAT10': 'float', 'INTPTLON10': 'float'})
    return gdf

#### Extract files from compressed file (zip)

In [5]:
def extract_files(zip_path_file):
    unpack_archive(str(zip_path_file), extract_dir=str(ftpTmp))
    fh.close()

#### Function - Write the tract dataframe to list

In [6]:
def append_list(df_gdf, count_records):
    #keep only the rows that are not water
    df_gdf = df_gdf.drop(df_gdf[(df_gdf['AWATER10'] > 0) & (df_gdf['ALAND10'] == 0)].index)
    
    #exclude Puerto Rico and island areas
    if str(df_gdf['STATEFP10'].unique()[0]) in st_list:
        df_list.append(df_gdf)
        count_records = count_records+len(df_gdf)
        
    # drop the columns that are not needed
    cols=['STATEFP10','COUNTYFP10','TRACTCE10','BLOCKCE10','NAME10','MTFCC10','UR10','UACE10','UATYPE','FUNCSTAT10','ALAND10','AWATER10','geometry']
    df_gdf.drop(cols, axis=1, inplace=True)
    df_gdf.rename(columns={'GEOID10': 'block_geoid', 'INTPTLAT10': 'lat', 'INTPTLON10': 'lon'}, inplace=True)
    return count_records

#### Function - Convert decimal time to minutes and seconds

In [7]:
def get_time(timer):
    minutes, seconds = int(np.floor(timer)), round(np.asscalar(timer % 1)*60)
    return [minutes, seconds]

In [8]:
BlocksPath = Path('/Users/mark/Documents/github-private/698-inspiration/Census-blocks')

In [9]:
collections.Counter(p.suffix for p in BlocksPath.glob('*.z*'))

Counter({'.zip': 56})

#### list all the files in the Census download directory

In [10]:
file_list=[]
directory = BlocksPath
for path in sorted(directory.rglob('*')):
        depth = len(path.relative_to(directory).parts)
        file_list.append(path.name)

In [11]:
# set various counting variables
counter = 0
decimals = 0   
# start a timer
start = timeit.default_timer()

# create a blank list to accumulate the dataframes
df_list = []
count_records = 0
# a list of state fips to be included (eliminates Puerto Rico, Samoa,Guam,Mariana Islans, Virgin Island, and Island Areas) 
# see https://www.census.gov/geo/maps-data/data/tallies/dfblock.html
st_list = ['01','04','05','06','08','09','10','11','12','13','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31','32','33','34','35','36','37','38','39','40','41','42','44','45','46','47','48','49','50','51','53','54','55','56']


for index, zip_filename in enumerate(file_list):
    
    # create a file path for the download
    zip_path_file = BlocksPath.joinpath(file_list[index])
    
    # open downloaded file for df
    fh = open(zip_path_file, "rb")
    zp = zipfile.ZipFile(fh)
    
    # call df function
    extract_files(zip_path_file)
    
    # assign file handles to each of the files exdfed
    cpg, dbf, prj, shp, xml1, xml2, shx = [filename for filename in zp.namelist()]
    
    fh.close
    zp.close
    print(shp)
    
    # add a path to the shapefile
    shape_file = ftpTmp.joinpath(shp)
    
    # call function to create a geodataframe
    df_gdf = get_data(shape_file)
    
    # call function to append the geodataframe to a list
    count_records = append_list(df_gdf, count_records)
    
    #removes exdfed files and zip file
    zip_path_file.unlink()
    shape_file.unlink()
    file_to_rem = ftpTmp.joinpath(cpg)
    file_to_rem.unlink()    
    file_to_rem = ftpTmp.joinpath(dbf)
    file_to_rem.unlink()    
    file_to_rem = ftpTmp.joinpath(xml1)
    file_to_rem.unlink()    
    file_to_rem = ftpTmp.joinpath(xml2)
    file_to_rem.unlink()    
    file_to_rem = ftpTmp.joinpath(shx)
    file_to_rem.unlink()    
    file_to_rem = ftpTmp.joinpath(prj)
    file_to_rem.unlink()

    # get the current time on timer
    stop = timeit.default_timer()
    
    #create a numpy array to calculate time
    timer = np.array([(stop-start)/60])
    # call function to calculate minutes and seconds
    min_sec = get_time(timer)
    minutes, seconds = min_sec[0], min_sec[1]
    
    # clear the output below this cell
    clear_output(wait=True)
    
    # print a few lines for progress monitoring
    print('Processed df file:',zip_filename)
    print('Timer:', minutes, 'minutes', seconds, 'seconds')
    print('Current record count:', "{:,}".format(count_records),'\n')
    counter+=1
    
# clear the output below this cell
clear_output(wait=True)
print('\n\nProcessing Complete\nTotal time:', minutes, 'minutes', seconds, 'seconds')
print('\nTotal count:', "{:,}".format(count_records))



Processing Complete
Total time: 16 minutes 27 seconds

Total count: 10,498,069


#### Remove the ftp temp folder

In [12]:
ftpTmp.rmdir()

#### Create a Pandas dataframe containing all tract records

In [15]:
%time df_out = pd.concat(df_list)

CPU times: user 1.38 s, sys: 3.51 s, total: 4.88 s
Wall time: 5.6 s


#### Create directory path for output file

In [16]:
outputPath = Path('../data/blocks/')
outputPath.mkdir(exist_ok=True, parents=True)

#### Name zip file

In [17]:
outputZip = 'block_centroids.csv.gz'

#### Create full path with zip file

In [18]:
out_Zip = outputPath.joinpath(outputZip)

#### Write dataframe to compressed CSV

In [19]:
# start a timer
start = timeit.default_timer()
print ('Compressing dataframe. Please be patient.')
df_out.to_csv(out_Zip, compression='gzip', index=None)
clear_output(wait=True)
# get the current time on timer
stop = timeit.default_timer()
timer = np.array([(stop-start)/60])
min_sec = get_time(timer)
minutes, seconds = min_sec[0], min_sec[1]
print('\nData compression complete.\nTotal time:', minutes, 'minutes', seconds, 'seconds')


Data compression complete.
Total time: 2 minutes 27 seconds


In [20]:
print('Number of tract records written to file:',"{:,}".format(len(df_out)))

Number of tract records written to file: 10,498,069


In [21]:
df_out.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 10498069 entries, 0 to 86203
Data columns (total 3 columns):
block_geoid    object
lat            object
lon            object
dtypes: object(3)
memory usage: 320.4+ MB


In [22]:
df_out.head()

Unnamed: 0,block_geoid,lat,lon
0,10890111001560,34.5974352,-86.6606772
2,10890105012010,34.8659936,-86.7366266
4,10890106241007,34.7863755,-86.7546843
5,10630600002024,32.9585581,-88.0510053
6,10630602001058,32.755908,-88.0879299
