## Retrieve Origin-Destination Data

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import zipfile
import timeit
from IPython.display import clear_output
import requests

#### Function - Downloads each state OD data from the remote site

In [2]:
def get_data(filename, counter):
    print ('\nCurrently processing #', counter, filename)
    dataframe = pd.read_csv(filename, dtype=str)
    return dataframe

#### Function - Convert decimal time to minutes and seconds

In [3]:
def get_time(timer):
    #minutes, seconds = int(np.floor(timer)), round(np.asscalar(timer % 1)*60)
    minutes, seconds = int(np.floor(timer)), round(np.ndarray.item(timer % 1)*60)
    #numpy.ndarray.item() 
    return[minutes,  seconds]

#### A list of state abbreviations
To be used to identify the files to extract from remote site. Of use in the loop below.

In [4]:
state_list = ['ak','al', 'ar', 'az', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga', 'ia', 'id', 'il', 'in', 'ks',
              'ky', 'la', 'ma', 'md', 'me', 'mi', 'mn', 'mo', 'ms', 'mt', 'nc', 'nd', 'ne', 'nh', 'nj', 'nm', 'nv',
              'ny', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx', 'ut', 'va', 'vt', 'wa', 'wi', 'wv','wy']

#### Loop through each state, download the compressed file, extract data, append to CSV

In [5]:
# set various counting variables
counter = 0
decimals = 0   
# start a timer
start = timeit.default_timer()
# create a blank list to accumulate the dataframes
df_list = []
count_records = 0
year='2017'
valid_state_list = []

for st in state_list:
    # file and path of download
    filename = 'https://lehd.ces.census.gov/data/lodes/LODES7/'+ st + '/od/' + st + '_od_main_JT01_'+ year +'.csv.gz'
    filecheck = requests.get(filename)  #check if file exists on census.gov
    
    if not filecheck:
        print (st.upper(),' - Does NOT Exist')
        continue
    else:
        # call function to load compressed data into dataframe
        df = get_data(filename, counter)

        # count records in current list
        count_records = count_records+len(df)

        # clear the output below this cell
        clear_output(wait=True)

        # get the current time on timer
        stop = timeit.default_timer()
        timer = np.array([(stop-start)/60])
        min_sec = get_time(timer)
        minutes, seconds = min_sec[0], min_sec[1]

        # print a few lines for progress monitoring
        print('Processed state OD file:',st.upper())
        print('Length of',st.upper(), 'dataframe:',"{:,}".format(len(df)))
        print('\nTotal record count:', "{:,}".format(count_records))
        print('Total time:', minutes, 'minutes', seconds, 'seconds')
        counter+=1

        # append df to list before next state overwrites df
        df_list.append(df)
        valid_state_list.append(st)
        print('\nStates processed: ',[x.upper() for x in valid_state_list])
        
for st in valid_state_list:
    # file and path of download
    filename = 'https://lehd.ces.census.gov/data/lodes/LODES7/'+ st + '/od/' + st + '_od_aux_JT01_' + year + '.csv.gz'
    #https://lehd.ces.census.gov/data/lodes/LODES7/ak/od/ak_od_aux_JT01_2015.csv.gz
    
    # call function to load compressed data into dataframe
    df = get_data(filename, counter)
    
    # count records in current list
    count_records = count_records+len(df)

    # clear the output below this cell
    clear_output(wait=True)
    
    # get the current time on timer
    stop = timeit.default_timer()
    timer = np.array([(stop-start)/60])
    min_sec = get_time(timer)
    minutes, seconds = min_sec[0], min_sec[1]
    
    # print a few lines for progress monitoring
    print('Processed state AUX file:',st.upper())
    print('Length of ',st.upper(), ' dataframe:',"{:,}".format(len(df)))
    print('\nTotal record count:', "{:,}".format(count_records))
    print('Total time:', minutes, 'minutes', seconds, 'seconds')
    counter+=1
    
    # append df to list before next state overwrites df
    df_list.append(df)

# clear the output below this cell
clear_output(wait=True)

print('Processed state AUX file:',st.upper())
print('Length of',st.upper(), 'dataframe:',"{:,}".format(len(df)))
print('\n\nAll state files retrieved. Total time:', minutes, 'minutes', seconds, 'seconds',
      '\nTotal number of records:', "{:,}".format(count_records))

print('\n\nStates with missing OD & AUX data: ',list(set(state_list)-set(valid_state_list)))

Processed state OD file: WY
Length of WY dataframe: 21,477


All state files retrieved. Total time: 23 minutes 46 seconds 
Total number of records: 117,907,270


States with missing OD & AUX data:  ['ak', 'sd']


#### Create a dataframe containing all OD records

In [6]:
%time df_out = pd.concat(df_list)

CPU times: user 12 s, sys: 30.5 s, total: 42.4 s
Wall time: 1min 2s


#### Remove Unnecessary column(s)

In [7]:
%time df_out.drop(['createdate'], axis=1, inplace=True)

CPU times: user 22.6 s, sys: 1min 11s, total: 1min 33s
Wall time: 2min 53s


In [8]:
df_out.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 117907270 entries, 0 to 21476
Data columns (total 12 columns):
w_geocode    object
h_geocode    object
S000         object
SA01         object
SA02         object
SA03         object
SE01         object
SE02         object
SE03         object
SI01         object
SI02         object
SI03         object
dtypes: object(12)
memory usage: 11.4+ GB


#### Create directory path for output file

In [9]:
outputPath = Path("../data/OD/")
outputPath.mkdir(exist_ok=True, parents=True)

#### Name zip file

In [10]:
outputZip = 'od_aux.csv.gz'

#### Create full path with zip file

In [11]:
out_Zip = outputPath.joinpath(outputZip)

#### Write dataframe to compressed CSV

In [12]:
# start a timer
start = timeit.default_timer()
print ('Compressing dataframe. Please be patient.')
df_out.to_csv(out_Zip, compression='gzip', index=None)
clear_output(wait=True)
# get the current time on timer
stop = timeit.default_timer()
timer = np.array([(stop-start)/60])
min_sec = get_time(timer)
minutes, seconds = min_sec[0], min_sec[1]
print('\nData compression complete.\nTotal time:', minutes, 'minutes', seconds, 'seconds')


Data compression complete.
Total time: 31 minutes 37 seconds
