## Retrieve Origin-Destination Data

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import zipfile
import timeit
from IPython.display import clear_output

#### Function - Download one state's OD data file from the remote site

In [2]:
def get_data(filename, counter):
    """Load one CSV file into a dataframe whose columns are all strings.

    Parameters
    ----------
    filename : path or URL handed straight to ``pd.read_csv``.
    counter : value shown in the progress line; it is only printed.

    Returns
    -------
    pandas.DataFrame with every column read as ``str``.
    """
    # announce the file before the (possibly slow) read starts
    print ('\nCurrently processing #', counter, filename)
    return pd.read_csv(filename, dtype=str)

#### Function - Convert decimal time to minutes and seconds

In [3]:
def get_time(timer):
    """Split an elapsed time given in decimal minutes into minutes and seconds.

    Parameters
    ----------
    timer : one-element numpy array (as built by the callers in this notebook)
        or a plain number, holding elapsed minutes as a decimal value.

    Returns
    -------
    list ``[minutes, seconds]`` of two Python ints, with ``0 <= seconds < 60``.
    """
    # .item() replaces np.asscalar, which was removed from NumPy; it also
    # accepts 0-d input, so plain floats work too.
    elapsed_minutes = np.asarray(timer).item()
    # Round the total once and then split it, so a value such as 0.9999
    # minutes becomes [1, 0] rather than [0, 60].
    total_seconds = int(round(elapsed_minutes * 60))
    minutes, seconds = divmod(total_seconds, 60)
    return [minutes, seconds]

#### A list of state abbreviations
Used in the loop below to build the URL of each state's file on the remote site.  
Puerto Rico (pr), the Virgin Islands (vi), and Wyoming (wy) do not have data for 2015, so they are omitted from the list.

In [4]:
# Lowercase two-letter codes, in alphabetical order; each one is inserted into
# the download URL by the loop that iterates over this list.
state_list = (
    'ak al ar az ca co ct dc de fl ga hi ia id il in ks '
    'ky la ma md me mi mn mo ms mt nc nd ne nh nj nm nv '
    'ny oh ok or pa ri sc sd tn tx ut va vt wa wi wv'
).split()

#### Loop through each state, download the compressed file, extract data, append to list

In [5]:
# The two per-state files differ only in one piece of the file name and in the
# label printed while they load, so a single loop handles both.
# Each entry: (file-name part, label shown in the progress output)
od_parts = [('main', 'OD'), ('aux', 'AUX')]

# set various counting variables
counter = 0
count_records = 0
# defaults so the final summary still prints if no file gets processed
minutes, seconds = 0, 0
# start a timer
start = timeit.default_timer()
# create a blank list to accumulate the dataframes
df_list = []

# every state's 'main' file is loaded first, then every state's 'aux' file
for part, label in od_parts:
    for st in state_list:
        # file and path of download
        filename = ('https://lehd.ces.census.gov/data/lodes/LODES7/' + st + '/od/'
                    + st + '_od_' + part + '_JT01_2015.csv.gz')

        # call function to load compressed data into dataframe
        df = get_data(filename, counter)

        # keep the frame before the next iteration rebinds df
        df_list.append(df)

        # running total of records loaded so far
        count_records += len(df)

        # clear the output below this cell
        clear_output(wait=True)

        # elapsed time since the timer started, in decimal minutes
        stop = timeit.default_timer()
        timer = np.array([(stop - start) / 60])
        min_sec = get_time(timer)
        minutes, seconds = min_sec[0], min_sec[1]

        # print a few lines for progress monitoring
        print('Processed state ' + label + ' file:', st.upper())
        print('Length of dataframe:', "{:,}".format(len(df)))
        print('Current record count:', "{:,}".format(count_records), '\n')
        print('Timer:', minutes, 'minutes', seconds, 'seconds')
        counter += 1

# clear the output below this cell
clear_output(wait=True)
# minutes/seconds hold the reading taken after the last file was loaded
print('\n\nProcessing Complete\nTotal time:', minutes, 'minutes', seconds, 'seconds',
      '\nTotal count:', "{:,}".format(count_records))



Processing Complete
Total time: 10 minutes 10 seconds 
Total count: 115,682,106


#### Create a dataframe containing all OD records

In [6]:
%time df_out = pd.concat(df_list)

Wall time: 11.8 s


#### Remove Unnecessary column(s)

In [7]:
%time df_out.drop(['createdate'], axis=1, inplace=True)

Wall time: 54.8 s


In [8]:
# Summarize the combined frame: index range, column dtypes, and memory usage.
df_out.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 115682106 entries, 0 to 81999
Data columns (total 12 columns):
w_geocode    object
h_geocode    object
S000         object
SA01         object
SA02         object
SA03         object
SE01         object
SE02         object
SE03         object
SI01         object
SI02         object
SI03         object
dtypes: object(12)
memory usage: 11.2+ GB


#### Create directory path for output file

In [9]:
# Folder (relative to the working directory) that receives the output file;
# create it together with any missing parents, and do nothing if it exists.
outputPath = Path("..") / "data" / "OD"
outputPath.mkdir(parents=True, exist_ok=True)

#### Name the gzip-compressed output file

In [10]:
# File name of the gzip-compressed CSV written at the end of the notebook.
# Despite the 'aux' in the name, the frame written to it holds the records
# from both the main and the aux downloads.
outputZip = 'od_aux.csv.gz'

#### Create full path to the gzip file

In [11]:
# Full path of the output file: the output folder joined with the file name.
out_Zip = outputPath / outputZip

#### Write dataframe to compressed CSV

In [12]:
# start a timer
start = timeit.default_timer()
print ('Compressing dataframe. Please be patient.')
df_out.to_csv(out_Zip, compression='gzip', index=None)
clear_output(wait=True)
# get the current time on timer
stop = timeit.default_timer()
timer = np.array([(stop-start)/60])
min_sec = get_time(timer)
minutes, seconds = min_sec[0], min_sec[1]
print('\nData compression complete.\nTotal time:', minutes, 'minutes', seconds, 'seconds')


Data compression complete.
Total time: 22 minutes 14 seconds
