## Merge OD and AUX data with block centroids for workplace blocks with 1k-15k employees

In [1]:
import pandas as pd

from pathlib import Path
import timeit
from IPython.display import clear_output
from pyproj import Geod
import numpy as np

#### Function - Convert decimal time to minutes and seconds

In [2]:
def get_time(timer):
    """Convert a duration in decimal minutes into whole minutes and seconds.

    Parameters
    ----------
    timer : float or single-element array-like
        Elapsed time in minutes, e.g. ``8.15`` or ``np.array([8.15])``.

    Returns
    -------
    list
        ``[minutes, seconds]`` as Python ints with ``0 <= seconds < 60``.

    Raises
    ------
    ValueError
        If ``timer`` holds more than one value.
    """
    # .item() pulls the scalar out of both plain numbers and one-element
    # arrays, avoiding the implicit array -> int conversion.
    total_seconds = int(round(np.asarray(timer, dtype=float).item() * 60))
    # Splitting the rounded total carries a round-up into the minutes,
    # so 0.9999 min becomes [1, 0] rather than [0, 60].
    minutes, seconds = divmod(total_seconds, 60)
    return [minutes, seconds]

#### Function to calculate distance between work and home

In [3]:
def Distance(lat1,lon1,lat2,lon2):
    """Return the distance between the first and second point(s).

    Delegates to the ``inv`` method of the notebook-level ``wgs84_geod``
    object, which must be defined before this function is called. The
    coordinates are handed over in (lon, lat) order and only the third
    element of the returned triple is passed back to the caller.
    """
    # inv() answers with three values; the first two are not needed here.
    _, _, separation = wgs84_geod.inv(lon1, lat1, lon2, lat2)
    return separation

#### Function to process the locations with range of employees and merge with centroids

In [4]:
def chunk_process_distance(df, min_employees=1000, max_employees=15000):
    """Filter one OD chunk to mid-sized workplace blocks and add coordinates and distance.

    Steps:

    1. Total ``S000`` per ``w_geocode`` within ``df``.
    2. Keep only rows whose block total lies in
       ``[min_employees, max_employees)``; the total is attached to each
       row as ``w_group_count``.
    3. Left-join the notebook-level ``centroids`` frame on ``w_geocode``
       and on ``h_geocode`` to add ``w_lat``/``w_lon`` and ``h_lat``/``h_lon``.
    4. Add a ``distance`` column computed by ``Distance`` and rounded to the
       notebook-level ``decimals`` setting.

    Parameters
    ----------
    df : pandas.DataFrame
        One chunk of OD rows; must contain ``w_geocode``, ``h_geocode`` and ``S000``.
    min_employees : int, default 1000
        Inclusive lower bound on a block's ``S000`` total.
    max_employees : int, default 15000
        Exclusive upper bound on a block's ``S000`` total.

    Returns
    -------
    pandas.DataFrame
        The retained rows with ``w_group_count``, the four coordinate
        columns and ``distance`` appended. Rows whose geocode has no
        centroid keep NaN coordinates.

    Notes
    -----
    Reads the notebook globals ``centroids``, ``decimals`` and ``Distance``,
    all of which must exist before the call.

    NOTE(review): totals are computed within the chunk only, so a workplace
    block whose rows straddle two chunks is judged on partial totals.
    Confirm whether the input ordering and chunk size make that acceptable.
    """
    # Only the S000 total is used downstream. Summing every column would
    # also aggregate the string geocode column on pandas 2.x.
    block_totals = df.groupby('w_geocode')['S000'].sum()
    within_range = (block_totals >= min_employees) & (block_totals < max_employees)
    block_counts = block_totals[within_range].rename('w_group_count').reset_index()

    # Inner join: rows of blocks outside the range drop out here.
    selected = pd.merge(df, block_counts, on='w_geocode')

    located = _attach_centroid(selected, 'w_geocode', 'w')
    located = _attach_centroid(located, 'h_geocode', 'h')

    located['distance'] = Distance(
        located['w_lat'].tolist(),
        located['w_lon'].tolist(),
        located['h_lat'].tolist(),
        located['h_lon'].tolist(),
    )
    # Vectorised rounding instead of a per-row Python lambda.
    located['distance'] = located['distance'].round(decimals)
    return located


def _attach_centroid(frame, key, prefix):
    """Left-join the global ``centroids`` on ``key`` and name the result ``<prefix>_lat``/``<prefix>_lon``."""
    joined = pd.merge(frame, centroids, left_on=[key], right_on=['block_geoid'], how='left')
    return (
        joined
        .drop(columns=['block_geoid'])
        .rename(columns={'lat': prefix + '_lat', 'lon': prefix + '_lon'})
    )

#### Input locations and file names

In [5]:
# Origin-destination input: report whether the file is present.
ODpath = Path("../data/OD/")
OD_file = ODpath / "od_aux.csv.gz"
print("OD file exist" if OD_file.exists() else "OD file does not exist")

# Block centroid input: same presence check.
blockPath = Path("../data/blocks/")
block_file = blockPath / "block_centroids_ftp.csv.gz"
print("Block Centroid file exist" if block_file.exists() else "Block Centroid file does not exist")

OD file exist
Block Centroid file exist


#### Load the centroid file to dataframe for merging

In [6]:
# The geoid column is read as text rather than parsed as a number;
# the coordinate columns are read as floats.
centroid_dtypes = {'block_geoid': 'object', 'lat': 'float', 'lon': 'float'}
centroids = pd.read_csv(block_file, dtype=centroid_dtypes)

#### Read the Origin Destination (OD) data into chunk dataframes

In [7]:
# With chunksize set, read_csv returns a lazy chunk iterator instead of a DataFrame,
# so the %time figure covers only opening the reader; the file itself is parsed
# when the iterator is consumed. The iterator is single-use: re-run this cell
# before looping over df_chunk again.
%time df_chunk = pd.read_csv(OD_file, compression='gzip', dtype={'w_geocode': str,'h_geocode':str}, chunksize=10000000)

Wall time: 81.8 ms


#### Read the data in chunks. Filter data by calling function 'chunk_process_distance'

In [8]:
# Geodesic calculator on the WGS84 ellipsoid, read by Distance();
# more accurate than a spherical method.
wgs84_geod = Geod(ellps='WGS84')

# Number of decimal places kept on the distance column
# (read by chunk_process_distance).
decimals = 0

# Filtered chunks are collected here and concatenated in a later cell.
chunk_list = []
chunk_num = 1

print('Reading in the chunk dataframe')
start = timeit.default_timer()
for df in df_chunk:
    print('Starting processing for chunk #', chunk_num)
    # Group, filter and geocode this chunk, then keep the result.
    filter_chunk = chunk_process_distance(df)
    chunk_list.append(filter_chunk)

    # Elapsed minutes since the loop started, wrapped in a one-element
    # array before being handed to get_time.
    stop = timeit.default_timer()
    timer = np.array([(stop - start) / 60])
    minutes, seconds = get_time(timer)

    # Replace the previous progress report with the current one.
    clear_output(wait=True)
    print('Chunk number:', chunk_num)
    print('Length of dataframe:', "{:,}".format(len(filter_chunk)), '\n')
    print('Timer:', minutes, 'minutes', seconds, 'seconds')
    chunk_num += 1

print('\nData merge complete.')

Chunk number: 12
Length of dataframe: 744,335 

Timer: 8 minutes 9 seconds

Data merge complete.


#### Concatenate the chunk list into a dataframe

In [9]:
# concat the list into dataframe 
%time df_concat = pd.concat(chunk_list)
print('Length of concatenated dataframe:',"{:,}".format(len(df_concat)),'\n')

Wall time: 3.15 s
Length of concatenated dataframe: 28,339,159 



In [10]:
# File name for the merged data as written before null rows are removed
outputZip = 'od_distance_1k-15k_unclean.csv.gz'

#### Create full path with zip file

In [11]:
# The output file goes in the same directory as the OD input data
out_Zip = ODpath / outputZip

#### Write all the unclean OD line data to compressed csv file

In [12]:
# Time the gzip-compressed CSV export of the full (unclean) frame.
start = timeit.default_timer()
print ('Compressing dataframe. Please be patient.')
df_concat.to_csv(out_Zip, compression='gzip', index=None)
clear_output(wait=True)
# Elapsed minutes go to get_time as a one-element array.
elapsed_minutes = np.array([(timeit.default_timer() - start) / 60])
minutes, seconds = get_time(elapsed_minutes)
print('\nData compression complete.\nTotal time:', minutes, 'minutes', seconds, 'seconds')


Data compression complete.
Total time: 11 minutes 59 seconds


#### Arrange the columns

In [13]:
# Geocodes and derived fields first, then the job-count columns, then the
# coordinates. Columns not listed here are dropped by the selection.
column_order = [
    'w_geocode', 'h_geocode', 'distance', 'w_group_count',
    'S000', 'SA01', 'SA02', 'SA03',
    'SE01', 'SE02', 'SE03',
    'SI01', 'SI02', 'SI03',
    'w_lat', 'w_lon', 'h_lat', 'h_lon',
]
df_concat = df_concat[column_order]
df_concat.head()

Unnamed: 0,w_geocode,h_geocode,distance,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon
0,10010205001001,10010201001027,6962.0,1119,1,0,0,1,1,0,0,0,1,0,32.45674,-86.415025,32.451653,-86.488828
1,10010205001001,10010201002005,8132.0,1119,1,1,0,0,0,1,0,0,0,1,32.45674,-86.415025,32.493159,-86.490113
2,10010205001001,10010201002006,7700.0,1119,1,1,0,0,1,0,0,0,0,1,32.45674,-86.415025,32.481959,-86.491338
3,10010205001001,10010201002016,7088.0,1119,2,0,1,1,0,1,1,0,2,0,32.45674,-86.415025,32.474176,-86.487559
4,10010205001001,10010201002017,6830.0,1119,4,2,1,1,1,1,2,0,3,1,32.45674,-86.415025,32.477237,-86.483532


#### Identify null records

In [14]:
# A row counts as a null record when at least one of its columns is missing.
has_missing = df_concat.isnull().any(axis=1)
df_null = df_concat[has_missing]
print ('\nthe number of null records:', "{:,}".format(len(df_null)),'\n\n')
df_null.head()


the number of null records: 10,957 




Unnamed: 0,w_geocode,h_geocode,distance,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon
2120571,60816103021033,60014005001014,,1109,1,0,0,1,0,0,1,0,0,1,,,37.84922,-122.264232
2120572,60816103021033,60014008002013,,1109,1,1,0,0,0,0,1,1,0,0,,,37.84556,-122.27841
2120573,60816103021033,60014045021006,,1109,1,0,1,0,0,0,1,0,0,1,,,37.839373,-122.202255
2120574,60816103021033,60014048001005,,1109,1,0,1,0,0,0,1,0,0,1,,,37.800936,-122.214607
2120575,60816103021033,60014051005005,,1109,1,0,1,0,0,0,1,1,0,0,,,37.813526,-122.233665


#### Remove null records

In [15]:
# Keep only the rows in which every column is populated. The explicit copy
# makes df_clean a frame of its own, so the column assignments in later
# cells act on it directly.
df_clean = df_concat.dropna(how='any').copy()
# The null rows were already collected into df_null by the previous cell,
# so the identical full-frame mask is not rebuilt here.
print ('Length of cleaned dataframe:',"{:,}".format(len(df_clean)),'\n')
df_clean.head(3)

Length of cleaned dataframe: 28,328,202 



Unnamed: 0,w_geocode,h_geocode,distance,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon
0,10010205001001,10010201001027,6962.0,1119,1,0,0,1,1,0,0,0,1,0,32.45674,-86.415025,32.451653,-86.488828
1,10010205001001,10010201002005,8132.0,1119,1,1,0,0,0,1,0,0,0,1,32.45674,-86.415025,32.493159,-86.490113
2,10010205001001,10010201002006,7700.0,1119,1,1,0,0,1,0,0,0,0,1,32.45674,-86.415025,32.481959,-86.491338


#### Remove the precision on the distance

In [16]:
# Store the distance as whole numbers in an integer column. This is vectorised,
# keeps the column numeric, and writes the same CSV text as per-row '%.0f'
# string formatting. The cast relies on df_clean containing no nulls
# (they were dropped when df_clean was created).
df_clean['distance'] = df_clean['distance'].round(0).astype('int64')

#### Reduce the precision on the lat & lon columns

In [17]:
# Rewrite each coordinate column as text with exactly three decimal places.
for coord_col in ('w_lat', 'w_lon', 'h_lat', 'h_lon'):
    df_clean[coord_col] = df_clean[coord_col].map(lambda x: '%.3f' % x)

In [18]:
# Preview the first rows to check the reformatted distance and coordinate columns
df_clean.head(3)

Unnamed: 0,w_geocode,h_geocode,distance,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon
0,10010205001001,10010201001027,6962,1119,1,0,0,1,1,0,0,0,1,0,32.457,-86.415,32.452,-86.489
1,10010205001001,10010201002005,8132,1119,1,1,0,0,0,1,0,0,0,1,32.457,-86.415,32.493,-86.49
2,10010205001001,10010201002006,7700,1119,1,1,0,0,1,0,0,0,0,1,32.457,-86.415,32.482,-86.491


#### Name zip file

In [19]:
# File name for the cleaned data (null rows removed); rebinds outputZip from the earlier export
outputZip = 'od_distance_1k-15k_clean.csv.gz'

#### Create full path with zip file

In [20]:
# The cleaned output also goes in the OD data directory
out_Zip = ODpath / outputZip

#### Write all the OD line data to compressed csv file

In [21]:
# Time the gzip-compressed CSV export of the cleaned frame.
start = timeit.default_timer()
print ('Compressing dataframe. Please be patient.')
df_clean.to_csv(out_Zip, compression='gzip', index=None)
clear_output(wait=True)
# Elapsed minutes go to get_time as a one-element array.
elapsed_minutes = np.array([(timeit.default_timer() - start) / 60])
minutes, seconds = get_time(elapsed_minutes)
print('\nData compression complete.\nTotal time:', minutes, 'minutes', seconds, 'seconds')


Data compression complete.
Total time: 8 minutes 38 seconds
