## Merge ALL  OD and AUX data with Block Centroid and Calculate Distance and Stats

In [1]:
import pandas as pd

from pathlib import Path
import timeit
from IPython.display import clear_output
from pyproj import Geod
import numpy as np

#### Function - Convert decimal time to minutes and seconds

In [2]:
def get_time(timer):
    minutes, seconds = int(np.floor(timer)), round(np.asscalar(timer % 1)*60)
    return[minutes,  seconds]

#### Funtion to calculate distance between work and home

In [3]:
def Distance(lat1,lon1,lat2,lon2):
  az12,az21,dist = wgs84_geod.inv(lon1,lat1,lon2,lat2) 
  return dist

#### Funtion to process the locations with range of employees and merge with centroids

In [4]:
def chunk_process_distance(df):
    # create a group of all workplace geoids
    df_group = df.groupby(['w_geocode']).sum()
    
    # include only the records with # employees in the block
    df_group_limited = df_group[(df_group['S000'] >= 500)]
    
    # create a dataframe with only blocks with 1000+ employees
    df2 = pd.merge(df, df_group_limited, on='w_geocode', suffixes=('','_del'))
    # keep the sum of the S000 to use in later filtering
    df2.rename(columns={'S000_del': 'w_group_count'}, inplace=True)
    df2.drop(list(df2.filter(regex='_del')), axis=1, inplace=True)
    
    
    # create list of all block with > 1000 workers
    #top_block_list = df_group_limited.index.tolist()
    # create a dataframe with only block with > 1000
    #df2 = df[df['w_geocode'].isin(top_block_list)]
    
    # sort the dataframe
    #df2_sort= df2.sort_values(['w_geocode','h_geocode'], ascending=[True, True])
    # reorder df2_sort dataframe
    #df2_sort = df2_sort.reset_index(drop=True)
    # reorder the columns of the dataframe
    #cols = df2_sort.columns.tolist()
    #cols = cols[-2:] + cols[:-2]
    #df2_sort = df2_sort[cols ]
    # group the dataframe by the worker and home block geoid
    #df2_group = df2_sort.groupby(['block_w_geoid','block_h_geoid']).sum().reset_index()
    #df2_sort.drop(['S000','SA01','SA02','SA03','SE01','SE02','SE03','SI01','SI02','SI03'], axis = 1, inplace=True)
    # load the centroid file
    centroids = pd.read_csv(block_file, dtype={'block_geoid': 'object', 'lat': 'float', 'lon': 'float'})
    # merge the centroid data frame with the grouped dataframe
    #merge_result = pd.merge(df2_sort, centroids,left_on=  ['w_geocode'],right_on= ['block_geoid'],how = 'left')
    merge_result = pd.merge(df2, centroids,left_on=  ['w_geocode'],right_on= ['block_geoid'],how = 'left')
    merge_result.drop(['block_geoid'], axis=1, inplace=True)
    merge_result.rename(columns={'lat': 'w_lat', 'lon': 'w_lon'}, inplace=True)
    # merge the centroid data frame on the Fayette county h_geocode column
    merge_result = pd.merge(merge_result, centroids,left_on=  ['h_geocode'],right_on= ['block_geoid'],how = 'left')
    merge_result.drop(['block_geoid'], axis=1, inplace=True)
    merge_result.rename(columns={'lat': 'h_lat', 'lon': 'h_lon'}, inplace=True)
    merge_result['distance'] = Distance(merge_result['w_lat'].tolist(),merge_result['w_lon'].tolist(),merge_result['h_lat'].tolist(),merge_result['h_lon'].tolist())
    merge_result['distance'] = merge_result['distance'].apply(lambda x: round(x, decimals))
    #merge_limited = merge_result[(merge_result['distance'] < 100000) & (merge_result['distance'] > 15000)]
    #merge_limited['geometry'] = merge_limited.apply(lambda x: geom.LineString([(x['w_lon'], x['w_lat'] ), (x['h_lon'],x['h_lat'])]), axis = 1)
    return merge_result

#### Setup output location and file name

In [5]:
ODpath = Path("../data/OD/")
OD_file = ODpath.joinpath("od_aux.csv.gz")
if OD_file.exists ():
    print ("OD file exist")
else:
    print ("OD file does not exist")
    
blockPath = Path("../data/blocks/")
block_file = blockPath.joinpath("block_centroids.csv.gz")
if block_file.exists ():
    print ("Block Centroid file exist")
else:
    print ("Block Centroid file does not exist")

OD file exist
Block Centroid file exist


#### Read the Origin Destination (OD) data into chunk dataframes

In [6]:
%time df_chunk = pd.read_csv(OD_file, compression='gzip', dtype={'w_geocode': str,'h_geocode':str}, chunksize=10000000)

Wall time: 320 ms


#### Read the data in chunks. Filter data by calling function 'chunk_process_distance'

In [None]:
chunk_list = []

wgs84_geod = Geod(ellps='WGS84') 
#Distance will be measured on this ellipsoid - more accurate than a spherical method

chunk_num = 1
decimals = 0   

print('Reading in the chunk dataframe')
start = timeit.default_timer()
for df in df_chunk:
    print('Starting processing for chunk #', chunk_num)
    # call function to group and filter the data 
    filter_chunk = chunk_process_distance(df)
    # append the filtered data to list
    chunk_list.append(filter_chunk)
    
    # get the current time on timer
    stop = timeit.default_timer()
    timer = np.array([(stop-start)/60])
    min_sec = get_time(timer)
    minutes, seconds = min_sec[0], min_sec[1]
    
    clear_output(wait=True)
    print('Chunk number:', chunk_num)
    print('Length of dataframe:',"{:,}".format(len(filter_chunk)),'\n')
    print('Timer:', minutes, 'minutes', seconds, 'seconds')
    chunk_num += 1

print('\nData merge complete.')

Reading in the chunk dataframe
Starting processing for chunk # 1


#### Concatenate the chunk list into a dataframe

In [8]:
# concat the list into dataframe 
%time df_concat = pd.concat(chunk_list)
print('Length of concatenated dataframe:',"{:,}".format(len(df_concat)),'\n')

Wall time: 5.68 s
Length of concatenated dataframe: 45,300,441 



In [9]:
outputZip = 'od_distance_500_unclean.csv.gz'

#### Create full path with zip file

In [10]:
out_Zip = ODpath.joinpath(outputZip)

#### Write all the unclean OD line data to compressed csv file

In [11]:
# start a timer
start = timeit.default_timer()
print ('Compressing dataframe. Please be patient.')
df_concat.to_csv(out_Zip, compression='gzip', index=None)
clear_output(wait=True)
# get the current time on timer
stop = timeit.default_timer()
timer = np.array([(stop-start)/60])
min_sec = get_time(timer)
minutes, seconds = min_sec[0], min_sec[1]
print('\nData compression complete.\nTotal time:', minutes, 'minutes', seconds, 'seconds')


Data compression complete.
Total time: 21 minutes 9 seconds


In [7]:
df_concat = pd.read_csv(out_Zip, compression='gzip', dtype={'w_geocode': str,'h_geocode':str})

#### Arrange the columns

In [12]:
df_concat = df_concat[['w_geocode','h_geocode','distance','w_group_count', 'S000', 'SA01', 'SA02', 'SA03', 'SE01', 'SE02', 'SE03', 'SI01', 'SI02', 'SI03','w_lat','w_lon','h_lat','h_lon']]
df_concat.head()

Unnamed: 0,w_geocode,h_geocode,distance,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon
0,20160002001050,20160001001250,,1042,3,0,1,2,0,2,1,2,1,0,,,,
1,20160002001050,20160001001270,,1042,1,0,1,0,0,0,1,1,0,0,,,,
2,20160002001050,20160001001453,,1042,1,1,0,0,0,1,0,1,0,0,,,,
3,20160002001050,20160001001461,,1042,1,1,0,0,0,1,0,0,0,1,,,,
4,20160002001050,20160001001475,,1042,1,0,1,0,0,1,0,1,0,0,,,,


#### Identify null records

In [13]:
df_null = df_concat[df_concat.isnull().any(axis=1)]
print ('\nthe number of null records:', "{:,}".format(len(df_null)),'\n\n')
df_null.head()


the number of null records: 234,893 




Unnamed: 0,w_geocode,h_geocode,distance,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon
0,20160002001050,20160001001250,,1042,3,0,1,2,0,2,1,2,1,0,,,,
1,20160002001050,20160001001270,,1042,1,0,1,0,0,0,1,1,0,0,,,,
2,20160002001050,20160001001453,,1042,1,1,0,0,0,1,0,1,0,0,,,,
3,20160002001050,20160001001461,,1042,1,1,0,0,0,1,0,0,0,1,,,,
4,20160002001050,20160001001475,,1042,1,0,1,0,0,1,0,1,0,0,,,,


#### Remove null records

In [14]:
df_clean = df_concat.dropna(how='any')
df_null = df_concat[df_concat.isnull().any(axis=1)]
print ('Length of cleaned dataframe:',"{:,}".format(len(df_clean)),'\n')
df_clean.head(3)

Length of cleaned dataframe: 45,065,548 



Unnamed: 0,w_geocode,h_geocode,distance,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon
63272,10010205001001,10010201001016,5896.0,1076,1,1,0,0,1,0,0,0,1,0,32.45674,-86.415025,32.466619,-86.476649
63273,10010205001001,10010201001025,6964.0,1076,1,1,0,0,0,1,0,0,1,0,32.45674,-86.415025,32.459802,-86.489003
63274,10010205001001,10010201002016,7088.0,1076,2,0,2,0,0,0,2,0,2,0,32.45674,-86.415025,32.474176,-86.487559


#### Remove the precision on the distance

In [15]:
df_clean.distance =  df_clean.distance.map(lambda x: '%.0f' % x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


#### Reduce the precision on the lat & lon columns

In [16]:
df_clean.w_lat =  df_clean.w_lat.map(lambda x: '%.3f' % x)
df_clean.w_lon =  df_clean.w_lon.map(lambda x: '%.3f' % x)
df_clean.h_lat =  df_clean.h_lat.map(lambda x: '%.3f' % x)
df_clean.h_lon =  df_clean.h_lon.map(lambda x: '%.3f' % x)

In [17]:
df_clean.head(3)

Unnamed: 0,w_geocode,h_geocode,distance,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon
63272,10010205001001,10010201001016,5896,1076,1,1,0,0,1,0,0,0,1,0,32.457,-86.415,32.467,-86.477
63273,10010205001001,10010201001025,6964,1076,1,1,0,0,0,1,0,0,1,0,32.457,-86.415,32.46,-86.489
63274,10010205001001,10010201002016,7088,1076,2,0,2,0,0,0,2,0,2,0,32.457,-86.415,32.474,-86.488


#### Name zip file

In [4]:
outputZip = 'od_distance_500.csv.gz'

#### Create full path with zip file

In [5]:
out_Zip = ODpath.joinpath(outputZip)

#### Write all the OD line data to compressed csv file

In [20]:
# start a timer
start = timeit.default_timer()
print ('Compressing dataframe. Please be patient.')
df_clean.to_csv(out_Zip, compression='gzip', index=None)
clear_output(wait=True)
# get the current time on timer
stop = timeit.default_timer()
timer = np.array([(stop-start)/60])
min_sec = get_time(timer)
minutes, seconds = min_sec[0], min_sec[1]
print('\nData compression complete.\nTotal time:', minutes, 'minutes', seconds, 'seconds')


Data compression complete.
Total time: 14 minutes 45 seconds
