## Create the Statistics File

#### Function - Process the statistics 

In [15]:
def chunk_process_stats(df):
    # create a group of all workplace geoids
    #df.drop(['h_geocode'], axis = 1, inplace = True)
    df_group = df.groupby(['w_geocode']).sum()
    return df_group

In [16]:
%time df_chunk = pd.read_csv(OD_file, compression='gzip', dtype={'w_geocode': str,'h_geocode':str}, chunksize=10000000)

CPU times: user 3.72 ms, sys: 18.6 ms, total: 22.3 ms
Wall time: 50.3 ms


In [17]:
chunk_num = 1
decimals = 0   
chunk_stats_list=[]


print('Reading in the chunk dataframe')
start = timeit.default_timer()
for df in df_chunk:
    print('Starting processing for chunk #', chunk_num)
    # call function to group and filter the data 
    stats_filter_chunk = chunk_process_stats(df)
    # append the filtered data to list
    chunk_stats_list.append(stats_filter_chunk)
    
    # get the current time on timer
    stop = timeit.default_timer()
    timer = np.array([(stop-start)/60])
    min_sec = get_time(timer)
    minutes, seconds = min_sec[0], min_sec[1]
    
    clear_output(wait=True)
    print('Chunk number:', chunk_num)
    print('Length of dataframe:',"{:,}".format(len(stats_filter_chunk)),'\n')
    print('Timer:', minutes, 'minutes', seconds, 'seconds')
    chunk_num += 1

print('\nData group stats complete.')

Chunk number: 12
Length of dataframe: 660,179 

Timer: 3 minutes 24 seconds

Data group stats complete.


#### Create a dataframe from the chunk list

In [18]:
# concat the list into dataframe 
%time df_concat_stats = pd.concat(chunk_stats_list,ignore_index=False)

print ('\nrecords loaded to dataframe:', "{:,}".format(len(df_concat_stats)),'\n\n')
df_concat_stats.head(1)

CPU times: user 205 ms, sys: 100 ms, total: 305 ms
Wall time: 328 ms

records loaded to dataframe: 2,815,940 




Unnamed: 0_level_0,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03
w_geocode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10010201001000,6,1,3,2,3,3,0,3,0,3


In [19]:
df_concat_stats.reset_index(drop=False, inplace=True)
df_concat_stats.head()

Unnamed: 0,w_geocode,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03
0,10010201001000,6,1,3,2,3,3,0,3,0,3
1,10010201001016,17,1,13,3,5,1,11,0,0,17
2,10010201001018,24,3,12,9,0,12,12,17,7,0
3,10010201001022,267,23,167,77,67,66,134,0,0,267
4,10010201001023,6,2,1,3,1,3,2,0,0,6


#### Group the concatentated dataframe

In [20]:
df_group = df_concat_stats.groupby(['w_geocode']).sum()
print ('\nrecords loaded to dataframe:', "{:,}".format(len(df_group)),'\n\n')


records loaded to dataframe: 2,085,972 




#### Add centroid info for the work block location

In [21]:
# load the centroid file
centroids = pd.read_csv(block_file, dtype={'block_geoid': 'object', 'lat': 'float', 'lon': 'float'})
# merge the centroid data frame with the grouped dataframe
merge_result = pd.merge(df_group, centroids,left_on=  ['w_geocode'],right_on= ['block_geoid'],how = 'left')
#merge_result.drop(['block_geoid'], axis=1, inplace=True)
merge_result.rename(columns={'block_geoid': 'w_geocode', 'lat': 'w_lat', 'lon': 'w_lon'}, inplace=True)
merge_result = merge_result[['w_geocode','S000','SA01','SA02','SA03','SE01','SE02','SE03','SI01','SI02','SI03','w_lat','w_lon']]
print ('\nrecords in dataframe:', "{:,}".format(len(merge_result)),'\n')
df_stats = merge_result[(merge_result['S000'] >= 1000)]
print ('records in dataframe with 1000+ employees:', "{:,}".format(len(df_stats)),'\n\n')
df_stats.reset_index(drop=True, inplace=True)
df_stats.head(1)


records in dataframe: 2,085,972 

records in dataframe with 1000+ employees: 17,355 




Unnamed: 0,w_geocode,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon
0,10010205001001,1100,536,426,138,534,412,154,13,540,547,32.45674,-86.415025


#### Are there any nulls?

In [22]:
df_null = df_stats[df_stats.isnull().any(axis=1)]
print ('\nthe number of null records:', "{:,}".format(len(df_null)),'\n\n')
df_null.head()


the number of null records: 122 




Unnamed: 0,w_geocode,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon
179,,1530,257,924,349,599,522,409,1189,308,33,,
180,,1042,161,551,330,581,312,149,956,49,37,,
181,,1572,819,585,168,563,789,220,4,1056,512,,
182,,2857,300,1854,703,85,330,2442,46,6,2805,,
183,,1937,273,1163,501,45,143,1749,1328,0,609,,


#### Remove rows with any nulls

In [23]:
df_stats_clean = df_stats.dropna(how='any')
df_null = df_stats[df_stats.isnull().any(axis=1)]
df_stats_clean.head()

Unnamed: 0,w_geocode,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon
0,10010205001001,1100,536,426,138,534,412,154,13,540,547,32.45674,-86.415025
1,10030107032109,1009,498,406,105,517,332,160,0,584,425,30.667644,-87.849564
2,10030112023027,1088,173,656,259,93,413,582,0,0,1088,30.518815,-87.88825
3,10030115021041,1280,560,522,198,719,397,164,0,805,475,30.372959,-87.68456
4,10059505002038,1448,432,793,223,105,1020,323,1448,0,0,31.801006,-85.332896


In [24]:
outputZip2 = 'od_stats.csv.gz'

In [25]:
out_Zip = ODpath.joinpath(outputZip2)

In [26]:
%time df_stats_clean.to_csv(out_Zip, compression='gzip', index=False)

CPU times: user 640 ms, sys: 68.4 ms, total: 709 ms
Wall time: 957 ms
