## Write one combined file for all states and a separate file for each state in the 1k-15k data range

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import zipfile
import timeit
from IPython.display import clear_output

In [2]:
# Locate the cleaned origin-destination extract and report whether it is on disk.
ODpath = Path("../data/OD/")
OD_file = ODpath / "od_distance_1k-15k_clean.csv.gz"
print("OD file exist" if OD_file.exists() else "OD file does not exist")
    

OD file exist


In [3]:
# Load the gzip-compressed OD extract into `df`, timing the read with %time.
# Both geocode columns are forced to str rather than left to type inference.
%time df = pd.read_csv(OD_file, compression='gzip', dtype={'w_geocode': str,'h_geocode':str})

Wall time: 2min 16s


#### Group the data to verify max and min values
The summed values should fall roughly between 1k and 15k.

In [4]:
# Total S000 per workplace geocode, used only to sanity-check the value range.
# Only S000 is aggregated: summing every column would also try to "sum" the
# string h_geocode column, which is wasted work (or concatenation) on a frame
# this size.
df_group = df.groupby('w_geocode')[['S000']].sum()
print('The max summed S000 is:', df_group.S000.max())
print('The min summed S000 is:', df_group.S000.min())

The max summed S000 is: 16312
The min summed S000 is: 999


#### Identify all state FIPS codes in data

In [5]:
# Distinct two-character prefixes of the workplace geocode, in order of first appearance.
state_list = (
    df['w_geocode']
    .str[:2]
    .unique()
    .tolist()
)
print(len(state_list))

50


#### Create a column to contain the state FIPS

In [6]:
# Tag every row with the first two characters of its workplace geocode.
df['state'] = df['w_geocode'].str[:2]
print('Number of records in dataframe:', df.shape[0])
# Last expression in the cell: shows a preview of the first rows.
df.head()

Number of records in dataframe: 27605039


Unnamed: 0,w_geocode,h_geocode,distance,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon,state
0,20160002001050,20160001001250,716908,1042,3,0,1,2,0,2,1,2,1,0,53.893,-166.534,51.831,-176.63,2
1,20160002001050,20160001001270,714934,1042,1,0,1,0,0,0,1,1,0,0,53.893,-166.534,51.88,-176.631,2
2,20160002001050,20160001001453,433979,1042,1,1,0,0,0,1,0,1,0,0,53.893,-166.534,57.197,-170.186,2
3,20160002001050,20160001001461,430850,1042,1,1,0,0,0,1,0,0,0,1,53.893,-166.534,57.133,-170.267,2
4,20160002001050,20160001001475,430559,1042,1,0,1,0,0,1,0,1,0,0,53.893,-166.534,57.127,-170.273,2


#### Filter the data to include commute distances greater than 15 miles and up to 60 miles (`distance` > 24000 and <= 100000)

In [7]:
# Keep rows whose distance is strictly above 24000 and at most 100000.
# NOTE(review): the thresholds look like metres approximating 15 and 60 miles;
# confirm the units of the `distance` column.
df_filter = df.loc[
    df['distance'].gt(24000) & df['distance'].le(100000)
]
print('Number of records after filter for 15-60 mile commutes:', df_filter.shape[0])

Number of records after filter for 15-60 mile commutes: 8401012


#### Reset the w_group_count
**Note:** This is needed to correct an issue caused by chunking the data in an earlier step.

In [8]:
# Two-column view of the full (unfiltered) data: workplace geocode and its recorded group count.
df_w_counts = df.reindex(columns=['w_geocode', 'w_group_count'])

In [9]:
# One row per distinct (w_geocode, w_group_count) pair; the first occurrence is kept.
df_w_counts_unique = df_w_counts.loc[~df_w_counts.duplicated()]

In [10]:
# Add up the distinct partial counts per workplace geocode, keeping w_geocode
# as an ordinary column rather than the index.
# NOTE(review): two partial counts with the same value for one geocode would
# already have been collapsed into one row upstream; confirm that cannot occur.
df_w_count_group = df_w_counts_unique.groupby('w_geocode', as_index=False).sum()
df_w_count_group.head()

Unnamed: 0,w_geocode,w_group_count
0,10010205001001,1076
1,10030112023027,1069
2,10030115021041,1183
3,10059505002038,1061
4,10150007002053,1082


#### Merge the reset counts with the dataframe

In [11]:
# Attach the recomputed per-workplace totals to the distance-filtered rows.
# The stale w_group_count is dropped *before* merging, so the merge produces no
# _x/_y suffixed duplicates and no pattern-based column clean-up is needed
# (the previous regex '_x' would have removed any column containing "_x").
# errors='ignore' keeps the cell working if the stale column is already absent.
merge_result = pd.merge(
    df_filter.drop('w_group_count', axis=1, errors='ignore'),
    df_w_count_group,
    on='w_geocode',
    how='left',  # keep every filtered row, even without a matching total
)
merge_result.head()

Unnamed: 0,w_geocode,h_geocode,distance,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon,state,w_group_count
0,20200007023012,20200001011055,31749,2,1,0,1,0,2,0,0,1,1,61.212,-149.742,61.436,-149.375,2,1548
1,20200007023012,20200001012050,28342,1,0,0,1,1,0,0,0,1,0,61.212,-149.742,61.411,-149.414,2,1548
2,20200007023012,20200001012064,27888,1,0,1,0,0,1,0,0,0,1,61.212,-149.742,61.413,-149.431,2,1548
3,20200007023012,20200001013004,28501,4,1,1,2,0,3,1,0,4,0,61.212,-149.742,61.426,-149.452,2,1548
4,20200007023012,20200001013006,28019,1,0,0,1,1,0,0,0,1,0,61.212,-149.742,61.417,-149.44,2,1548


#### With the updated counts, keep only blocks with fewer than 15,000 employees

In [12]:
# Keep only rows whose recomputed workplace total is strictly below 15000.
df_filter = merge_result.loc[merge_result['w_group_count'].lt(15000)]

#### Write the updated file to CSV (uncompressed)

In [13]:
# Save the combined, filtered table next to the input data, without the row index.
output = 'od_distance_1k-15k_15-60_miles.csv'
out_file_path = ODpath / output
df_filter.to_csv(out_file_path, index=False)
print('Number of records written:', df_filter.shape[0])

Number of records written: 8383570


#### Function to write each state file to CSV

In [14]:
def write_state(df_state, out_path=None):
    """Write one state's rows to a CSV file without the row index.

    Args:
        df_state: DataFrame holding the rows to write.
        out_path: Destination file. When omitted, the module-level
            ``out_state`` variable is used, which is how existing callers
            pass the destination.

    Raises:
        NameError: If ``out_path`` is omitted and no global ``out_state``
            has been defined.
    """
    if out_path is None:
        # Backward-compatible fallback for callers that set the global first.
        out_path = out_state
    df_state.to_csv(out_path, index=False)

#### Loop through each State FIPS code and call function to write csv

In [15]:
# Write one CSV per state code collected in state_list. A state with no rows
# left in df_filter still gets a header-only file.
for state_id in state_list:
    df_state = df_filter[df_filter['state'] == state_id]
    outputstate = state_id + '_od_distance_1k-15k_15-60_miles.csv'
    # write_state() reads its destination from the global `out_state`,
    # so it must be assigned before each call.
    out_state = ODpath.joinpath(outputstate)
    write_state(df_state)

### END