## Create the Statistics Files

In [1]:
import pandas as pd
from pathlib import Path
import zipfile

In [2]:
# Build the path to the OD distance file and report whether it is on disk.
ODpath = Path("../data/OD/")
OD_file = ODpath / "od_distance_1k-15k_15-60_miles.csv"
# Only a message is printed; a missing file does not raise here.
print("OD file exist" if OD_file.exists() else "OD file does not exist")
    

OD file exist


In [3]:
# Read the OD table into df. Both geocode columns are forced to str so pandas
# does not parse them as numbers; %time reports how long the read takes.
%time df = pd.read_csv(OD_file, dtype={'w_geocode': str,'h_geocode':str})

Wall time: 44.5 s


In [4]:
# Two-column view of df: the work geocode and its group count.
df_w_counts = pd.DataFrame(
    data=df,
    columns=['w_geocode', 'w_group_count'],
)

In [5]:
# Collapse to one row per distinct (w_geocode, w_group_count) pair; the first
# occurrence is the one kept, so it retains its original row label.
df_w_counts_unique = df_w_counts.drop_duplicates(keep='first')
df_w_counts_unique.head(5)

Unnamed: 0,w_geocode,w_group_count
0,20200007023012,1548
150,20200011001027,2829
486,20200011001035,1852
742,20200011001037,1547
927,20200015003001,1447


#### Classify distance ranges

In [6]:
def assign_distance(df, short_below=32000, long_above=46000):
    """Label a single record's commute as short, medium or long.

    Parameters
    ----------
    df : object exposing a ``distance`` attribute
        Despite the name this is ONE record (for example the row that
        ``DataFrame.apply(..., axis=1)`` passes in), not a whole DataFrame.
    short_below : number, default 32000
        Distances strictly below this value are labelled ``'short_commute'``.
    long_above : number, default 46000
        Distances strictly above this value are labelled ``'long_commute'``.

    Returns
    -------
    str
        ``'short_commute'``, ``'long_commute'`` or ``'medium_commute'``.

    Notes
    -----
    Both cut-off values themselves are labelled medium. A NaN distance fails
    both comparisons and therefore also falls through to ``'medium_commute'``.
    The distance unit is not stated in this notebook (presumably metres -
    TODO confirm against the producer of the OD file).
    """
    distance = df.distance
    if distance < short_below:
        return 'short_commute'
    if distance > long_above:
        return 'long_commute'
    return 'medium_commute'

In [7]:
# Vectorised labelling: three column-wise assignments instead of one Python-level
# call per row through df.apply(axis=1), which is very slow on a frame this size.
# The cut-offs mirror assign_distance(): < 32000 is short, > 46000 is long, and
# everything else (both cut-offs themselves, and missing distances) is medium.
df['commute_length'] = 'medium_commute'
df.loc[df['distance'] < 32000, 'commute_length'] = 'short_commute'
df.loc[df['distance'] > 46000, 'commute_length'] = 'long_commute'


In [8]:
# Strip the coordinate, distance, home-geocode and state columns; what remains
# is the work geocode, the count columns and the commute label.
df_geocode = df.drop(
    columns=['h_lat', 'h_lon', 'w_lat', 'w_lon', 'distance', 'h_geocode', 'state']
)
df_geocode.head()

Unnamed: 0,w_geocode,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_group_count,commute_length
0,20200007023012,2,1,0,1,0,2,0,0,1,1,1548,short_commute
1,20200007023012,1,0,0,1,1,0,0,0,1,0,1548,short_commute
2,20200007023012,1,0,1,0,0,1,0,0,0,1,1548,short_commute
3,20200007023012,4,1,1,2,0,3,1,0,4,0,1548,short_commute
4,20200007023012,1,0,0,1,1,0,0,0,1,0,1548,short_commute


In [9]:
# Persist the per-record table (work geocode, counts, commute label) as CSV.
ODpath = Path("../data/OD/")
OD_file_out = ODpath.joinpath("commute_length_with_1k-15k_od_stats_15-60_miles.csv")
# index=False is the documented way to leave the row labels out of the file;
# the former index=None only worked because None happens to be falsy.
df_geocode.to_csv(OD_file_out, index=False)

#### Group the data by work geocode, commute length, and group count

In [10]:
# Total every remaining column per (w_geocode, commute_length, w_group_count)
# and turn the three grouping keys back into ordinary columns.
df_group_w_geocode = (
    df_geocode
    .groupby(['w_geocode', 'commute_length', 'w_group_count'])
    .sum()
    .reset_index()
)
df_group_w_geocode.head()

Unnamed: 0,w_geocode,commute_length,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03
0,10010205001001,long_commute,1076,124,62,45,17,61,51,12,0,71,53
1,10010205001001,medium_commute,1076,45,21,20,4,22,13,10,0,19,26
2,10010205001001,short_commute,1076,72,35,30,7,33,25,14,0,28,44
3,10030112023027,long_commute,1069,49,10,35,4,3,20,26,0,0,49
4,10030112023027,medium_commute,1069,126,21,79,26,13,44,69,0,0,126


In [11]:
# Persist the grouped per-geocode totals as CSV.
ODpath = Path("../data/OD/")
OD_file_out = ODpath.joinpath("geocode_1k-15k_od_stats_15-60_miles.csv")
# index=False is the documented way to leave the row labels out of the file;
# the former index=None only worked because None happens to be falsy.
df_group_w_geocode.to_csv(OD_file_out, index=False)

#### Load the block centroid file into a DataFrame so lat/lon can be merged in

In [12]:
# Build the path to the gzipped block-centroid CSV and report whether it is on disk.
blockPath = Path("../data/blocks/")
block_file = blockPath / "block_centroids.csv.gz"
# Only a message is printed; a missing file does not raise here.
print("Block Centroid file exist" if block_file.exists() else "Block Centroid file does not exist")

Block Centroid file exist


In [13]:
# Read the centroid table. block_geoid is kept as a generic object column
# (i.e. not parsed as a number), while lat and lon are read as floats.
centroids = pd.read_csv(
    block_file,
    dtype={
        'block_geoid': 'object',
        'lat': 'float',
        'lon': 'float',
    },
)

#### Merge with the centroids to attach lat/lon to each w_geocode

In [14]:
# Left join: every grouped row is kept, and rows whose w_geocode has no
# matching block_geoid in the centroid table end up with missing lat/lon.
merge_4latlng = pd.merge(
    left=df_group_w_geocode,
    right=centroids,
    how='left',
    left_on=['w_geocode'],
    right_on=['block_geoid'],
)
merge_4latlng.head()

Unnamed: 0,w_geocode,commute_length,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,block_geoid,lat,lon
0,10010205001001,long_commute,1076,124,62,45,17,61,51,12,0,71,53,10010205001001,32.45674,-86.415025
1,10010205001001,medium_commute,1076,45,21,20,4,22,13,10,0,19,26,10010205001001,32.45674,-86.415025
2,10010205001001,short_commute,1076,72,35,30,7,33,25,14,0,28,44,10010205001001,32.45674,-86.415025
3,10030112023027,long_commute,1069,49,10,35,4,3,20,26,0,0,49,10030112023027,30.518815,-87.88825
4,10030112023027,medium_commute,1069,126,21,79,26,13,44,69,0,0,126,10030112023027,30.518815,-87.88825


In [15]:
# block_geoid was only the join key on the centroid side; remove it again.
df_merged = merge_4latlng.drop(columns=['block_geoid'])
df_merged.head()

Unnamed: 0,w_geocode,commute_length,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,lat,lon
0,10010205001001,long_commute,1076,124,62,45,17,61,51,12,0,71,53,32.45674,-86.415025
1,10010205001001,medium_commute,1076,45,21,20,4,22,13,10,0,19,26,32.45674,-86.415025
2,10010205001001,short_commute,1076,72,35,30,7,33,25,14,0,28,44,32.45674,-86.415025
3,10030112023027,long_commute,1069,49,10,35,4,3,20,26,0,0,49,30.518815,-87.88825
4,10030112023027,medium_commute,1069,126,21,79,26,13,44,69,0,0,126,30.518815,-87.88825


#### Are there any nulls?

In [16]:
# Collect every row that has a missing value in at least one column, then
# report how many there are (thousands-separated) and preview them.
df_null = df_merged.loc[df_merged.isnull().any(axis='columns')]
print('\nthe number of null records:', format(len(df_null), ','), '\n\n')
df_null.head()


the number of null records: 299 




Unnamed: 0,w_geocode,commute_length,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,lat,lon
486,20200007023012,long_commute,1548,88,37,41,10,21,54,13,0,72,16,,
487,20200007023012,medium_commute,1548,76,35,30,11,22,39,15,1,55,20,,
488,20200007023012,short_commute,1548,23,8,8,7,8,12,3,0,17,6,,
489,20200011001027,long_commute,2829,301,30,206,65,9,29,263,7,0,294,,
490,20200011001027,medium_commute,2829,115,8,74,33,5,9,101,5,0,110,,


#### Remove rows with any nulls

In [17]:
# Keep only the rows in which every column is populated. The rows with nulls
# are already held in df_null from the null-check cell, so the duplicate
# full-frame scan that used to recompute df_null here has been removed.
df_stats_clean = df_merged.dropna(how='any')
df_stats_clean.head()

Unnamed: 0,w_geocode,commute_length,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,lat,lon
0,10010205001001,long_commute,1076,124,62,45,17,61,51,12,0,71,53,32.45674,-86.415025
1,10010205001001,medium_commute,1076,45,21,20,4,22,13,10,0,19,26,32.45674,-86.415025
2,10010205001001,short_commute,1076,72,35,30,7,33,25,14,0,28,44,32.45674,-86.415025
3,10030112023027,long_commute,1069,49,10,35,4,3,20,26,0,0,49,30.518815,-87.88825
4,10030112023027,medium_commute,1069,126,21,79,26,13,44,69,0,0,126,30.518815,-87.88825


In [18]:
# Row count of df_stats_clean, i.e. what is left after the null rows were dropped.
len(df_stats_clean)

47922

#### Write the block level stats

In [19]:
# Persist the null-free block-level table as CSV (ODpath comes from an earlier cell).
OD_file_out = ODpath.joinpath("1k-15k_od_block_stats_15-60_miles.csv")
# index=False is the documented way to leave the row labels out of the file;
# the former index=None only worked because None happens to be falsy.
df_stats_clean.to_csv(OD_file_out, index=False)

#### Sum the records for each state

In [20]:
# Per-state totals of the numeric columns. df still carries text columns at
# this point (w_geocode, h_geocode, commute_length); numeric_only=True states
# explicitly that they are left out. Without it, older pandas dropped them
# silently, while current releases try to sum the strings as well.
df_state_group = df.groupby(['state']).sum(numeric_only=True)

In [21]:
# Number of rows in the per-state totals, i.e. how many distinct state values were grouped.
len(df_state_group)

50

In [22]:
# Inspect which columns the per-state sum produced.
df_state_group.columns

Index(['distance', 'S000', 'SA01', 'SA02', 'SA03', 'SE01', 'SE02', 'SE03',
       'SI01', 'SI02', 'SI03', 'w_lat', 'w_lon', 'h_lat', 'h_lon',
       'w_group_count'],
      dtype='object')

In [23]:
# Remove the summed distance, group-count and coordinate columns from the
# per-state totals. Rebinding (instead of inplace=True) together with
# errors='ignore' lets this cell be re-run without a KeyError once the columns
# are already gone.
df_state_group = df_state_group.drop(
    columns=['distance', 'w_group_count', 'w_lat', 'w_lon', 'h_lat', 'h_lon'],
    errors='ignore',
)
df_state_group.head()

Unnamed: 0_level_0,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,89880,17272,55109,17499,8909,26800,54171,27751,9490,52639
2,10475,1391,6523,2561,820,1866,7789,2259,1230,6986
4,273039,48172,166761,58106,33592,82583,156864,30008,47834,195197
5,45503,8380,27215,9908,4970,16985,23548,9160,4586,31757
6,1398046,253404,846624,298018,155886,322679,919481,186836,227706,983504


#### Write the state level stats

In [24]:
# File name aligned with the input file and every other output of this
# notebook, which all say "15-60_miles"; the earlier "18-60_miles" looks like a typo.
# NOTE(review): anything that still reads the old file name must be updated.
OD_file_out = ODpath.joinpath("1k-15k_od_state_stats_15-60_miles.csv")
# The index (state) is written on purpose, so no index argument is passed.
df_state_group.to_csv(OD_file_out)

#### Count the number of records for each state

In [25]:
# Constant helper column: summing it per state in a later cell yields the
# number of records. Note that this adds the column to df itself.
df['count'] = 1

In [26]:
# Preview the first rows to confirm the new 'count' column is present.
df.head()

Unnamed: 0,w_geocode,h_geocode,distance,S000,SA01,SA02,SA03,SE01,SE02,SE03,...,SI02,SI03,w_lat,w_lon,h_lat,h_lon,state,w_group_count,commute_length,count
0,20200007023012,20200001011055,31749,2,1,0,1,0,2,0,...,1,1,61.212,-149.742,61.436,-149.375,2,1548,short_commute,1
1,20200007023012,20200001012050,28342,1,0,0,1,1,0,0,...,1,0,61.212,-149.742,61.411,-149.414,2,1548,short_commute,1
2,20200007023012,20200001012064,27888,1,0,1,0,0,1,0,...,0,1,61.212,-149.742,61.413,-149.431,2,1548,short_commute,1
3,20200007023012,20200001013004,28501,4,1,1,2,0,3,1,...,4,0,61.212,-149.742,61.426,-149.452,2,1548,short_commute,1
4,20200007023012,20200001013006,28019,1,0,0,1,1,0,0,...,1,0,61.212,-149.742,61.417,-149.44,2,1548,short_commute,1


In [27]:
# List the column labels currently on df.
df.columns

Index(['w_geocode', 'h_geocode', 'distance', 'S000', 'SA01', 'SA02', 'SA03',
       'SE01', 'SE02', 'SE03', 'SI01', 'SI02', 'SI03', 'w_lat', 'w_lon',
       'h_lat', 'h_lon', 'state', 'w_group_count', 'commute_length', 'count'],
      dtype='object')

In [28]:
# Narrow df to the two columns needed for counting records per state.
df_state_count = df.loc[:, ['state', 'count']]

In [29]:
# Row count of the two-column state/count slice.
len(df_state_count)

8383570

In [30]:
# Add up the constant 'count' column per state, giving one row per state
# (the state becomes the index).
df_state_counts = (
    df_state_count
    .groupby(by=['state'])[['count']]
    .sum()
)

In [31]:
# Display the per-state record counts (rows are indexed by state).
df_state_counts

Unnamed: 0_level_0,count
state,Unnamed: 1_level_1
1,76861
2,6903
4,245061
5,39191
6,1319032
8,144477
9,96917
10,20091
11,83493
12,486561


#### Write the state counts

In [32]:
# Persist the per-state record counts; the index (state) is written too,
# because no index argument is passed. ODpath comes from an earlier cell.
OD_file_out = ODpath / "1k-15k_od_state_counts_15-60_miles.csv"
df_state_counts.to_csv(path_or_buf=OD_file_out)