## Create the Statistics Files

In [1]:
import pandas as pd
from pathlib import Path
import zipfile

In [4]:
# Build the path to the origin-destination (OD) distance CSV and report
# whether it is present on disk before any attempt to load it.
ODpath = Path("../data/OD/")
OD_file = ODpath / "od_distance_1k-15k_18-60_miles.csv"
print("OD file exist" if OD_file.exists() else "OD file does not exist")
    

OD file exist


In [6]:
# Load the OD table and time the read; both geocode columns are read as strings.
%time df = pd.read_csv(OD_file, dtype={'w_geocode': str,'h_geocode':str})

Wall time: 34.4 s


#### Group the data by the `w_geocode` column and sum

In [7]:
# Sum every numeric column per w_geocode.
# numeric_only=True keeps the string h_geocode column out of the aggregation:
# on pandas >= 2.0 the default would "sum" it by concatenating the strings
# of every group, which is slow and leaves a junk column behind.
# NOTE(review): 'state' is numeric, so it is summed too; the grouped value is
# no longer a state code — confirm whether downstream consumers rely on it.
df_group = df.groupby(['w_geocode']).sum(numeric_only=True)

In [8]:
# Preview the first rows of the per-w_geocode sums.
df_group.head()

Unnamed: 0_level_0,distance,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon,state
w_geocode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
10010205001001,10829720,190452,182,91,68,23,91,68,23,0,94,88,5744.889,-15295.455,5756.955,-15271.453,177
10030112023027,8222942,206317,219,42,139,38,18,87,114,0,0,219,5890.167,-16962.384,5931.519,-16967.225,193
10030115021041,12166034,249613,221,98,93,30,121,71,29,0,138,83,6408.703,-18501.535,6482.335,-18579.621,211
10059505002038,10209278,211139,242,71,135,36,22,151,69,242,0,0,6328.399,-16981.267,6325.345,-17010.63,199
10150007002053,5740896,140660,137,23,87,27,20,30,87,0,0,137,4380.61,-11157.12,4391.85,-11173.799,130


In [9]:
# List the columns that survived the aggregation.
df_group.columns

Index(['distance', 'w_group_count', 'S000', 'SA01', 'SA02', 'SA03', 'SE01',
       'SE02', 'SE03', 'SI01', 'SI02', 'SI03', 'w_lat', 'w_lon', 'h_lat',
       'h_lon', 'state'],
      dtype='object')

In [10]:
# Discard the summed distance, group-count and coordinate columns; only the
# S* columns and 'state' are kept for the stats output.
# Rebinding (instead of inplace=True) together with errors='ignore' lets this
# cell be re-run without a KeyError once the columns are already gone.
df_group = df_group.drop(
    columns=['distance', 'w_group_count', 'w_lat', 'w_lon', 'h_lat', 'h_lon'],
    errors='ignore',
)

In [11]:
# Preview the grouped frame after the unused columns were dropped.
df_group.head()

Unnamed: 0_level_0,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,state
w_geocode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10010205001001,182,91,68,23,91,68,23,0,94,88,177
10030112023027,219,42,139,38,18,87,114,0,0,219,193
10030115021041,221,98,93,30,121,71,29,0,138,83,211
10059505002038,242,71,135,36,22,151,69,242,0,0,199
10150007002053,137,23,87,27,20,30,87,0,0,137,130


In [12]:
# Write the per-w_geocode sums to CSV.
# df_group is indexed by w_geocode, so the index has to be written: with the
# index suppressed, the rows in the file carry no block identifier at all.
# NOTE(review): the "1k-10k" prefix differs from the "1k-15k" used by the input
# file and the other outputs — confirm what downstream readers expect before renaming.
ODpath = Path("../data/OD/")
OD_file_out = ODpath.joinpath("1k-10k_od_stats_18-60_miles.csv")
df_group.to_csv(OD_file_out, index=True)

#### Load the block centroid file into a DataFrame for merging lat/lon

In [13]:
# Build the path to the compressed block-centroid CSV and report whether
# it is present on disk.
blockPath = Path("../data/blocks/")
block_file = blockPath / "block_centroids.csv.gz"
print("Block Centroid file exist" if block_file.exists() else "Block Centroid file does not exist")

Block Centroid file exist


In [14]:
# Read the block centroid table: block ids stay as strings, coordinates as floats.
centroid_dtypes = {'block_geoid': 'object', 'lat': 'float', 'lon': 'float'}
centroids = pd.read_csv(block_file, dtype=centroid_dtypes)

#### Merge with `df_group` to assign a lat/lon to each `w_geocode`

In [15]:
# Left-join the centroid coordinates onto the grouped stats, matching the
# w_geocode index level of df_group against the block_geoid column of centroids.
merge_4latlng = df_group.merge(
    centroids,
    how='left',
    left_on=['w_geocode'],
    right_on=['block_geoid'],
)
merge_4latlng.head()

Unnamed: 0,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,state,block_geoid,lat,lon
0,182,91,68,23,91,68,23,0,94,88,177,10010205001001,32.45674,-86.415025
1,219,42,139,38,18,87,114,0,0,219,193,10030112023027,30.518815,-87.88825
2,221,98,93,30,121,71,29,0,138,83,211,10030115021041,30.372959,-87.68456
3,242,71,135,36,22,151,69,242,0,0,199,10059505002038,31.801006,-85.332896
4,137,23,87,27,20,30,87,0,0,137,130,10150007002053,33.696812,-85.824191


In [16]:
# List the columns of the merged frame (stats plus block_geoid, lat, lon).
merge_4latlng.columns

Index(['S000', 'SA01', 'SA02', 'SA03', 'SE01', 'SE02', 'SE03', 'SI01', 'SI02',
       'SI03', 'state', 'block_geoid', 'lat', 'lon'],
      dtype='object')

In [17]:
# Row count of the merged frame.
len(merge_4latlng)

15991

#### Are there any nulls?

In [18]:
# Collect every merged row that has at least one missing value, report how
# many there are, and preview them.
rows_with_null = merge_4latlng.isna().any(axis=1)
df_null = merge_4latlng.loc[rows_with_null]
null_total = "{:,}".format(len(df_null))
print('\nthe number of null records:', null_total, '\n\n')
df_null.head()


the number of null records: 0 




Unnamed: 0,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,state,block_geoid,lat,lon


#### Remove rows with any nulls

In [19]:
# Keep only the fully populated rows; df_null is refreshed with the rows
# that contain at least one missing value.
df_stats_clean = merge_4latlng.dropna(axis=0, how='any')
df_null = merge_4latlng.loc[merge_4latlng.isna().any(axis=1)]
df_stats_clean.head()

Unnamed: 0,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,state,block_geoid,lat,lon
0,182,91,68,23,91,68,23,0,94,88,177,10010205001001,32.45674,-86.415025
1,219,42,139,38,18,87,114,0,0,219,193,10030112023027,30.518815,-87.88825
2,221,98,93,30,121,71,29,0,138,83,211,10030115021041,30.372959,-87.68456
3,242,71,135,36,22,151,69,242,0,0,199,10059505002038,31.801006,-85.332896
4,137,23,87,27,20,30,87,0,0,137,130,10150007002053,33.696812,-85.824191


In [20]:
# Row count after removing rows with nulls.
len(df_stats_clean)

15991

#### Write the block level stats

In [21]:
# Save the cleaned block-level stats without the row index; the block id is
# already present as the block_geoid column.
OD_file_out = ODpath / "1k-15k_od_block_stats_18-60_miles.csv"
df_stats_clean.to_csv(OD_file_out, index=False)

#### Sum the records for each state

In [22]:
# Sum every numeric column per state.
# numeric_only=True keeps the string w_geocode / h_geocode columns out of the
# aggregation: on pandas >= 2.0 the default would concatenate the strings of
# every row in each state, which is slow and leaves junk columns behind.
df_state_group = df.groupby(['state']).sum(numeric_only=True)

In [23]:
# Number of distinct states in the grouped frame.
len(df_state_group)

48

In [24]:
# List the columns that survived the per-state aggregation.
df_state_group.columns

Index(['distance', 'w_group_count', 'S000', 'SA01', 'SA02', 'SA03', 'SE01',
       'SE02', 'SE03', 'SI01', 'SI02', 'SI03', 'w_lat', 'w_lon', 'h_lat',
       'h_lon'],
      dtype='object')

In [25]:
# Discard the summed distance, group-count and coordinate columns so only the
# S* columns remain in the state-level stats.
# Rebinding (instead of inplace=True) together with errors='ignore' lets this
# cell be re-run without a KeyError once the columns are already gone.
df_state_group = df_state_group.drop(
    columns=['distance', 'w_group_count', 'w_lat', 'w_lon', 'h_lat', 'h_lon'],
    errors='ignore',
)
df_state_group.head()

Unnamed: 0_level_0,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,70038,13835,42671,13532,7317,20851,41870,22240,7675,40123
4,193230,33810,118291,41129,24062,58434,110734,22089,33975,137166
5,36380,6795,21745,7840,4057,13586,18737,7623,3409,25348
6,1118198,206464,675004,236730,127783,258149,732266,148751,186992,782455
8,115920,18791,69755,27374,13261,28873,73786,12816,17619,85485


#### Write the state level stats

In [26]:
# Save the state-level sums; the 'state' index is written as the first column.
OD_file_out = ODpath / "1k-15k_od_state_stats_18-60_miles.csv"
df_state_group.to_csv(path_or_buf=OD_file_out)

#### Count the number of records for each state

In [27]:
# Add a constant column of ones so that summing it per group yields a row count.
# Note: this adds the column to df itself.
df['count'] = 1

In [28]:
# Preview the raw rows with the new 'count' column.
df.head()

Unnamed: 0,w_geocode,h_geocode,distance,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon,state,count
0,10010205001001,10010209001002,29897,1076,1,0,1,0,0,1,0,0,1,0,32.457,-86.415,32.694,-86.567,1,1
1,10010205001001,10010209001003,31216,1076,1,1,0,0,1,0,0,0,0,1,32.457,-86.415,32.704,-86.573,1,1
2,10010205001001,10010210001116,46757,1076,1,0,1,0,0,1,0,0,0,1,32.457,-86.415,32.573,-86.893,1,1
3,10010205001001,10010210002037,34627,1076,1,1,0,0,0,1,0,0,0,1,32.457,-86.415,32.668,-86.687,1,1
4,10010205001001,10010210002092,30628,1076,1,1,0,0,1,0,0,0,0,1,32.457,-86.415,32.603,-86.691,1,1


In [29]:
# List all columns of the raw frame, including the added 'count'.
df.columns

Index(['w_geocode', 'h_geocode', 'distance', 'w_group_count', 'S000', 'SA01',
       'SA02', 'SA03', 'SE01', 'SE02', 'SE03', 'SI01', 'SI02', 'SI03', 'w_lat',
       'w_lon', 'h_lat', 'h_lon', 'state', 'count'],
      dtype='object')

In [30]:
# Narrow the frame to the two columns needed for counting rows per state.
df_state_count = df.loc[:, ['state', 'count']]

In [31]:
# Total number of rows that will be counted.
len(df_state_count)

6518356

In [32]:
# Summing the column of ones per state gives the number of rows for each state.
rows_by_state = df_state_count.groupby(by=['state'])
df_state_counts = rows_by_state.sum()

In [33]:
# Display the per-state row counts.
df_state_counts

Unnamed: 0_level_0,count
state,Unnamed: 1_level_1
1,61640
4,174310
5,32169
6,1064791
8,108828
9,75195
10,16374
11,65361
12,365059
13,290964


#### Write the state counts

In [34]:
# Save the per-state row counts; the 'state' index is written as the first column.
OD_file_out = ODpath / "1k-15k_od_state_counts_18-60_miles.csv"
df_state_counts.to_csv(path_or_buf=OD_file_out)