## Create the Statistics Files

In [1]:
import pandas as pd
from pathlib import Path
import zipfile

In [2]:
# Point at the cleaned origin-destination extract and report whether it is on disk.
ODpath = Path("../data/OD/")
OD_file = ODpath / "od_distance_1k_clean.csv.gz"
print ("OD file exist" if OD_file.exists() else "OD file does not exist")
    

OD file exist


In [3]:
# Load the gzipped OD table (timed). Both geocode columns are read as str; later cells
# take the two-character state prefix from w_geocode with a string slice.
%time df = pd.read_csv(OD_file, compression='gzip', dtype={'w_geocode': str,'h_geocode':str})

Wall time: 2min 19s


#### Identify all state FIPS codes in data

In [4]:
# Distinct two-character prefixes of the work geocode, and how many there are.
state_list = df['w_geocode'].str[:2].unique().tolist()
print (len(state_list))

48


#### Create a column to contain the state FIPS

In [5]:
# Store the first two characters of w_geocode as the row's state code.
df['state'] = df['w_geocode'].str[:2]

#### Group the data to determine max and min values
The summed `S000` per `w_geocode` should fall between roughly 1k and 70k.

In [6]:
# Sum every numeric column per work block (w_geocode becomes the index).
# numeric_only=True is stated explicitly: on pandas 2.0+ sum() no longer skips
# non-numeric columns by default, so the string columns h_geocode and state would
# otherwise be included in the aggregation.
df_group = df.groupby(['w_geocode']).sum(numeric_only=True)
# Work blocks whose summed S000 is at least 1,000.
df_group_limited = df_group[df_group['S000'] >= 1000]

In [7]:
# Largest summed S000 among the work blocks at or above the 1,000 threshold.
df_group_limited['S000'].max()

71280

In [8]:
# Smallest summed S000 among the work blocks at or above the 1,000 threshold.
df_group_limited['S000'].min()

1000

In [9]:
# Preview the first five rows of the row-level table, including the state column.
df.head(5)

Unnamed: 0,w_geocode,h_geocode,distance,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon,state
0,10010205001001,10010201001016,5896,1076,1,1,0,0,1,0,0,0,1,0,32.457,-86.415,32.467,-86.477,1
1,10010205001001,10010201001025,6964,1076,1,1,0,0,0,1,0,0,1,0,32.457,-86.415,32.46,-86.489,1
2,10010205001001,10010201002016,7088,1076,2,0,2,0,0,0,2,0,2,0,32.457,-86.415,32.474,-86.488,1
3,10010205001001,10010201002017,6830,1076,3,3,0,0,1,2,0,0,1,2,32.457,-86.415,32.477,-86.484,1
4,10010205001001,10010201002022,7304,1076,1,0,1,0,0,1,0,0,1,0,32.457,-86.415,32.488,-86.483,1


In [10]:
# Preview the first five per-block sums.
df_group.head(5)

Unnamed: 0_level_0,distance,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon
w_geocode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10010205001001,64350037,998528,1076,522,420,134,520,405,151,13,530,533,30120.096,-80193.12,30228.317,-80207.445
10030112023027,24903670,813509,1069,170,643,256,92,403,574,0,0,1069,23224.959,-66882.768,23367.245,-66836.761
10030115021041,98730069,1081262,1183,520,482,181,657,374,152,0,751,432,27760.922,-80144.09,28510.338,-79933.25
10059505002038,21818131,597343,1061,289,610,162,74,699,288,1061,0,0,17903.963,-48042.479,17934.72,-48085.621
10150007002053,19549400,816910,1082,118,720,244,188,342,552,0,0,1082,25441.235,-64797.12,25457.605,-64856.141


In [11]:
# Columns present after the per-block aggregation.
df_group.columns

Index(['distance', 'w_group_count', 'S000', 'SA01', 'SA02', 'SA03', 'SE01',
       'SE02', 'SE03', 'SI01', 'SI02', 'SI03', 'w_lat', 'w_lon', 'h_lat',
       'h_lon'],
      dtype='object')

In [12]:
# Keep only the job-count columns. Selecting the columns to keep (rather than dropping
# a fixed list in place) means the cell can be re-run without a KeyError, and any
# other column left in the aggregated frame is discarded as well.
df_group = df_group[['S000', 'SA01', 'SA02', 'SA03', 'SE01', 'SE02', 'SE03',
                     'SI01', 'SI02', 'SI03']]

In [13]:
# Preview the per-block frame now that only the count columns remain.
df_group.head(5)

Unnamed: 0_level_0,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03
w_geocode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10010205001001,1076,522,420,134,520,405,151,13,530,533
10030112023027,1069,170,643,256,92,403,574,0,0,1069
10030115021041,1183,520,482,181,657,374,152,0,751,432
10059505002038,1061,289,610,162,74,699,288,1061,0,0
10150007002053,1082,118,720,244,188,342,552,0,0,1082


In [14]:
# Largest summed S000 over all work blocks (no threshold applied).
df_group['S000'].max()

71280

In [15]:
# Smallest summed S000 over all work blocks (no threshold applied).
df_group['S000'].min()

999

In [16]:
# Number of work blocks (one row per w_geocode).
df_group.shape[0]

16070

In [17]:
# Write the per-work-block totals. df_group is indexed by w_geocode, so the index has
# to be written: to_csv treats index=None as falsy and would drop the only column that
# identifies which block each row belongs to. The state-level files written further
# down keep their index in the same way.
ODpath = Path("../data/OD/")
OD_file_out = ODpath.joinpath("1k_plus_od_stats.csv")
df_group.to_csv(OD_file_out, index=True)

#### Load the centroid file into a dataframe for merging lat/lon

In [18]:
# Point at the block centroid table and report whether it is on disk.
blockPath = Path("../data/blocks/")
block_file = blockPath / "block_centroids.csv.gz"
print ("Block Centroid file exist" if block_file.exists() else "Block Centroid file does not exist")

Block Centroid file exist


In [19]:
# Read the centroid table: block_geoid stays an object (text) column, lat/lon are floats.
centroids = pd.read_csv(
    block_file,
    dtype={'block_geoid': 'object', 'lat': 'float', 'lon': 'float'},
)

#### Merge with df_group to get the lat/lon assigned to each w_geocode

In [20]:
# Left-join the centroid coordinates onto the per-block totals: the w_geocode index
# level of df_group is matched against centroids.block_geoid.
merge_4latlng = df_group.merge(
    centroids,
    how='left',
    left_on='w_geocode',
    right_on='block_geoid',
)
merge_4latlng.head()

Unnamed: 0,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,block_geoid,lat,lon
0,1076,522,420,134,520,405,151,13,530,533,10010205001001,32.45674,-86.415025
1,1069,170,643,256,92,403,574,0,0,1069,10030112023027,30.518815,-87.88825
2,1183,520,482,181,657,374,152,0,751,432,10030115021041,30.372959,-87.68456
3,1061,289,610,162,74,699,288,1061,0,0,10059505002038,31.801006,-85.332896
4,1082,118,720,244,188,342,552,0,0,1082,10150007002053,33.696812,-85.824191


In [21]:
# Columns of the merged frame: the count columns plus block_geoid, lat and lon.
merge_4latlng.columns

Index(['S000', 'SA01', 'SA02', 'SA03', 'SE01', 'SE02', 'SE03', 'SI01', 'SI02',
       'SI03', 'block_geoid', 'lat', 'lon'],
      dtype='object')

In [22]:
# Row count after the merge, for comparison with the number of rows in df_group.
merge_4latlng.shape[0]

16070

#### Are there any nulls?

In [23]:
# Collect the merged rows that have a missing value in any column, then report the count.
df_null = merge_4latlng.loc[merge_4latlng.isna().any(axis=1)]
print ('\nthe number of null records:', f"{len(df_null):,}", '\n\n')
df_null.head()


the number of null records: 0 




Unnamed: 0,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,block_geoid,lat,lon


#### Remove rows with any nulls

In [24]:
# Keep only the rows in which every column is populated ('any' is dropna's default).
df_stats_clean = merge_4latlng.dropna()
# Recompute the rows holding a missing value from the same merged frame.
df_null = merge_4latlng.loc[merge_4latlng.isna().any(axis=1)]
df_stats_clean.head(5)

Unnamed: 0,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,block_geoid,lat,lon
0,1076,522,420,134,520,405,151,13,530,533,10010205001001,32.45674,-86.415025
1,1069,170,643,256,92,403,574,0,0,1069,10030112023027,30.518815,-87.88825
2,1183,520,482,181,657,374,152,0,751,432,10030115021041,30.372959,-87.68456
3,1061,289,610,162,74,699,288,1061,0,0,10059505002038,31.801006,-85.332896
4,1082,118,720,244,188,342,552,0,0,1082,10150007002053,33.696812,-85.824191


In [25]:
# Number of rows left after removing rows with missing values.
df_stats_clean.shape[0]

16070

#### Write the block level stats

In [26]:
# Save the block-level table without its positional index; block_geoid is a regular
# column here, so the row identifier is still written.
OD_file_out = ODpath / "1k_plus_od_block_stats.csv"
df_stats_clean.to_csv(OD_file_out, index=False)

#### Sum the records for each state

In [27]:
# Sum every numeric column per state code (state becomes the index).
# numeric_only=True is stated explicitly: on pandas 2.0+ sum() no longer skips
# non-numeric columns by default, so the string geocode columns would otherwise be
# included in the aggregation.
df_state_group = df.groupby(['state']).sum(numeric_only=True)

In [28]:
# Number of distinct state codes in the aggregated frame.
df_state_group.shape[0]

48

In [29]:
# Columns present after the per-state aggregation.
df_state_group.columns

Index(['distance', 'w_group_count', 'S000', 'SA01', 'SA02', 'SA03', 'SE01',
       'SE02', 'SE03', 'SI01', 'SI02', 'SI03', 'w_lat', 'w_lon', 'h_lat',
       'h_lon'],
      dtype='object')

In [30]:
# Keep only the job-count columns. Selecting the columns to keep (rather than dropping
# a fixed list in place) means the cell can be re-run without a KeyError, and any
# other column left in the aggregated frame is discarded as well.
df_state_group = df_state_group[['S000', 'SA01', 'SA02', 'SA03', 'SE01', 'SE02',
                                 'SE03', 'SI01', 'SI02', 'SI03']]
df_state_group.head()

Unnamed: 0_level_0,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,305768,59392,181488,64888,34799,100547,170422,64086,29681,212001
4,930733,181041,542204,207488,133390,308694,488649,92906,140015,697812
5,209617,38695,123764,47158,25289,80695,103633,35331,21535,152751
6,4821130,874375,2898970,1047785,558764,1170881,3091485,648915,680890,3491325
8,679824,114957,409071,155796,80842,184581,414401,58966,69583,551275


#### Write the state level stats

In [31]:
# Save the per-state totals; to_csv writes the state index by default.
OD_file_out = ODpath / "1k_plus_od_state_stats.csv"
df_state_group.to_csv(OD_file_out)

#### Count the number of records for each state

In [32]:
# Add a constant column of ones to the row-level table; summing it per state in a
# later cell gives the number of rows for each state.
df['count'] = 1

In [33]:
# Preview the first five rows with the new count column.
df.head(5)

Unnamed: 0,w_geocode,h_geocode,distance,w_group_count,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon,state,count
0,10010205001001,10010201001016,5896,1076,1,1,0,0,1,0,0,0,1,0,32.457,-86.415,32.467,-86.477,1,1
1,10010205001001,10010201001025,6964,1076,1,1,0,0,0,1,0,0,1,0,32.457,-86.415,32.46,-86.489,1,1
2,10010205001001,10010201002016,7088,1076,2,0,2,0,0,0,2,0,2,0,32.457,-86.415,32.474,-86.488,1,1
3,10010205001001,10010201002017,6830,1076,3,3,0,0,1,2,0,0,1,2,32.457,-86.415,32.477,-86.484,1,1
4,10010205001001,10010201002022,7304,1076,1,0,1,0,0,1,0,0,1,0,32.457,-86.415,32.488,-86.483,1,1


In [34]:
# Columns of the row-level table, including the added state and count columns.
df.columns

Index(['w_geocode', 'h_geocode', 'distance', 'w_group_count', 'S000', 'SA01',
       'SA02', 'SA03', 'SE01', 'SE02', 'SE03', 'SI01', 'SI02', 'SI03', 'w_lat',
       'w_lon', 'h_lat', 'h_lon', 'state', 'count'],
      dtype='object')

In [35]:
# Narrow the table to the two columns needed for counting rows per state.
df_state_count = df.loc[:, ['state', 'count']]

In [36]:
# Total number of rows going into the per-state count.
df_state_count.shape[0]

28588823

In [37]:
# Add up the ones in the count column to get the number of rows per state.
df_state_counts = df_state_count.groupby('state').sum()

In [38]:
# Show the row count for each state code.
df_state_counts

Unnamed: 0_level_0,count
state,Unnamed: 1_level_1
1,229357
4,773416
5,139430
6,4110184
8,552157
9,271396
10,69201
11,265545
12,1865715
13,866588


#### Write the state counts

In [40]:
# Save the per-state row counts; to_csv writes the state index by default.
OD_file_out = ODpath / "1k_plus_od_state_counts.csv"
df_state_counts.to_csv(OD_file_out)