## Select Individual State for Mapping

In [1]:
import pathlib
import pandas as pd
import numpy as np
import geopandas as gp
import shapely.geometry as geom
import folium

#### Set input file location and read into Pandas

In [2]:
# Location of the gzipped origin-destination CSV, relative to this notebook.
input_file = 'OD_distance.csv.gz'
ODpath = pathlib.Path('../data/OD/')
ODfile = ODpath.joinpath(input_file)

# read_csv opens the path itself, so the extra `with ODfile.open(...)` handle
# (which was never read from) is not needed.  Both geocode columns are loaded
# with object dtype so the `.str` accessor can be used on them later.
df_All = pd.read_csv(ODfile, dtype={'w_geocode': object, 'h_geocode': object})

print ('input file: ',ODfile)
print('\nrecords loaded to dataframe:', "{:,}".format(len(df_All)),'\n\n')
df_All.head()

input file:  ../data/OD/OD_distance.csv.gz

records loaded to dataframe: 13,783,350 




Unnamed: 0,w_geocode,h_geocode,distance,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon
0,10010205001001,10010208021004,17916.0,2,1,1,0,1,0,1,0,2,0,32.45674,-86.415025,32.560257,-86.56141
1,10010205001001,10010208021016,21718.0,2,1,1,0,1,1,0,0,0,2,32.45674,-86.415025,32.559373,-86.611869
2,10010205001001,10010208021025,18358.0,1,1,0,0,1,0,0,0,1,0,32.45674,-86.415025,32.533696,-86.58797
3,10010205001001,10010208021026,16658.0,2,0,2,0,0,0,2,0,1,1,32.45674,-86.415025,32.524822,-86.573009
4,10010205001001,10010208021032,15279.0,2,0,2,0,1,0,1,0,2,0,32.45674,-86.415025,32.516518,-86.561483


#### Set the state to analyze

In [3]:
# State FIPS prefix; work-location geocodes are matched against it below.
state_id = "21"

#### Set the output file location

In [4]:
# The GeoJSON output lives in the data directory and is named after the state code.
output_loc = pathlib.Path('../data/')
output_file = output_loc / (state_id + '.geojson')
print ('path for output:',output_file)

path for output: ../data/21.geojson


#### Drop all records where the work location does not begin with the state_id

In [5]:
# Keep only the rows whose work-location geocode starts with the state prefix.
# A boolean mask selects the rows directly instead of dropping the complement
# by index label; na=False counts a missing geocode as "no match" so the mask
# is always strictly boolean.
in_state = df_All['w_geocode'].str.startswith(state_id, na=False)
df = df_All[in_state].reset_index(drop=True)
print('\nnew dataframe length: ', "{:,}".format(len(df)),'\n\n')
df.head()


new dataframe length:  118,821 




Unnamed: 0,w_geocode,h_geocode,distance,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon
0,210099505002000,210019702001066,55447.0,1,0,1,0,0,0,1,0,0,1,37.010798,-85.906837,37.168548,-85.315083
1,210099505002000,210019702001086,47802.0,1,0,1,0,0,0,1,0,0,1,37.010798,-85.906837,37.137989,-85.393249
2,210099505002000,210019703001021,73360.0,1,0,1,0,0,0,1,0,0,1,37.010798,-85.906837,37.170777,-85.106224
3,210099505002000,210019703001030,62186.0,2,1,1,0,0,1,1,0,0,2,37.010798,-85.906837,37.156984,-85.231659
4,210099505002000,210019703002035,63240.0,1,0,1,0,0,1,0,0,0,1,37.010798,-85.906837,37.085174,-85.202002


In [11]:
# Distinct work-location geocodes in the filtered frame, in order of first appearance.
df_unique_list = pd.unique(df['w_geocode']).tolist()
print ('number of unique cenus blocks with over 1000 employees in state FIPS code: ',state_id, 'is:', len(df_unique_list))

number of unique cenus blocks with over 1000 employees in state FIPS code:  21 is: 191


In [None]:
# Show the full list of distinct work-location geocodes.
pd.unique(df['w_geocode']).tolist()

In [8]:
%time df['geom'] = df.apply(lambda x: ([(x['w_lat'], x['w_lon']),(x['h_lat'],x['h_lon'])]), axis = 1)
df.head()

CPU times: user 4.44 s, sys: 255 ms, total: 4.7 s
Wall time: 4.92 s


Unnamed: 0,w_geocode,h_geocode,distance,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon,geom
0,210099505002000,210019702001066,55447.0,1,0,1,0,0,0,1,0,0,1,37.010798,-85.906837,37.168548,-85.315083,"[(37.0107975, -85.90683680000002), (37.1685483..."
1,210099505002000,210019702001086,47802.0,1,0,1,0,0,0,1,0,0,1,37.010798,-85.906837,37.137989,-85.393249,"[(37.0107975, -85.90683680000002), (37.1379890..."
2,210099505002000,210019703001021,73360.0,1,0,1,0,0,0,1,0,0,1,37.010798,-85.906837,37.170777,-85.106224,"[(37.0107975, -85.90683680000002), (37.1707773..."
3,210099505002000,210019703001030,62186.0,2,1,1,0,0,1,1,0,0,2,37.010798,-85.906837,37.156984,-85.231659,"[(37.0107975, -85.90683680000002), (37.1569843..."
4,210099505002000,210019703002035,63240.0,1,0,1,0,0,1,0,0,0,1,37.010798,-85.906837,37.085174,-85.202002,"[(37.0107975, -85.90683680000002), (37.0851743..."


#### Are there any nulls?

In [9]:
# Select the rows that contain at least one missing value.
df_null = df.loc[df.isna().any(axis='columns')]
df_null.head()

Unnamed: 0,w_geocode,h_geocode,distance,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon,geom


#### Remove rows with any nulls

In [10]:
# Discard every row with a missing value, then repeat the null check on the result.
df = df.dropna(axis='index', how='any')
df_null = df.loc[df.isna().any(axis='columns')]
df_null.head()

Unnamed: 0,w_geocode,h_geocode,distance,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon,geom


In [11]:
# Inspect the row index of the frame after the null removal.
df.index

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            118811, 118812, 118813, 118814, 118815, 118816, 118817, 118818,
            118819, 118820],
           dtype='int64', length=118821)

In [12]:
# Collect every row's coordinate pair into a plain list for plotting.
# Converting the column to a list is purely positional, so it stays correct
# when the index has gaps (dropna does not renumber rows).  Indexing `.values`
# with index *labels* would then read the wrong rows or raise IndexError.
lines = df['geom'].tolist()
print (lines[-1:],'\n')

[[(36.9215193, -84.11942640000002), (36.7027263, -83.95279599999998)]] 



In [13]:
# Base map for the state-wide plot.  It uses the same built-in 'cartodbpositron'
# tiles as the later map cell, because 'Stamen Toner' is no longer a built-in
# tile set in current folium releases.
m = folium.Map(location=[37.645556, -84.769722], tiles='cartodbpositron',
               zoom_start=5, control_scale=True, prefer_canvas=True)

In [14]:
# Draw a single sample line as a sanity check before plotting in bulk.
mylocation = lines[1000]
# The stroke colour option is `color` (as the bulk-plot loop uses);
# `line_color` is not a recognised PolyLine option, so the line was not red.
my_PolyLine=folium.PolyLine(mylocation, color='#FF0000',weight=1).add_to(m)
print(mylocation)
# The map has to be the cell's last expression for the notebook to render it.
m

[(38.9739362, -84.6341551), (39.087748100000006, -84.5006649)]


In [15]:
# Overlay the first N_LINES lines in red.  Slicing (instead of indexing with
# range(2000)) means a selection with fewer rows no longer raises IndexError.
N_LINES = 2000
for line in lines[:N_LINES]:
    folium.PolyLine(line, color='#FF0000',weight=1).add_to(m)
m

In [19]:
# Work-location geocodes present in the filtered frame, each listed once.
unique_locations_list = pd.unique(df['w_geocode']).tolist()
print ('there are', len(unique_locations_list), 'unique locations with 1000+ employees')

there are 191 unique locations with 1000+ employees


In [None]:
# Rows of df whose work geocode appears in unique_locations_list.
df_unique = df.loc[df['w_geocode'].isin(unique_locations_list)]

In [13]:
%time df['geometry'] =df.apply(lambda x: geom.LineString([(x['w_lon'], x['w_lat'] ),(x['h_lon'],x['h_lat'])]),axis = 1)


CPU times: user 3.58 s, sys: 45.4 ms, total: 3.62 s
Wall time: 3.65 s


In [14]:
# Wrap the frame in a GeoDataFrame, using the LineString column as its geometry.
gdf_ky = gp.GeoDataFrame(data=df, geometry='geometry')

In [15]:
# Preview the first rows of the GeoDataFrame.
gdf_ky.head()

Unnamed: 0,w_geocode,h_geocode,distance,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon,geom,geometry
0,210290211013002,210019701002051,101676.0,1,0,1,0,0,1,0,0,1,0,37.99491,-85.668096,37.216481,-85.061005,"[(37.99491, -85.66809559999999), (37.2164811, ...","LINESTRING (-85.66809559999999 37.99491, -85.0..."
1,210290211013002,210019702001059,94353.0,1,0,1,0,0,1,0,0,1,0,37.99491,-85.668096,37.1867,-85.336811,"[(37.99491, -85.66809559999999), (37.1867001, ...","LINESTRING (-85.66809559999999 37.99491, -85.3..."
2,210290211013002,210019704011052,106911.0,1,0,1,0,0,1,0,0,1,0,37.99491,-85.668096,37.063926,-85.357581,"[(37.99491, -85.66809559999999), (37.0639255, ...","LINESTRING (-85.66809559999999 37.99491, -85.3..."
3,210290211013002,210019704011070,111035.0,1,0,1,0,0,0,1,0,1,0,37.99491,-85.668096,37.023305,-85.368772,"[(37.99491, -85.66809559999999), (37.0233049, ...","LINESTRING (-85.66809559999999 37.99491, -85.3..."
4,210290211013002,210019704012036,104973.0,1,0,1,0,0,1,0,0,1,0,37.99491,-85.668096,37.093457,-85.308605,"[(37.99491, -85.66809559999999), (37.0934567, ...","LINESTRING (-85.66809559999999 37.99491, -85.3..."


In [16]:
# Declare the coordinates as EPSG:4326.  The authority string is used because the
# {'init': 'epsg:4326'} dictionary form is deprecated CRS syntax.
gdf_ky.crs = 'EPSG:4326'

In [17]:
# Remove the S*/coordinate/geom columns that are not wanted in the export.
# drop() returns a new frame, so the result is re-bound rather than mutated in
# place, and errors='ignore' lets the cell be re-run once the columns are gone.
# (`axis=1` was redundant next to `columns=`.)
gdf_ky = gdf_ky.drop(
    columns=['S000','SA01','SA02','SA03','SE01','SE02','SE03','SI01','SI02','SI03',
             'w_lat','w_lon','h_lat','h_lon','geom'],
    errors='ignore',
)


In [18]:
# Write the state-wide lines to the GeoJSON output path.
gdf_ky.to_file(filename=output_file, driver='GeoJSON')

In [19]:
# Serialise only the geometry column to a GeoJSON string.
gjson = gdf_ky['geometry'].to_json()

In [20]:
# Preview the start of the GeoJSON string (skipping its first character).
gjson[1:1190]

'"type": "FeatureCollection", "features": [{"id": "0", "type": "Feature", "properties": {}, "geometry": {"type": "LineString", "coordinates": [[-85.66809559999999, 37.99491], [-85.0610052, 37.2164811]]}, "bbox": [-85.66809559999999, 37.2164811, -85.0610052, 37.99491]}, {"id": "1", "type": "Feature", "properties": {}, "geometry": {"type": "LineString", "coordinates": [[-85.66809559999999, 37.99491], [-85.33681059999998, 37.1867001]]}, "bbox": [-85.66809559999999, 37.1867001, -85.33681059999998, 37.99491]}, {"id": "2", "type": "Feature", "properties": {}, "geometry": {"type": "LineString", "coordinates": [[-85.66809559999999, 37.99491], [-85.3575811, 37.0639255]]}, "bbox": [-85.66809559999999, 37.0639255, -85.3575811, 37.99491]}, {"id": "3", "type": "Feature", "properties": {}, "geometry": {"type": "LineString", "coordinates": [[-85.66809559999999, 37.99491], [-85.3687719, 37.0233049]]}, "bbox": [-85.66809559999999, 37.0233049, -85.3687719, 37.99491]}, {"id": "4", "type": "Feature", "pro

In [21]:
# Subset the full data set to work locations whose geocode starts with '21067'.
# A boolean mask selects the rows directly instead of dropping the complement
# by index label; na=False counts a missing geocode as "no match".
lex_prefix = '21067'
in_lex = df_All['w_geocode'].str.startswith(lex_prefix, na=False)
df_lex = df_All[in_lex].reset_index(drop=True)
print(len(df_lex))
df_lex.head()

20264


Unnamed: 0,w_geocode,h_geocode,distance,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_lat,w_lon,h_lat,h_lon
0,210670005001014,210059501001056,35814.0,1,0,0,1,0,0,1,0,0,1,38.036375,-84.484479,38.049781,-84.892117
1,210670005001014,210059501001066,36067.0,1,0,1,0,0,0,1,0,0,1,38.036375,-84.484479,38.052372,-84.894859
2,210670005001014,210059501002047,37041.0,1,0,0,1,0,0,1,0,0,1,38.036375,-84.484479,38.054564,-84.905831
3,210670005001014,210059501003028,35885.0,1,0,0,1,0,0,1,0,0,1,38.036375,-84.484479,38.043894,-84.893159
4,210670005001014,210059501004013,35588.0,1,0,0,1,0,0,1,0,0,1,38.036375,-84.484479,38.037828,-84.889856


In [23]:
# Fresh map on the CartoDB Positron basemap, starting at zoom level 7.
m = folium.Map(
    location=[37.645556, -84.769722],
    zoom_start=7,
    tiles='cartodbpositron',
    control_scale=True,
    prefer_canvas=True,
)

In [24]:
# Draw a single sample line on the new map.
mylocation = lines[1000]
# The stroke colour option is `color`; `line_color` is not a recognised
# PolyLine option, so the line was not drawn in red.
my_PolyLine=folium.PolyLine(mylocation, color='#FF0000',weight=1).add_to(m)
print(mylocation)
# The map has to be the cell's last expression for the notebook to render it.
m

[(37.99491, -85.66809559999999), (38.3844836, -84.3548487)]


In [25]:
%time df_lex['geometry'] = df_lex.apply(lambda x: geom.LineString([(x['w_lon'], x['w_lat'] ), (x['h_lon'],x['h_lat'])]), axis = 1)


CPU times: user 939 ms, sys: 14.3 ms, total: 953 ms
Wall time: 959 ms


In [26]:
# Wrap the subset in a GeoDataFrame, using the LineString column as its geometry.
gdf_lex = gp.GeoDataFrame(data=df_lex, geometry='geometry')

In [27]:
# Remove the S*/coordinate columns that are not wanted in the export.
# drop() returns a new frame, so the result is re-bound rather than mutated in
# place, and errors='ignore' lets the cell be re-run once the columns are gone.
# (`axis=1` was redundant next to `columns=`.)
gdf_lex = gdf_lex.drop(
    columns=['S000','SA01','SA02','SA03','SE01','SE02','SE03','SI01','SI02','SI03',
             'w_lat','w_lon','h_lat','h_lon'],
    errors='ignore',
)


In [28]:
# Declare the coordinates as EPSG:4326.  The authority string is used because the
# {'init': 'epsg:4326'} dictionary form is deprecated CRS syntax.
gdf_lex.crs = 'EPSG:4326'

In [29]:
# Preview the first rows after the column drop and CRS assignment.
gdf_lex.head()

Unnamed: 0,w_geocode,h_geocode,distance,geometry
0,210670005001014,210059501001056,35814.0,"LINESTRING (-84.48447940000001 38.0363745, -84..."
1,210670005001014,210059501001066,36067.0,"LINESTRING (-84.48447940000001 38.0363745, -84..."
2,210670005001014,210059501002047,37041.0,"LINESTRING (-84.48447940000001 38.0363745, -84..."
3,210670005001014,210059501003028,35885.0,"LINESTRING (-84.48447940000001 38.0363745, -84..."
4,210670005001014,210059501004013,35588.0,"LINESTRING (-84.48447940000001 38.0363745, -84..."


In [30]:
# Serialise only the geometry column of the subset to a GeoJSON string.
gjson = gdf_lex['geometry'].to_json()

#### Write LEX data to geojson

In [31]:
# Destination for the full '21067' subset, inside the data directory.
output_lex = output_loc / 'lex_all.geojson'
print (output_lex)

../data/lex_all.geojson


In [32]:
%time gdf_lex.to_file(output_lex, driver="GeoJSON")

CPU times: user 9.37 s, sys: 66.5 ms, total: 9.43 s
Wall time: 9.5 s


In [33]:
# Preview the first rows of the frame that was just written.
gdf_lex.head()

Unnamed: 0,w_geocode,h_geocode,distance,geometry
0,210670005001014,210059501001056,35814.0,"LINESTRING (-84.48447940000001 38.0363745, -84..."
1,210670005001014,210059501001066,36067.0,"LINESTRING (-84.48447940000001 38.0363745, -84..."
2,210670005001014,210059501002047,37041.0,"LINESTRING (-84.48447940000001 38.0363745, -84..."
3,210670005001014,210059501003028,35885.0,"LINESTRING (-84.48447940000001 38.0363745, -84..."
4,210670005001014,210059501004013,35588.0,"LINESTRING (-84.48447940000001 38.0363745, -84..."


In [34]:
%time gdf_lex_limited = gdf_lex[(gdf_lex['distance'] < 120000)]

CPU times: user 2.4 ms, sys: 756 µs, total: 3.15 ms
Wall time: 2.34 ms


In [35]:
# Row count of the distance-limited subset, followed by a preview.
print (gdf_lex_limited.shape[0])
gdf_lex_limited.head()

18431


Unnamed: 0,w_geocode,h_geocode,distance,geometry
0,210670005001014,210059501001056,35814.0,"LINESTRING (-84.48447940000001 38.0363745, -84..."
1,210670005001014,210059501001066,36067.0,"LINESTRING (-84.48447940000001 38.0363745, -84..."
2,210670005001014,210059501002047,37041.0,"LINESTRING (-84.48447940000001 38.0363745, -84..."
3,210670005001014,210059501003028,35885.0,"LINESTRING (-84.48447940000001 38.0363745, -84..."
4,210670005001014,210059501004013,35588.0,"LINESTRING (-84.48447940000001 38.0363745, -84..."


In [36]:
# Destination for the distance-limited subset, inside the data directory.
output_loc = pathlib.Path('../data/')
output_lex_limited = output_loc / 'lex_limited_output.geojson'
print (output_lex_limited)

../data/lex_limited_output.geojson


In [37]:
%time gdf_lex_limited.to_file(output_lex_limited, driver="GeoJSON")

CPU times: user 8.47 s, sys: 50.7 ms, total: 8.52 s
Wall time: 8.56 s


#### Convert distance from a float to a string with no decimal part