In [1]:
import pathlib
import pandas as pd
import folium

In [None]:
# Build the path to the origin-destination (OD) extract relative to the notebook.
input_file = 'od_aux.csv.gz'
ODpath = pathlib.Path('../data/OD/')
ODfile = ODpath.joinpath(input_file)

# Pass the path straight to pandas. The previous version opened the file in a
# `with` block but never used the handle (read_csv was given the path anyway),
# so the extra open() only created and closed an unused file object.
# Both geocode columns are loaded as objects (strings) instead of being parsed
# as numbers; later cells filter them with .str.startswith().
df_All = pd.read_csv(ODfile, dtype={'w_geocode': object, 'h_geocode': object})

print ('input file: ',ODfile)
print('\nrecords loaded to dataframe:', "{:,}".format(len(df_All)),'\n\n')
df_All.head(1)

#### Set the state to analyze

In [None]:
# Prefix that work-location geocodes (w_geocode) must start with to be kept by the filter cell below.
state_id = '21'

#### Drop all records where the work location does not begin with the chosen state_id

In [None]:
# Keep only the rows whose work-location geocode starts with state_id.
# The rows are selected with a boolean mask instead of computing the index of
# the non-matching rows and dropping them by label. na=False makes a missing
# geocode count as "no match" rather than producing a NaN in the mask.
df = df_All[df_All['w_geocode'].str.startswith(state_id, na=False)].reset_index(drop=True)
print('\nnew dataframe length: ', "{:,}".format(len(df)),'\n\n')
df.head(1)

In [5]:
# Prefix that home-location geocodes (h_geocode) must start with to be kept when df2 is built from df.
out_state = '18'

In [6]:
# Print the row count, column dtypes and memory usage of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470400 entries, 0 to 1470399
Data columns (total 12 columns):
w_geocode    1470400 non-null object
h_geocode    1470400 non-null object
S000         1470400 non-null int64
SA01         1470400 non-null int64
SA02         1470400 non-null int64
SA03         1470400 non-null int64
SE01         1470400 non-null int64
SE02         1470400 non-null int64
SE03         1470400 non-null int64
SI01         1470400 non-null int64
SI02         1470400 non-null int64
SI03         1470400 non-null int64
dtypes: int64(10), object(2)
memory usage: 134.6+ MB


In [7]:
# From df, keep only the rows whose home-location geocode starts with out_state.
# A boolean mask selects the matching rows directly (no drop-by-index round
# trip); na=False makes a missing geocode count as "no match" rather than
# producing a NaN in the mask.
df2 = df[df['h_geocode'].str.startswith(out_state, na=False)].reset_index(drop=True)
print('\nnew dataframe length: ', "{:,}".format(len(df2)),'\n\n')
df2.head(10)


new dataframe length:  54,109 




Unnamed: 0,w_geocode,h_geocode,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03
0,210019704011008,181619608001025,1,0,0,1,0,1,0,0,0,1
1,210019704011024,181399745004001,1,0,0,1,1,0,0,0,1,0
2,210019704013028,180610603001027,1,1,0,0,1,0,0,0,1,0
3,210019704013032,181379689002046,1,0,1,0,1,0,0,0,0,1
4,210019704013037,180799603012034,1,1,0,0,1,0,0,0,0,1
5,210019704013048,180050105002006,1,0,0,1,0,0,1,0,0,1
6,210019704013053,180050103003012,1,0,0,1,0,0,1,0,0,1
7,210019704022002,180430710062001,1,0,1,0,0,1,0,1,0,0
8,210019704022014,180973221002005,1,0,1,0,0,0,1,0,0,1
9,210019704022024,180973419032012,1,0,0,1,0,1,0,0,0,1


In [10]:
# Number of distinct work-location geocodes present in df2.
df2['w_geocode'].nunique()

6837

In [11]:
# Sum the numeric count columns for every workplace geocode.
# numeric_only=True keeps the string h_geocode column out of the aggregation;
# pandas versions that no longer drop non-numeric columns automatically would
# otherwise try to "sum" those strings.
df_group = df.groupby(['w_geocode']).sum(numeric_only=True)
# include only the blocks whose summed S000 is at least 1000 (note: >=, not >)
df_group_limited = df_group[(df_group['S000'] >= 1000)]
# list of the workplace geocodes that met the threshold
top_block_list = df_group_limited.index.tolist()
# rows of df whose workplace geocode is in that list
# NOTE(review): this rebinds df2, which an earlier cell assigned to the
# out_state subset of df - confirm the overwrite is intended.
df2 = df[df['w_geocode'].isin(top_block_list)]
# TODO: sort the dataframe (no sort is performed in this cell)

In [62]:
# Print the row count, column dtypes and memory usage of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470400 entries, 0 to 1470399
Data columns (total 12 columns):
w_geocode    1470400 non-null object
h_geocode    1470400 non-null object
S000         1470400 non-null int64
SA01         1470400 non-null int64
SA02         1470400 non-null int64
SA03         1470400 non-null int64
SE01         1470400 non-null int64
SE02         1470400 non-null int64
SE03         1470400 non-null int64
SI01         1470400 non-null int64
SI02         1470400 non-null int64
SI03         1470400 non-null int64
dtypes: int64(10), object(2)
memory usage: 134.6+ MB


In [64]:

# Sum the numeric count columns per workplace geocode. as_index=False keeps
# w_geocode as an ordinary column instead of moving it into the index.
# numeric_only=True keeps the string h_geocode column out of the aggregation;
# pandas versions that no longer drop non-numeric columns automatically would
# otherwise try to "sum" those strings.
df_group = df.groupby(['w_geocode'], as_index=False).sum(numeric_only=True)
df_group.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28301 entries, 0 to 28300
Data columns (total 11 columns):
w_geocode    28301 non-null object
S000         28301 non-null int64
SA01         28301 non-null int64
SA02         28301 non-null int64
SA03         28301 non-null int64
SE01         28301 non-null int64
SE02         28301 non-null int64
SE03         28301 non-null int64
SI01         28301 non-null int64
SI02         28301 non-null int64
SI03         28301 non-null int64
dtypes: int64(10), object(1)
memory usage: 2.6+ MB


In [65]:
# Hold on to the column labels of df and echo them.
cols_to_use = df.columns
print(cols_to_use)

Index(['w_geocode', 'h_geocode', 'S000', 'SA01', 'SA02', 'SA03', 'SE01',
       'SE02', 'SE03', 'SI01', 'SI02', 'SI03'],
      dtype='object')


In [66]:

# Keep the per-block totals whose S000 is 1000 or more, then show the first row.
df_group_limited = df_group.loc[df_group['S000'].ge(1000)]
df_group_limited.head(1)

Unnamed: 0,w_geocode,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03
604,210099505002000,1238,137,854,247,71,599,568,0,0,1238


In [94]:
# Join each row of df to its workplace block's totals from df_group_limited.
# Columns coming from the totals frame get a "_del" suffix; the S000 total is
# kept under the name w_group_count and every remaining "_del" column is dropped.
result = (
    pd.merge(df, df_group_limited, on='w_geocode', suffixes=('', '_del'))
    .rename(columns={'S000_del': 'w_group_count'})
    .pipe(lambda joined: joined.drop(list(joined.filter(regex='_del')), axis=1))
)
result.head(1)

Unnamed: 0,w_geocode,h_geocode,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_group_count
0,210099505002000,210019702001066,1,0,1,0,0,0,1,0,0,1,1238


In [91]:
# Row counts of df, df_group_limited and result, one per line, followed by a preview of result.
print(len(df), len(df_group_limited), len(result), sep='\n')
result.head()

1470400
223
336546


Unnamed: 0,w_geocode,w_group_count,h_geocode
0,210099505002000,1238,210019702001066
1,210099505002000,1238,210019702001086
2,210099505002000,1238,210019703001021
3,210099505002000,1238,210019703001030
4,210099505002000,1238,210019703002035
