In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Raise pandas' display limits so wide/long frames are shown without truncation.
display_limits = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)

In [4]:
## load the complete households table of the scenario

households_path = "../../production/newyork/urbansim_v2/13122k-NYC-all-ages/households.csv.gz"
households_all = pd.read_csv(households_path)

# report (rows, columns), then preview the first three rows as the cell output
print(households_all.shape)
households_all.head(3)

(4864508, 4)


Unnamed: 0,household_id,income,cars,block_id
0,34-013-013400-5:2009000778784:138,64380,2,340130134005
1,34-013-013400-5:2011001033670:215,281000,4,340130134005
2,34-013-013400-5:2011001045287:27,90000,1,340130134005


In [7]:
## split all households into 3 parts, shuffling before

# Fixed seed so the shuffle, and therefore the household-to-part assignment,
# is identical on every run of the notebook.
SPLIT_SEED = 42
N_PARTS = 3

households_shuffled = households_all.sample(frac=1, random_state=SPLIT_SEED)

# Split row positions instead of the frame itself: np.array_split applied to a
# DataFrame goes through DataFrame.swapaxes, which recent pandas deprecates.
# Splitting positions keeps array_split's sizing rule (the earlier parts take
# the extra rows when the length is not divisible), so part sizes are unchanged.
part_positions = np.array_split(np.arange(len(households_shuffled)), N_PARTS)
hh_dfs = [households_shuffled.iloc[positions] for positions in part_positions]

for hh_df in hh_dfs:
    print(f"the shape is {hh_df.shape}")

the shape is (1621503, 4)
the shape is (1621503, 4)
the shape is (1621502, 4)


In [9]:
## load the remaining scenario tables: persons, plans and blocks

persons_path = "../../production/newyork/urbansim_v2/13122k-NYC-all-ages/persons.csv.gz"
plans_path = "../../production/newyork/urbansim_v2/13122k-NYC-all-ages/plans.csv.gz"
blocks_path = "../../production/newyork/urbansim_v2/13122k-NYC-all-ages/blocks.csv.gz"

persons_all = pd.read_csv(persons_path)
plans_all = pd.read_csv(plans_path)
blocks_all = pd.read_csv(blocks_path)
print(f"there are {len(persons_all)} rows in persons, {len(plans_all)} rows in plans, {len(blocks_all)} rows in blocks")

# preview the first three rows of each table, in the order they were loaded
for table_preview in (persons_all.head(3), plans_all.head(3), blocks_all.head(3)):
    display(table_preview)

there are 13122080 rows in persons, 39216776 rows in plans, 4864508 rows in blocks


Unnamed: 0,person_id,household_id,age,sex,industry
0,1,34-013-013400-5:2009000778784:138,28,1,manufacturing
1,0,34-013-013400-5:2009000778784:138,57,2,educational / health
2,9,34-013-013400-5:2011001033670:215,23,1,information


Unnamed: 0,trip_id,person_id,PlanElementIndex,ActivityElement,trip_mode,ActivityType,x,y,departure_time
0,,1,1,activity,,Home,-74.163214,40.836926,7.26
1,,1,2,leg,,,,,
2,,1,3,activity,,Work,-74.221004,40.820302,17.831023


Unnamed: 0,block_id,x,y
0,340130134005,-74.163214,40.836926
1,340130134005,-74.163017,40.837018
2,340130134005,-74.162813,40.836792


In [10]:
# For every household part, collect the persons, blocks and plans rows that
# belong to it. The three result lists are index-aligned with hh_dfs.
persons_dfs = []
plans_dfs = []
blocks_dfs = []

# enumerate(..., start=1) numbers the parts 1..N for however many parts hh_dfs
# holds; zipping against a literal [1, 2, 3] would silently skip any part
# beyond the third.
for i, hh in enumerate(hh_dfs, start=1):
    # persons whose household_id occurs in this household part
    selected_hh = set(hh['household_id'])
    persons_df = persons_all[persons_all['household_id'].isin(selected_hh)].copy()
    persons_dfs.append(persons_df)

    # every blocks row whose block_id is referenced by a household of this part
    selected_blocks = set(hh['block_id'])
    blocks_df = blocks_all[blocks_all['block_id'].isin(selected_blocks)].copy()
    blocks_dfs.append(blocks_df)

    # plan rows of the persons selected above
    selected_persons = set(persons_df['person_id'])
    plans_df = plans_all[plans_all['person_id'].isin(selected_persons)].copy()
    plans_dfs.append(plans_df)
    print(f'hh {i} processed')

hh 1 processed
hh 2 processed
hh 3 processed


In [11]:
# Write each part's four tables (persons, households, blocks, plans) as
# .csv.gz files into its own scenario directory.
base_path = '../../production/newyork/urbansim_v2/13122k-NYC-all-ages'

# One output directory per household part: <base>-part1, <base>-part2, ...
# Deriving the list from hh_dfs keeps it in step with the number of parts.
paths = [f"{base_path}-part{part_number}" for part_number in range(1, len(hh_dfs) + 1)]

# zip() stops at the shortest input, so mismatched list lengths would silently
# drop parts from the output; fail loudly instead.
part_counts = {len(persons_dfs), len(hh_dfs), len(blocks_dfs), len(plans_dfs), len(paths)}
if len(part_counts) != 1:
    raise ValueError(
        f"mismatched number of parts: persons={len(persons_dfs)}, households={len(hh_dfs)}, "
        f"blocks={len(blocks_dfs)}, plans={len(plans_dfs)}, paths={len(paths)}"
    )

for (persons_df, households_df, blocks_df, plans_df, path) in zip(persons_dfs, hh_dfs, blocks_dfs, plans_dfs, paths):
    print(f"processing {path} ...")
    # to_csv does not create missing directories; make sure the target exists
    os.makedirs(path, exist_ok=True)
    persons_df.to_csv(f"{path}/persons.csv.gz", index=False)
    households_df.to_csv(f"{path}/households.csv.gz", index=False)
    blocks_df.to_csv(f"{path}/blocks.csv.gz", index=False)
    plans_df.to_csv(f"{path}/plans.csv.gz", index=False)

print('done')

processing ../../production/newyork/urbansim_v2/13122k-NYC-all-ages-part1 ...
processing ../../production/newyork/urbansim_v2/13122k-NYC-all-ages-part2 ...
processing ../../production/newyork/urbansim_v2/13122k-NYC-all-ages-part3 ...
done
