In [1]:
import pandas as pd

import numpy as np

from csynthpop import zone_synthesizer as zs

#### Specify sample data CSV paths. See the files listed here for the expected structure. Marginal tables require multi-indexed columns, with the category name in level 0 and the category value in level 1 of the index. Sample-file category columns should be labeled with the corresponding category names, and the values in those columns should match the category-value headers in the marginal table.

In [2]:
# Input CSV locations (relative paths under ./data).
# Two marginal tables (household, person) and two sample tables (household, person);
# all four are passed to zs.load_data in a later cell.
hh_marginal_file = './data/hh_marginals.csv'
person_marginal_file = './data/person_marginals.csv'
hh_sample_file = './data/household_sample.csv'
person_sample_file = './data/person_sample.csv'

In [3]:
def convert_to_str(margin, sample):
    """Return a copy of ``sample`` with its category columns cast to ``str``.

    A column counts as a category column when its name appears in level 0 of
    ``margin.columns`` (the category-name level of the marginal table).
    Columns of ``sample`` that are not named there are returned unchanged.

    Parameters
    ----------
    margin : pandas.DataFrame
        Marginal table; level 0 of its columns holds the category names.
    sample : pandas.DataFrame
        Sample table whose matching columns are to be converted.

    Returns
    -------
    pandas.DataFrame
        A new frame; the ``sample`` object passed in is left unmodified, so
        re-running the calling cell always starts from the same input.
    """
    # Level 0 holds the category names; unique() collapses the repeats that
    # come from one column per category value.
    category_names = margin.columns.get_level_values(0).unique()

    # Work on a copy so the caller's frame is never altered as a side effect.
    converted = sample.copy()
    for name in category_names:
        if name in converted.columns:
            converted[name] = converted[name].astype(str)
    return converted

#### Load and process the input marginals, samples, and geography crosswalk

In [4]:
# Read the four input CSVs; load_data hands back both marginal tables,
# both sample tables, and the geography crosswalk.
(hh_marg, p_marg, hh_sample, p_sample, xwalk) = zs.load_data(
    hh_marginal_file,
    person_marginal_file,
    hh_sample_file,
    person_sample_file,
)

# Cast each sample's category columns to str, using the matching marginal
# table to decide which columns are categories.
hh_sample = convert_to_str(hh_marg, hh_sample)
p_sample = convert_to_str(p_marg, p_sample)

Custom Synthpop!


In [19]:
xwalk

[(49.0, 1), (50.0, 1), (51.0, 1), (52.0, 1)]

In [5]:
hh_sample

Unnamed: 0,serialno,sample_geog,BEDRD,MRERD,RNTRD,STRD,VEHRD
0,CSF11B00068468,51,3,11,22,1,2
1,CSF11B00074011,50,5,21,22,1,2
2,CSF11B00074253,49,2,19,22,1,1
3,CSF11B00074820,51,2,21,12,1,1
4,CSF11B00075651,49,4,21,22,1,4
5,CSF11B00075690,49,4,13,22,1,4
6,CSF11B00075691,49,4,21,22,1,2
7,CSF11B00075692,49,7,21,22,6,6
8,CSF11B00075693,49,3,21,22,1,2
9,CSF11B00075694,49,3,21,22,1,2


In [6]:
# Cast every column of both marginal tables to integer dtype.
# NOTE(review): the reason for the cast is not visible in this notebook —
# presumably load_data returns the counts as a non-integer dtype; confirm.
hh_marg = hh_marg.astype(int)
p_marg = p_marg.astype(int)

In [18]:
xwalk

[(49.0, 1), (50.0, 1), (51.0, 1), (52.0, 1)]

In [7]:
hh_marg.head()

cat_name,BEDRD,BEDRD,BEDRD,BEDRD,BEDRD,BEDRD,BEDRD,BEDRD,MRERD,MRERD,...,STRD,STRD,STRD,VEHRD,VEHRD,VEHRD,VEHRD,VEHRD,VEHRD,VEHRD
cat_values,0,1,2,3,4,5,6,7,1,2,...,4,5,6,0,1,2,3,4,5,6
49.0,14,162,586,1076,782,164,198,268,10,8,...,24,2,268,246,996,1064,282,170,224,268
50.0,8,74,330,1344,1492,196,178,100,28,6,...,22,2,100,166,1080,1488,486,200,202,100
51.0,12,96,344,1344,1146,170,162,158,12,18,...,16,0,158,212,1022,1214,402,224,200,158
52.0,8,72,298,928,1070,138,152,116,26,6,...,26,2,116,142,858,1000,312,188,166,116


In [8]:
p_marg.head()

cat_name,INCP,INCP,INCP,INCP,INCP,INCP,INCP,INCP,INCP,INCP,...,RLHP,RLHP,RLHP,RLHP,RLHP,RLHP,RLHP,RLHP,SEXP,SEXP
cat_values,1,2,3,4,5,6,7,8,9,10,...,4,5,6,7,8,9,10,11,1,2
49.0,38,448,462,484,488,628,482,494,488,392,...,382,364,142,1136,210,316,338,124,3794,3862
50.0,42,566,522,674,600,766,668,644,656,512,...,480,482,188,1106,220,288,110,96,4664,4944
51.0,40,588,466,624,688,758,612,592,610,448,...,400,506,188,1230,204,254,194,88,4256,4246
52.0,28,480,368,458,492,594,484,456,462,384,...,364,408,148,814,184,254,132,68,3424,3532


In [9]:
p_sample.head()

Unnamed: 0,serialno,sample_geog,AGEP,INCP,INDP,LFSP,OCCP,RLHP,SEXP
0,CSF11B00068468,51,25,9,16,1,5,5,1
1,CSF11B00068468,51,36,5,22,3,10,6,2
2,CSF11B00068468,51,30,4,22,3,10,1,1
3,CSF11B00068468,51,29,5,22,3,10,1,2
4,CSF11B00068468,51,25,7,10,1,5,5,1


In [10]:
p_sample.columns

Index(['serialno', 'sample_geog', 'AGEP', 'INCP', 'INDP', 'LFSP', 'OCCP',
       'RLHP', 'SEXP'],
      dtype='object')

In [11]:
hh_sample.columns

Index(['serialno', 'sample_geog', 'BEDRD', 'MRERD', 'RNTRD', 'STRD', 'VEHRD'], dtype='object')

In [12]:
hh_sample

Unnamed: 0,serialno,sample_geog,BEDRD,MRERD,RNTRD,STRD,VEHRD
0,CSF11B00068468,51,3,11,22,1,2
1,CSF11B00074011,50,5,21,22,1,2
2,CSF11B00074253,49,2,19,22,1,1
3,CSF11B00074820,51,2,21,12,1,1
4,CSF11B00075651,49,4,21,22,1,4
5,CSF11B00075690,49,4,13,22,1,4
6,CSF11B00075691,49,4,21,22,1,2
7,CSF11B00075692,49,7,21,22,6,6
8,CSF11B00075693,49,3,21,22,1,2
9,CSF11B00075694,49,3,21,22,1,2


In [13]:
xwalk[1]

(50.0, 1)

In [14]:
p_sample.dtypes

serialno       object
sample_geog     int64
AGEP            int64
INCP            int64
INDP            int64
LFSP            int64
OCCP            int64
RLHP            int64
SEXP            int64
dtype: object

In [15]:
# xwalk[1] is a tuple ((50.0, 1) in the recorded run), so comparing the
# sample_geog column against it directly tests each row against the whole
# tuple rather than a single id. Compare against one element instead.
# NOTE(review): assumes each crosswalk entry is (zone id, sample geography id),
# so index 1 is the value sample_geog should match — confirm against zs.load_data.
hh_sample[hh_sample.sample_geog == xwalk[1][1]]

Unnamed: 0,serialno,sample_geog,BEDRD,MRERD,RNTRD,STRD,VEHRD


In [16]:
xwalk

[(49.0, 1), (50.0, 1), (51.0, 1), (52.0, 1)]

#### Iterate over all marginals in the geography crosswalk and synthesize in-line

In [17]:
# Synthesize households and persons for the entries in the geography crosswalk.
# NOTE(review): the recorded output of this cell ends in
# "ValueError: cannot insert VEHRD, already exists", printed right after the
# library reports an empty "sample df". Presumably no hh_sample rows match the
# sample geography named in xwalk — confirm before relying on the cells below,
# which have no recorded output.
all_households, all_persons, all_stats = zs.synthesize_all_zones(hh_marg, p_marg, hh_sample, p_sample, xwalk)

sample df Empty DataFrame
Columns: [serialno, sample_geog, BEDRD, MRERD, RNTRD, STRD, VEHRD]
Index: []
cate df                               cat_id
BEDRD MRERD RNTRD STRD VEHRD        
0     1     1     1    0           0
                       1           1
                       2           2
                       3           3
                       4           4
                       5           5
                       6           6
                  2    0           7
                       1           8
                       2           9
                       3          10
                       4          11
                       5          12
                       6          13
                  3    0          14
                       1          15
                       2          16
                       3          17
                       4          18
                       5          19
                       6          20
                  4    0          21
 

ValueError: cannot insert VEHRD, already exists

In [None]:
all_households.head()

In [None]:
all_households.shape

In [None]:
all_persons.shape

#### all_persons.household_id maps person records to all_households.index

In [None]:
all_persons.head()

In [None]:
all_stats