In [20]:
import pandas as pd

import numpy as np

import synthpop.zone_synthesizer as zs

#### Specify sample data CSV paths. See the files listed below for the expected structure. Marginal tables require multi-indexed columns, with the category name in level 0 and the category value in level 1 of the column index. Category columns in the sample files should be labeled with the corresponding category names, and the values in those columns should match the category value headers in the marginal table.

In [21]:
# Relative paths to the four input CSV files.
# Marginal tables (households, persons):
hh_marginal_file = './data/hh_marginals.csv'
person_marginal_file = './data/person_marginals.csv'
# Sample tables (households, persons):
hh_sample_file = './data/household_sample.csv'
person_sample_file = './data/person_sample.csv'

In [22]:
def convert_to_str(margin, sample):
    """Return a copy of ``sample`` with its marginal category columns cast to ``str``.

    Category names are read from level 0 of ``margin.columns``. Every such
    name that is also a column of ``sample`` is converted with
    ``astype(str)``; all other columns are carried over unchanged.

    The caller's ``sample`` frame is not modified, so the calling cell can be
    re-run safely and earlier displays of the raw sample remain accurate.

    Parameters
    ----------
    margin : pandas.DataFrame
        Marginal table whose column index holds the category name in
        level 0, e.g. ``('SEXP', '1')``.
    sample : pandas.DataFrame
        Sample table whose category columns are labeled with the same
        category names.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``sample`` in which
        the shared category columns hold strings.
    """
    # Level 0 of the column index is the category name; unique() drops the
    # repeats that come from one entry per category value.
    category_names = margin.columns.get_level_values(0).unique()
    shared = [name for name in category_names if name in sample.columns]
    # astype with a column -> dtype mapping returns a new frame and leaves
    # columns that are not listed as they were.
    return sample.astype({name: str for name in shared})

#### Load and process the input marginals, samples, and geography crosswalk

In [23]:
# Read the marginal and sample CSVs; load_data also hands back the crosswalk.
hh_marg, p_marg, hh_sample, p_sample, xwalk = zs.load_data(
    hh_marginal_file,
    person_marginal_file,
    hh_sample_file,
    person_sample_file,
)

# Cast the samples' category columns to strings so their values are comparable
# with the string category values in the marginal column headers.
hh_sample = convert_to_str(hh_marg, hh_sample)
p_sample = convert_to_str(p_marg, p_sample)

In [24]:
# Preview the household sample; head(10) bounds the rendered output instead of
# displaying the whole frame.
hh_sample.head(10)

Unnamed: 0,sample_geog,BEDRD,DWTD,HIND,MRERD,NPRD,STRD,TEND,VEHRD
0,1,2 bedrooms,Occupied private dwellings,"$1,500-$1,999","$2,400-$2,599",2,Separate house,Owned with a mortgage (includes being purchase...,2 motor vehicles
1,1,4 bedrooms,Occupied private dwellings,"$2,000-$2,499","$1,800-$1,999",2,Separate house,Owned with a mortgage (includes being purchase...,2 motor vehicles
2,1,3 bedrooms,Occupied private dwellings,$400-$599,Not applicable,3,Separate house,Rented (includes being occupied rent free),2 motor vehicles
3,1,Not stated,Occupied private dwellings,Not applicable,Not applicable,1,Separate house,Not stated,Not stated
4,1,4 bedrooms,Occupied private dwellings,$300-$399,Not applicable,1,Separate house,Owned outright,1 motor vehicle
5,1,4 bedrooms,Occupied private dwellings,"$2,500-$2,999","$3,000-$3,999",4,Separate house,Owned with a mortgage (includes being purchase...,2 motor vehicles
6,1,5 or more bedrooms,Occupied private dwellings,"$1,000-$1,249",Not applicable,1,Separate house,Owned outright,1 motor vehicle
7,1,4 bedrooms,Occupied private dwellings,"$2,000-$2,499","$2,600-$2,999",3,Separate house,Owned with a mortgage (includes being purchase...,2 motor vehicles
8,1,4 bedrooms,Occupied private dwellings,"$1,000-$1,249",Not applicable,2,Separate house,Owned outright,2 motor vehicles
9,1,3 bedrooms,Occupied private dwellings,$800-$999,Not applicable,1,Separate house,Rented (includes being occupied rent free),1 motor vehicle


In [25]:
# Preview the household marginals (columns are (cat_name, cat_values) pairs).
hh_marg.head()

cat_name,BEDRD,BEDRD,BEDRD,BEDRD,BEDRD,BEDRD,BEDRD,BEDRD,DWTD,DWTD,...,TEND,TEND,TEND,VEHRD,VEHRD,VEHRD,VEHRD,VEHRD,VEHRD,VEHRD
cat_values,1 bedroom,2 bedrooms,3 bedrooms,4 bedrooms,5 or more bedrooms,None (includes bedsitters),Not applicable,Not stated,Non-private dwellings,Occupied private dwellings,...,Owned outright,Owned with a mortgage (includes being purchased under a rent/buy scheme),Rented (includes being occupied rent free),1 motor vehicle,2 motor vehicles,3 motor vehicles,4 or more motor vehicles,None,Not applicable,Not stated
1.0,138.0,618.0,1652.0,1036.0,224.0,20.0,478.0,234.0,478.0,3922.0,...,1396.0,1262.0,954.0,1300.0,1454.0,406.0,214.0,240.0,478.0,308.0
2.0,80.0,450.0,1030.0,610.0,122.0,12.0,114.0,120.0,114.0,2424.0,...,840.0,772.0,656.0,902.0,806.0,236.0,90.0,224.0,114.0,166.0
3.0,146.0,462.0,1458.0,888.0,204.0,12.0,264.0,196.0,264.0,3366.0,...,1142.0,1124.0,882.0,1114.0,1230.0,388.0,186.0,236.0,264.0,212.0
4.0,240.0,904.0,1990.0,1010.0,210.0,40.0,354.0,240.0,354.0,4634.0,...,1840.0,1214.0,1272.0,1920.0,1498.0,402.0,142.0,358.0,354.0,314.0
5.0,152.0,498.0,1374.0,768.0,162.0,26.0,332.0,216.0,332.0,3196.0,...,1128.0,908.0,906.0,1136.0,1046.0,310.0,160.0,276.0,332.0,268.0


In [26]:
# Preview the person marginals (columns are (cat_name, cat_values) pairs).
p_marg.head()

cat_name,CHCAREP,CHCAREP,CHCAREP,CHCAREP,CHCAREP,HRSP,HRSP,HRSP,HRSP,HRSP,...,OCCP,OCCP,OCCP,OCCP,OCCP,OCCP,OCCP,OCCP,SEXP,SEXP
cat_values,1,2,3,4,5,1,2,3,4,5,...,4,5,6,7,8,9,10,11,1,2
1.0,5186.0,2180.0,748.0,1926.0,34.0,5864.0,460.0,386.0,462.0,724.0,...,506.0,552.0,372.0,294.0,492.0,96.0,5666.0,34.0,5106.0,4968.0
2.0,2974.0,1498.0,392.0,1268.0,34.0,3670.0,332.0,252.0,310.0,532.0,...,324.0,368.0,246.0,164.0,272.0,44.0,3554.0,34.0,2974.0,3192.0
3.0,4186.0,2092.0,530.0,1806.0,34.0,4926.0,414.0,338.0,374.0,636.0,...,348.0,470.0,346.0,468.0,430.0,70.0,4748.0,34.0,4302.0,4346.0
4.0,6238.0,2326.0,634.0,1964.0,64.0,7132.0,570.0,474.0,590.0,786.0,...,470.0,586.0,462.0,234.0,460.0,64.0,6954.0,64.0,5486.0,5740.0
5.0,3812.0,1836.0,554.0,1722.0,22.0,4676.0,354.0,306.0,334.0,588.0,...,378.0,394.0,272.0,236.0,412.0,68.0,4534.0,22.0,3924.0,4022.0


In [27]:
# Preview the person sample records.
p_sample.head()

Unnamed: 0,sample_geog,AGEP,CHCAREP,HRSP,INCP,INDP,OCCP,SEXP
0,1,31,1,2,11,16,2,2
1,1,20,1,1,3,22,10,1
2,1,25,1,7,9,13,1,1
3,1,26,1,5,8,17,2,2
4,1,28,2,3,6,16,2,2


In [28]:
# List the person sample's column labels for comparison with the marginal category names.
p_sample.columns

Index(['sample_geog', 'AGEP', 'CHCAREP', 'HRSP', 'INCP', 'INDP', 'OCCP',
       'SEXP'],
      dtype='object')

In [29]:
# Print every (category name, category value) column key of the person marginals.
for column_key in p_marg.columns:
    print(column_key)

('CHCAREP', '1')
('CHCAREP', '2')
('CHCAREP', '3')
('CHCAREP', '4')
('CHCAREP', '5')
('HRSP', '1')
('HRSP', '2')
('HRSP', '3')
('HRSP', '4')
('HRSP', '5')
('HRSP', '6')
('HRSP', '7')
('HRSP', '8')
('HRSP', '9')
('HRSP', '10')
('INCP', '1')
('INCP', '2')
('INCP', '3')
('INCP', '4')
('INCP', '5')
('INCP', '6')
('INCP', '7')
('INCP', '8')
('INCP', '9')
('INCP', '10')
('INCP', '11')
('INCP', '12')
('INCP', '13')
('INCP', '14')
('INCP', '15')
('INDP', '1')
('INDP', '2')
('INDP', '3')
('INDP', '4')
('INDP', '5')
('INDP', '6')
('INDP', '7')
('INDP', '8')
('INDP', '9')
('INDP', '10')
('INDP', '11')
('INDP', '12')
('INDP', '13')
('INDP', '14')
('INDP', '15')
('INDP', '16')
('INDP', '17')
('INDP', '18')
('INDP', '19')
('INDP', '20')
('INDP', '21')
('INDP', '22')
('INDP', '23')
('OCCP', '1')
('OCCP', '2')
('OCCP', '3')
('OCCP', '4')
('OCCP', '5')
('OCCP', '6')
('OCCP', '7')
('OCCP', '8')
('OCCP', '9')
('OCCP', '10')
('OCCP', '11')
('SEXP', '1')
('SEXP', '2')


In [11]:
# Inspect the person sample's column dtypes.
# NOTE(review): the saved output (all int64) carries execution count 11, i.e. it was
# recorded before the convert_to_str cells (counts 22-23) ran; re-run to refresh it.
p_sample.dtypes

sample_geog    int64
AGEP           int64
CHCAREP        int64
HRSP           int64
INCP           int64
INDP           int64
OCCP           int64
SEXP           int64
dtype: object

#### Iterate over all marginals in the geography crosswalk and synthesize in-line

In [9]:
# Synthesize households and persons for all zones from the marginals, samples and crosswalk.
# NOTE(review): the recorded ValueError (merge on int64 and object) carries execution
# count 9, i.e. it predates the convert_to_str cells (counts 22-23); re-run this cell
# after the conversion to confirm whether the dtype mismatch is resolved.
all_households, all_persons, all_stats = zs.synthesize_all_zones(hh_marg, p_marg, hh_sample, p_sample, xwalk)

ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

In [None]:
# Preview the synthesized household records.
all_households.head()

In [None]:
# (rows, columns) of the synthesized household table.
all_households.shape

In [None]:
# (rows, columns) of the synthesized person table.
all_persons.shape

#### `all_persons.household_id` maps person records to `all_households.index`

In [None]:
# Preview the synthesized person records.
all_persons.head()

In [None]:
# Display the third value returned by zs.synthesize_all_zones.
all_stats