# Synthesizing the PUF with `synthimpute` 

In [1]:
import pandas as pd
import numpy as np
import synthimpute as si

## Load data

The columns to synthesize are listed in https://github.com/donboyd5/synpuf/issues/4.

In [7]:
# Lower-case names of the 65 columns used in this notebook, in alphabetical
# order. They are upper-cased when the CSV is read.
COLS = """
    dsi
    e00200 e00300 e00400 e00600 e00650 e00700 e00800 e00900
    e01100 e01200 e01400 e01500 e01700
    e02000 e02100 e02300 e02400
    e03150 e03210 e03220 e03230 e03240 e03270 e03290 e03300 e03400 e03500
    e07240 e07260 e07300 e07400 e07600
    e09700 e09800 e09900
    e11200 e17500 e18400 e18500 e19200 e19800
    e20100 e20400 e24515 e24518 e26270 e27200
    e32800 e58990 e62900 e87521 e87530
    eic f2441 f6251 fded mars midr n24
    p08000 p22250 p23250
    s006 xtot
""".split()

In [29]:
# RECID is read alongside the synthesis columns; it is needed only to
# exclude the 4 aggregate records.
input_cols = [*map(str.upper, COLS), 'RECID']
raw = pd.read_csv(
    'puf2011.csv',
    usecols=input_cols,
)

## Preprocess

Drop the four aggregate records and the `RECID` column, then draw a 10% sample (`train`) of the remaining records.

In [31]:
# RECID values of the aggregate records: 999996 through 999999.
AGG_RECIDS = list(range(999996, 1000000))
# Keep only rows whose RECID is not an aggregate, then discard the RECID column.
full = raw.loc[~raw['RECID'].isin(AGG_RECIDS)].drop(columns='RECID')
# Reproducible 10% random sample of the remaining rows.
train = full.sample(random_state=0, frac=0.1)

## Synthesize

In [38]:
%%time
synth = si.rf_synth(train, random_state=0, seed_cols=['DSI', 'XTOT'], trees=20)

Synthesizing feature 1 of 63: E18400...
Synthesizing feature 2 of 63: E20400...
Synthesizing feature 3 of 63: E03220...
Synthesizing feature 4 of 63: E09900...
Synthesizing feature 5 of 63: E07400...
Synthesizing feature 6 of 63: EIC...
Synthesizing feature 7 of 63: FDED...
Synthesizing feature 8 of 63: E02100...
Synthesizing feature 9 of 63: N24...
Synthesizing feature 10 of 63: F6251...
Synthesizing feature 11 of 63: E00900...
Synthesizing feature 12 of 63: E03240...
Synthesizing feature 13 of 63: E07600...
Synthesizing feature 14 of 63: E87521...
Synthesizing feature 15 of 63: E00650...
Synthesizing feature 16 of 63: P23250...
Synthesizing feature 17 of 63: E03290...
Synthesizing feature 18 of 63: E24518...
Synthesizing feature 19 of 63: E01100...
Synthesizing feature 20 of 63: E01700...
Synthesizing feature 21 of 63: P08000...
Synthesizing feature 22 of 63: E24515...
Synthesizing feature 23 of 63: E01400...
Synthesizing feature 24 of 63: E03300...
Synthesizing feature 25 of 63: E03

In [39]:
%%time
synth_full = si.rf_synth(full, random_state=0, seed_cols=['DSI', 'XTOT'], trees=20)

Synthesizing feature 1 of 63: E18400...
Synthesizing feature 2 of 63: E20400...
Synthesizing feature 3 of 63: E03220...
Synthesizing feature 4 of 63: E09900...
Synthesizing feature 5 of 63: E07400...
Synthesizing feature 6 of 63: EIC...
Synthesizing feature 7 of 63: FDED...
Synthesizing feature 8 of 63: E02100...
Synthesizing feature 9 of 63: N24...
Synthesizing feature 10 of 63: F6251...
Synthesizing feature 11 of 63: E00900...
Synthesizing feature 12 of 63: E03240...
Synthesizing feature 13 of 63: E07600...
Synthesizing feature 14 of 63: E87521...
Synthesizing feature 15 of 63: E00650...
Synthesizing feature 16 of 63: P23250...
Synthesizing feature 17 of 63: E03290...
Synthesizing feature 18 of 63: E24518...
Synthesizing feature 19 of 63: E01100...
Synthesizing feature 20 of 63: E01700...
Synthesizing feature 21 of 63: P08000...
Synthesizing feature 22 of 63: E24515...
Synthesizing feature 23 of 63: E01400...
Synthesizing feature 24 of 63: E03300...
Synthesizing feature 25 of 63: E03

## Export

In [40]:
# Save the synthesis of the full data as CSV under the home Downloads folder.
synth_full.to_csv(path_or_buf='~/Downloads/puf_synth_full.csv')

In [41]:
# Save the synthesis of the 10% sample as CSV under the home Downloads folder.
synth.to_csv(path_or_buf='~/Downloads/puf_synth_10p_sample.csv')