# Synthesizing the PUF with `synthimpute` 

In [1]:
import pandas as pd
import numpy as np
import synthimpute as si
from sklearn.model_selection import train_test_split

## Load data

The columns to synthesize are listed in https://github.com/donboyd5/synpuf/issues/4.

In [2]:
# Lower-case PUF column names to read and synthesize (upper-cased when loading).
COLS = [
    'dsi', 'e00200', 'e00300', 'e00400', 'e00600', 'e00650',
    'e00700', 'e00800', 'e00900', 'e01100', 'e01200', 'e01400',
    'e01500', 'e01700', 'e02000', 'e02100', 'e02300', 'e02400',
    'e03150', 'e03210', 'e03220', 'e03230', 'e03240', 'e03270',
    'e03290', 'e03300', 'e03400', 'e03500', 'e07240', 'e07260',
    'e07300', 'e07400', 'e07600', 'e09700', 'e09800', 'e09900',
    'e11200', 'e17500', 'e18400', 'e18500', 'e19200', 'e19800',
    'e20100', 'e20400', 'e24515', 'e24518', 'e26270', 'e27200',
    'e32800', 'e58990', 'e62900', 'e87521', 'e87530', 'eic',
    'f2441', 'f6251', 'fded', 'mars', 'midr', 'n24',
    'p08000', 'p22250', 'p23250', 's006',
    # Calculateds for this test.
    'e00100', 'e09600',
    'xtot',
]

In [3]:
# RECID is read in addition to COLS so the 4 aggregate records can be
# identified and excluded afterwards.
input_cols = [*map(str.upper, COLS), 'RECID']
raw = pd.read_csv('~/puf2011.csv', usecols=input_cols)

## Preprocess

Drop aggregates and `RECID`.

In [4]:
# RECID values of the aggregate records to exclude.
AGG_RECIDS = [999996, 999997, 999998, 999999]
is_aggregate = raw['RECID'].isin(AGG_RECIDS)
full = raw[~is_aggregate].drop(columns='RECID')

Replace `E00600` and `E01500` with their differences from `E00650` and `E01700`, respectively. Per [synpuf#17](https://github.com/donboyd5/synpuf/issues/17), Tax-Calculator requires `e00600` to be greater than or equal to `e00650` and `e01500` to be greater than or equal to `e01700`, so these differences must be nonnegative.

In [5]:
# Swap E00600 and E01500 for their differences from E00650 and E01700;
# the originals are rebuilt from these differences after synthesis.
full = (
    full
    .assign(
        e00600_minus_e00650=full['E00600'] - full['E00650'],
        e01500_minus_e01700=full['E01500'] - full['E01700'],
    )
    .drop(columns=['E00600', 'E01500'])
)

Convert MARS to dummies.

*Seed on MARS instead.*

In [6]:
# Disabled: MARS is kept as a single column and passed to rf_synth as a seed
# (see SEED_COLS), so it is not dummy-encoded here.
# full[['MARS2', 'MARS3', 'MARS4']] = pd.get_dummies(full.MARS, drop_first=True)
# full.drop('MARS', axis=1, inplace=True)

Draw a 20% sample and split it in half to get two 10% samples, one for training and one for testing.

In [7]:
# Draw 20% of the records, then halve that sample into train and test.
sample_20pct = full.sample(frac=0.2, random_state=0)
train, test = train_test_split(sample_20pct, test_size=0.5, random_state=0)

## Synthesize

In [8]:
CLASSIFICATION_COLS = ['F6251', 'MIDR', 'FDED', 'DSI']
# Classification columns are seeded too, until rf_synth handles classification.
SEED_COLS = ['MARS', 'E00100', 'E09600', 'XTOT', 'S006', *CLASSIFICATION_COLS]

What share of records is uniquely identified by its seed values?

In [32]:
# One row per distinct combination of seed values; after reset_index the
# group sizes sit in the column labelled 0.
seed_nrow = full.groupby(SEED_COLS).size().reset_index()
# Share of all records whose seed combination occurs exactly once.
seed_nrow[0].eq(1).sum() / len(full)

0.5936282710365965

In [9]:
%%time
# Synthesize from the 10% training sample.
synth = si.rf_synth(
    train,
    seed_cols=SEED_COLS,
    trees=50,
    random_state=0,
)

Synthesizing feature 1 of 58: E07240...
Synthesizing feature 2 of 58: E19800...
Synthesizing feature 3 of 58: E03240...
Synthesizing feature 4 of 58: E03500...
Synthesizing feature 5 of 58: e00600_minus_e00650...
Synthesizing feature 6 of 58: E03150...
Synthesizing feature 7 of 58: EIC...
Synthesizing feature 8 of 58: e01500_minus_e01700...
Synthesizing feature 9 of 58: E20100...
Synthesizing feature 10 of 58: E17500...
Synthesizing feature 11 of 58: E09700...
Synthesizing feature 12 of 58: E02400...
Synthesizing feature 13 of 58: F2441...
Synthesizing feature 14 of 58: E24518...
Synthesizing feature 15 of 58: P22250...
Synthesizing feature 16 of 58: E26270...
Synthesizing feature 17 of 58: E19200...
Synthesizing feature 18 of 58: E03270...
Synthesizing feature 19 of 58: E03300...
Synthesizing feature 20 of 58: E58990...
Synthesizing feature 21 of 58: E87530...
Synthesizing feature 22 of 58: E03230...
Synthesizing feature 23 of 58: E87521...
Synthesizing feature 24 of 58: E01100...
Syn

In [10]:
%%time
# Synthesize from the full set of records, with the same settings as the sample run.
synth_full = si.rf_synth(
    full,
    seed_cols=SEED_COLS,
    trees=50,
    random_state=0,
)

Synthesizing feature 1 of 58: E07240...
Synthesizing feature 2 of 58: E19800...
Synthesizing feature 3 of 58: E03240...
Synthesizing feature 4 of 58: E03500...
Synthesizing feature 5 of 58: e00600_minus_e00650...
Synthesizing feature 6 of 58: E03150...
Synthesizing feature 7 of 58: EIC...
Synthesizing feature 8 of 58: e01500_minus_e01700...
Synthesizing feature 9 of 58: E20100...
Synthesizing feature 10 of 58: E17500...
Synthesizing feature 11 of 58: E09700...
Synthesizing feature 12 of 58: E02400...
Synthesizing feature 13 of 58: F2441...
Synthesizing feature 14 of 58: E24518...
Synthesizing feature 15 of 58: P22250...
Synthesizing feature 16 of 58: E26270...
Synthesizing feature 17 of 58: E19200...
Synthesizing feature 18 of 58: E03270...
Synthesizing feature 19 of 58: E03300...
Synthesizing feature 20 of 58: E58990...
Synthesizing feature 21 of 58: E87530...
Synthesizing feature 22 of 58: E03230...
Synthesizing feature 23 of 58: E87521...
Synthesizing feature 24 of 58: E01100...
Syn

## Checks

These minimums should be nonnegative (probably 0).

In [11]:
synth['e00600_minus_e00650'].min()

0.0

In [12]:
synth['e01500_minus_e01700'].min()

0.0

In [13]:
synth_full['e00600_minus_e00650'].min()

0.0

In [14]:
synth_full['e01500_minus_e01700'].min()

0.0

## Postprocessing

In [15]:
def add_subtracted_features(df):
    """Rebuild E00600 and E01500 in place from their stored differences.

    Each `*_minus_*` difference column is added back onto the column that
    was subtracted from it (E00650, E01700), and the difference columns
    are then removed. `df` is modified in place; nothing is returned.
    """
    df['E00600'] = df['E00650'] + df['e00600_minus_e00650']
    df['E01500'] = df['E01700'] + df['e01500_minus_e01700']
    helper_cols = ['e00600_minus_e00650', 'e01500_minus_e01700']
    df.drop(columns=helper_cols, inplace=True)

In [16]:
# Restore E00600 / E01500 on both synthetic frames (each is modified in place).
for synthetic in (synth, synth_full):
    add_subtracted_features(synthetic)

In [17]:
# Number of records that violate E00600 >= E00650.
synth_full['E00600'].lt(synth_full['E00650']).sum()

0

In [18]:
# Number of records that violate E01500 >= E01700.
synth_full['E01500'].lt(synth_full['E01700']).sum()

0

Un-dummy MARS.

*Seeded on it instead.*

In [19]:
def undummy_MARS(df):
    """Collapse the MARS2/MARS3/MARS4 indicator columns into one MARS column, in place.

    A row gets 2 if MARS2 is truthy, otherwise 3 if MARS3 is truthy,
    otherwise 4 if MARS4 is truthy, and 1 if none are. The three indicator
    columns are then dropped. `df` is modified in place; nothing is returned.
    """
    # Apply the lowest-priority indicator first so later ones override it.
    status = np.where(df['MARS4'], 4, 1)
    status = np.where(df['MARS3'], 3, status)
    status = np.where(df['MARS2'], 2, status)
    df['MARS'] = status
    df.drop(columns=['MARS2', 'MARS3', 'MARS4'], inplace=True)

In [20]:
# Disabled: MARS was used as a seed column rather than dummy-encoded, so
# there are no MARS2/MARS3/MARS4 columns to collapse.
# undummy_MARS(synth)
# undummy_MARS(synth_full)

Round.

In [21]:
# Round every column of both synthetic frames to whole numbers.
synth, synth_full = synth.round(), synth_full.round()

## Export

In [22]:
def export_csv(df, f):
    """Write `df` to `~/Downloads/<f>.csv` without the index column.

    Args:
        df: Object with a `to_csv` method (a DataFrame in this notebook).
        f: File name without directory or the `.csv` extension.
    """
    destination = '~/Downloads/' + f + '.csv'
    df.to_csv(destination, index=False)

In [23]:
# NOTE(review): add_subtracted_features is only called on synth and synth_full,
# so train and test appear to be exported with the *_minus_* difference columns
# rather than E00600 / E01500 (unless rf_synth changed train in place) - confirm
# that this schema mismatch with the synthetic files is intended.
exports = [
    (synth_full, 'puf_synth_full'),
    (synth, 'puf_synth_10p_sample'),
    (train, 'puf_10p_sample_train'),
    (test, 'puf_10p_sample_test'),
]
for frame, stem in exports:
    export_csv(frame, stem)