# Evaluate CPS age imputation

This notebook evaluates the imputation using a holdout set of the CPS and quantile loss.

In [1]:
import synthimpute as si
import pandas as pd
import numpy as np
from sklearn import model_selection
from statsmodels.stats.weightstats import DescrStatsW

## Load data

In [2]:
# Predictor columns: read from the CPS file and passed to the imputation
# model as its feature set.
XCOLS = [
    'e00200',
    'e02100',
    'e01100',
    'e01400',
    'MARS',
    'EIC',
    'f2441',
    'n24',
    'DSI',
    'elderly_dependents',
]

In [3]:
# Weight column: supplies the training sample weights and the weights used
# for the evaluation statistics.
W = 's006'
# Age columns loaded from the CPS file; age_head is the one imputed and
# evaluated in the cells below.
AGES = ['age_head', 'age_spouse']

In [4]:
# Load only the predictor, age, and weight columns from the compressed CPS
# file. NOTE(review): the path is an absolute, machine-specific location.
cps_path = '/home/mghenis/PSLmodels/Tax-Calculator/taxcalc/cps.csv.gz'
cps = pd.read_csv(cps_path, usecols=XCOLS + AGES + [W])

## Preprocess

In [5]:
# Hold out 30% of the records for evaluation; the fixed random_state keeps
# the split the same from run to run.
train, test = model_selection.train_test_split(
    cps,
    test_size=0.3,
    random_state=0,
)
# Work on an explicit copy of the holdout frame so that adding a column to
# it later does not raise SettingWithCopyWarning.
test = test.copy()

## Build model

Takes 2-3 minutes to run.

In [6]:
# Impute head-of-unit age for the holdout records: the model is fit on the
# training predictors and ages (weighted by the training weights) and then
# applied to the holdout predictors. The result is stored next to the true
# age so the two can be compared.
# NOTE(review): presumably a random forest, judging by the function name --
# confirm against the synthimpute documentation.
test['age_head_imp'] = si.rf_impute(
    train[XCOLS],
    train['age_head'],
    test[XCOLS],
    sample_weight_train=train[W],
)

## Evaluate imputation

In [7]:
# Weighted descriptive statistics over the holdout set, using the same
# weights for both series: first the true ages, then the imputed ages.
# ddof=0 is passed explicitly to both.
weighted_stats = DescrStatsW(test['age_head'], weights=test[W], ddof=0)
weighted_stats_imp = DescrStatsW(test['age_head_imp'], weights=test[W], ddof=0)

In [8]:
# Weighted mean age on the holdout set: (imputed, actual).
(weighted_stats_imp.mean, weighted_stats.mean)

(47.2280961943543, 46.89814000000572)

In [9]:
# Weighted standard deviation of age on the holdout set: (imputed, actual).
(weighted_stats_imp.std, weighted_stats.std)

(15.177525706139582, 18.38715285437945)

In [10]:
# Weighted sum of squares on the holdout set: (imputed, actual), divided by
# 1e12 to keep the displayed numbers readable.
(weighted_stats_imp.sumsquares / 1e12, weighted_stats.sumsquares / 1e12)

(1.1278614789490053, 1.655323126499314)