# Random forests imputation

Impute values by drawing a random realization (the prediction of one randomly chosen tree) from a random forests model trained on CPS data.

For this example, use Schedule C (`e00900`), since it can be positive, zero, or negative, like Schedule E in https://github.com/open-source-economics/taxdata/issues/221.

## Setup

### Imports

In [43]:
import taxcalc as tc
import pandas as pd
import numpy as np
from sklearn import ensemble
from sklearn import linear_model
from sklearn import model_selection
import os

In [2]:
# Display the installed taxcalc version.
tc.__version__

'0.20.1'

## Data

Get raw CPS records.

In [34]:
# Locate the gzipped CPS file under the taxcalc Records path and load it.
data = os.path.join(tc.Records.CUR_PATH, 'cps.csv.gz')
# The default would infer gzip from the '.gz' suffix; state it explicitly.
df = pd.read_csv(data, compression='gzip')

Remove `e00900p` and `e00900s`, the taxpayer and spouse Schedule C components.

In [35]:
# Drop the taxpayer and spouse Schedule C components in place.
# NOTE(review): presumably these sum to e00900 and would leak the target.
df.drop(['e00900p', 'e00900s'], axis='columns', inplace=True)

In [36]:
# Summary statistics, transposed so that each variable is a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age_head,456465.0,49.614030,1.715312e+01,0.0,36.0,50.0,63.0,8.500000e+01
age_spouse,456465.0,27.416347,2.770706e+01,0.0,0.0,30.0,52.0,8.500000e+01
e00200p,456465.0,47805.744890,1.266239e+05,0.0,0.0,13511.0,50764.0,4.339028e+06
e02100p,456465.0,15337.615933,1.777691e+05,-19998.0,0.0,0.0,0.0,1.157928e+07
e00200s,456465.0,26644.054000,8.937828e+04,0.0,0.0,0.0,20537.0,4.526829e+06
e02100s,456465.0,6296.968872,1.052554e+05,-19998.0,0.0,0.0,0.0,9.677610e+06
a_lineno,456465.0,1.370892,9.262111e-01,1.0,1.0,1.0,1.0,1.600000e+01
e00600,456465.0,8724.914673,1.594430e+06,0.0,0.0,0.0,73.0,1.075466e+09
e00800,456465.0,130.100146,1.497370e+04,0.0,0.0,0.0,0.0,5.440465e+06
e01500,456465.0,11824.040174,3.894361e+04,0.0,0.0,0.0,0.0,1.447936e+06


## Model

Train a random forests model.

In [37]:
# Name of the column to impute (the model's target).
YCOL = 'e00900'

In [39]:
# Split into train and test sets: every column except YCOL is a feature and
# YCOL is the target. The seed is fixed so the split is reproducible.
# NOTE(review): the features include columns that look like record
# identifiers (e.g. RECID, h_seq) -- confirm they are meant to be predictors.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    df.drop(YCOL, axis='columns'),
    df[YCOL],
    random_state=3,
)

In [None]:
# Number of trees in the forest; reduce for faster runtime.
N_ESTIMATORS = 100
rf = ensemble.RandomForestRegressor(
    n_estimators=N_ESTIMATORS,
    min_samples_leaf=1,
    random_state=3,
    n_jobs=-1,  # Use maximum number of cores.
    verbose=True,
)
rf.fit(X_train, Y_train)

### Model description

In [9]:
# Label each of the forest's feature importances with its column name.
feature_importance = pd.Series(
    data=rf.feature_importances_,
    index=X_train.columns,
)

In [10]:
# Show features from most to least important.
feature_importance.sort_values(ascending=False)

e00300               0.086969
e00650               0.070918
e18400               0.048008
e00200s              0.045677
e20400               0.042812
h_seq                0.041276
e18500               0.038607
RECID                0.036186
e02100               0.035376
e03300               0.034689
s006                 0.033739
e19200               0.033435
age_spouse           0.032961
e00600               0.032448
e20100               0.031810
agi_bin              0.029116
e00200               0.028967
e19800               0.025905
e01100               0.023834
e03270               0.023805
e00200p              0.018671
age_head             0.018213
e02100p              0.017849
fips                 0.017134
e02400               0.015334
FLPDYR               0.014724
e03240               0.010979
e32800               0.010534
mcare_ben            0.010030
e00400               0.009532
                       ...   
e17500               0.005846
e02100s              0.004229
n1820     

## Predict

### Top-line (average)

In [21]:
# Compare the forest's prediction with the actual test-set values.
pred = pd.DataFrame({
    'actual': Y_test,
    'pred': rf.predict(X_test),
})
pred['error'] = pred['pred'] - pred['actual']
# np.sign maps each value to -1, 0, or 1.
pred['actual_sign'] = np.sign(pred['actual'])
pred['pred_sign'] = np.sign(pred['pred'])
pred['correct_sign'] = pred['actual_sign'].eq(pred['pred_sign'])
# Column of ones, so that summing it in a pivot table yields record counts.
pred['count'] = 1

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    2.2s finished


MAE, MSE (in millions), and counts of negative/zero/positive predictions by actual sign.

In [12]:
# Mean absolute error.
pred['error'].abs().mean()

5282.16166828781

In [13]:
# Mean squared error, scaled down by one million for readability.
(pred['error'] ** 2).mean() / 1e6

4081.530563470306

In [14]:
# Cross-tabulate actual sign (rows) against predicted sign (columns).
# Summing the all-ones `count` column gives the number of records per cell;
# margins=True adds the 'All' totals.
# The aggregation is given as the string 'sum' rather than the builtin `sum`,
# because recent pandas deprecates passing builtin callables as aggregators.
pred.pivot_table(index='actual_sign', columns='pred_sign', values='count',
                 aggfunc='sum', margins=True)

pred_sign,-1.0,0.0,1.0,All
actual_sign,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,155,28,212,395
0,2487,51180,43159,96826
1,71,188,16637,16896
All,2713,51396,60008,114117


In [22]:
# Share of test records whose predicted sign matches the actual sign.
pred['correct_sign'].mean()

0.5956343051429673

### All trees

In [15]:
# Predict with every tree separately, then transpose so that there is
# one row per record and one column per tree.
preds = np.array([tree.predict(X_test) for tree in rf.estimators_]).T

In [16]:
# For each record, keep the prediction of one tree drawn uniformly at random.
# `preds` has one row per record and one column per tree, so the number of
# trees is read from its shape rather than from a separate constant.
n_records, n_trees = preds.shape
# A seeded generator makes the draw reproducible; the seed matches the
# random_state=3 used for the split and the forest.
rng = np.random.RandomState(3)
tree_idx = rng.randint(n_trees, size=n_records)
# Integer-array indexing picks one column per row in one vectorized step,
# instead of one Python-level randint call per record.
random_tree = preds[np.arange(n_records), tree_idx]

In [23]:
# Compare the randomly chosen tree's prediction with the actual test values.
pred_random_tree = pd.DataFrame({
    'actual': Y_test,
    'pred': random_tree,
})
pred_random_tree['error'] = (
    pred_random_tree['pred'] - pred_random_tree['actual'])
# np.sign maps each value to -1, 0, or 1.
pred_random_tree['actual_sign'] = np.sign(pred_random_tree['actual'])
pred_random_tree['pred_sign'] = np.sign(pred_random_tree['pred'])
pred_random_tree['correct_sign'] = pred_random_tree['actual_sign'].eq(
    pred_random_tree['pred_sign'])
# Column of ones, so that summing it in a pivot table yields record counts.
pred_random_tree['count'] = 1

As expected, MAE and MSE are higher than for the point estimates (the average across all trees).

In [18]:
# Mean absolute error of the random-tree predictions.
pred_random_tree['error'].abs().mean()

6334.647309340414

In [19]:
# Mean squared error of the random-tree predictions, scaled down by one million.
(pred_random_tree['error'] ** 2).mean() / 1e6

7227.462319797147

But the distribution of signs is closer to the actual one, since a single tree's prediction does not average the zeros away.

In [20]:
# Cross-tabulate actual sign (rows) against the random tree's predicted sign
# (columns). Summing the all-ones `count` column gives the number of records
# per cell; margins=True adds the 'All' totals.
# The aggregation is given as the string 'sum' rather than the builtin `sum`,
# because recent pandas deprecates passing builtin callables as aggregators.
pred_random_tree.pivot_table(index='actual_sign', columns='pred_sign',
                             values='count', aggfunc='sum', margins=True)

pred_sign,-1.0,0.0,1.0,All
actual_sign,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,139,199,57,395
0,206,93374,3246,96826
1,57,2769,14070,16896
All,402,96342,17373,114117


In [24]:
# Share of test records whose random-tree sign matches the actual sign.
pred_random_tree['correct_sign'].mean()

0.9427429743158338

## Linear models

In [41]:
# Classification targets: the sign (-1, 0, or 1) of the target variable.
Y_train_sign, Y_test_sign = np.sign(Y_train), np.sign(Y_test)

In [45]:
# Multinomial logistic regression predicting the sign of the target.
# NOTE(review): features are passed unscaled and max_iter is left at its
# default -- confirm that the solver converged.
mult = linear_model.LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    random_state=3,
)
mult.fit(X_train, Y_train_sign)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

In [46]:
# Show the fitted coefficients of the logistic regression.
mult.coef_

array([[ 7.14436501e-05,  4.88779252e-04,  3.78403616e-07,
         1.04969478e-07,  4.64435790e-07,  2.24919574e-07,
        -1.46228008e-05,  1.43243228e-06, -1.05979430e-04,
         7.24330516e-06,  5.48095396e-05,  7.09760389e-04,
        -1.27695854e-06, -1.34832999e-06, -3.74490870e-06,
         3.12513753e-05, -4.34570265e-07, -1.23595906e-07,
        -1.45616398e-06, -2.62515636e-06,  1.18699196e-05,
        -8.51232511e-05,  8.43461692e-06,  6.00588293e-05,
        -1.16000306e-05, -8.27837491e-06,  4.58340784e-06,
        -9.13262234e-05,  7.56700644e-05, -5.86252173e-05,
         7.78859925e-06,  1.86249656e-06, -1.67176030e-03,
         4.89107064e-06,  1.61362422e-07, -7.76865842e-08,
         2.41254156e-06,  2.71015014e-05, -2.50173110e-05,
         1.29238775e-04, -1.00853235e-05,  7.05832616e-05,
         3.92428360e-06,  1.17941556e-06, -2.43542898e-07,
         2.23238749e-06, -9.94623060e-07, -1.27153300e-06,
         7.38394297e-06, -1.89348151e-07, -1.30387525e-0

### Predict

In [51]:
# Compare the logistic regression's predicted sign with the actual sign.
mult_pred = pd.DataFrame({
    'actual': Y_test_sign,
    'pred': mult.predict(X_test),
})
mult_pred['sign_correct'] = mult_pred['actual'].eq(mult_pred['pred'])
# Share of test records whose sign is predicted correctly.
mult_pred['sign_correct'].mean()

0.8783178667507908