# Identical records in the PUF

Look at both the raw PUF and the taxdata version.

## Setup

In [1]:
import pandas as pd

## Load

`pufr` is the raw PUF and `puf` is the taxdata version. Limit `pufr` to the columns used by taxdata, excluding the weight `s006` (which could be summed across identical records).

In [2]:
# Load the taxdata PUF without its `RECID` and `s006` columns.
puf = pd.read_csv('~/puf.csv').drop(columns=['RECID', 's006'])

In [3]:
# Columns read from the raw PUF, in alphabetical order.
# 'S006' is deliberately left out: the weight is not needed.
COLS = [
    'DSI',
    'E00200', 'E00300', 'E00400', 'E00600', 'E00650', 'E00700', 'E00800',
    'E00900', 'E01100', 'E01200', 'E01400', 'E01500', 'E01700', 'E02000',
    'E02100', 'E02300', 'E02400', 'E03150', 'E03210', 'E03220', 'E03230',
    'E03240', 'E03270', 'E03290', 'E03300', 'E03400', 'E03500', 'E07240',
    'E07260', 'E07300', 'E07400', 'E07600', 'E09700', 'E09800', 'E09900',
    'E11200', 'E17500', 'E18400', 'E18500', 'E19200', 'E19800', 'E20100',
    'E20400', 'E24515', 'E24518', 'E26270', 'E27200', 'E32800', 'E58990',
    'E62900', 'E87521', 'E87530',
    'EIC', 'F2441', 'F6251', 'FDED', 'MARS', 'MIDR', 'N24',
    'P08000', 'P22250', 'P23250',
    'XTOT',
]

In [4]:
# Read only the columns listed in COLS from the raw PUF file.
pufr = pd.read_csv('~/puf2011.csv', usecols=COLS)

## Raw PUF

Limit to columns needed for taxdata.

In [5]:
def count_per_unique(df):
    """Count how many rows of `df` share each distinct combination of values.

    Args:
        df: DataFrame whose rows are compared across all of its columns.

    Returns:
        DataFrame with one row per distinct combination of values found in
        `df` (the original columns) plus a `records` column holding the
        number of rows of `df` that have that combination.

    Note:
        Rows are grouped with `groupby`, which by default leaves out rows
        that have a missing value in any grouping column.
    """
    # Name the count column directly via `reset_index(name=...)` instead of
    # renaming column `0`: `groupby(..., as_index=False).size()` returns a
    # DataFrame with a `size` column in pandas >= 1.1, so the rename was a
    # no-op there and no `records` column was produced.
    return df.groupby(df.columns.tolist()).size().reset_index(name='records')

In [6]:
# One row per distinct raw-PUF record, with the number of times it occurs.
pufru = count_per_unique(pufr)

In [7]:
# Rows in the raw PUF beyond the first occurrence of each distinct record.
len(pufr) - len(pufru)

15043

In [8]:
# Number of distinct records (values) for each occurrence count (index).
pufru.groupby('records').size()

records
1      144259
2        2069
3         905
4         494
5         281
6         170
7          91
8          71
9          42
10         30
11         33
12         25
13         30
14         22
15         25
16         17
17         17
18         22
19         16
20         10
21         19
22         13
23         12
24         10
25          5
26         10
27         10
28          7
29          9
30          4
31          2
32          3
33          2
34          1
35          3
36          3
37          3
47          1
131         1
dtype: int64

## taxdata PUF

Far fewer duplicated records than in the raw PUF, since the taxdata version involves imputation.

In [9]:
# One row per distinct taxdata-PUF record, with the number of times it occurs.
pufu = count_per_unique(puf)

In [10]:
# Rows in the taxdata PUF beyond the first occurrence of each distinct record.
len(puf) - len(pufu)

40

In [11]:
# Number of distinct records (values) for each occurrence count (index).
pufu.groupby('records').size()

records
1    248511
2        40
dtype: int64