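Finds, for each variable, the categories present in the source (NHIS) and target (PUMS) data once only cells (distinct combinations of variable values) shared by at least a specified number of individuals are kept.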

In [1]:
import os
import re
import sys
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from ipfn import ipfn
from collections import Counter, defaultdict

from src import plots, pdf, coding_pums as CODING, models

In [2]:
# folder holding both recoded input files (source and target)
DATA_DIR = 'data'

# file name of the recoded NHIS data
NHIS_RECODED_FILE_NAME = 'nhis_2018_recoded.csv'

# file name of the recoded PUMS data covering all 50 states and DC
PUMS_RECODED_FILE_NAME = 'pums_usa_2020_recoded.csv'

In [3]:
# path of the recoded PUMS file, which serves as the target data
target_file = os.path.join(DATA_DIR, PUMS_RECODED_FILE_NAME)
print('Loading target file "{0}"...'.format(target_file))

# read the file, discard the columns that are not needed here, and
# arrange the remaining columns alphabetically by name
pums_usa_df = (
    pd.read_csv(target_file)
    .drop(columns=['ST', 'SERIALNO', 'SPORDER', 'PWGTP'])
    .sort_index(axis=1)
)
pums_usa_df

Loading target file "data/pums_usa_2020_recoded.csv"...


Unnamed: 0,Age,Education,Income,Insurance,RaceEth,Sex
0,7,18,3,0,0,1
1,6,18,2,0,0,1
2,2,16,0,0,0,0
3,2,13,0,1,0,0
4,1,15,0,0,3,1
...,...,...,...,...,...,...
2641049,0,21,4,0,0,0
2641050,3,18,1,0,0,1
2641051,3,16,1,0,0,0
2641052,2,14,2,0,0,0


In [4]:
# path of the recoded NHIS file, which serves as the source data
source_file = os.path.join(DATA_DIR, NHIS_RECODED_FILE_NAME)

# read the file, discard the column that is not needed here, and
# arrange the remaining columns alphabetically by name
source_df = (
    pd.read_csv(source_file)
    .drop(columns=['WTFA'])
    .sort_index(axis=1)
)
source_df

Unnamed: 0,Age,Education,Income,Insurance,RaceEth,Sex
0,7,9,4,0,0,1
1,5,13,4,0,0,1
2,4,15,1,0,0,0
3,0,10,4,0,0,0
4,3,20,3,0,0,0
...,...,...,...,...,...,...
72068,4,5,4,1,1,1
72069,3,17,0,0,0,1
72070,5,17,2,0,0,1
72071,6,17,2,0,0,0


In [5]:
# turn every PUMS row into a plain tuple of its column values
target_tuples = [row for row in pums_usa_df.itertuples(index=False, name=None)]
# tally how many rows share each distinct tuple
target_ctr = Counter(target_tuples)
# (count, tuple) pairs, arranged by ascending count; the sort is stable, so
# tuples with equal counts keep the order in which they were first seen
target_data = [(n_rows, combo) for combo, n_rows in target_ctr.items()]
target_data.sort(key=lambda pair: pair[0])

In [6]:
# turn every NHIS row into a plain tuple of its column values
source_tuples = [row for row in source_df.itertuples(index=False, name=None)]
# tally how many rows share each distinct tuple
source_ctr = Counter(source_tuples)
# (count, tuple) pairs, arranged by ascending count; the sort is stable, so
# tuples with equal counts keep the order in which they were first seen
source_data = [(n_rows, combo) for combo, n_rows in source_ctr.items()]
source_data.sort(key=lambda pair: pair[0])

In [7]:
# width of the longest column name, used to right-align the printout below
maxlen = max(len(name) for name in source_df.columns)

# bin count per variable, in column order: the largest value found in either
# data set, plus one
# (presumably the values are integer category codes starting at 0 -- verify
# against the recoding step)
BIN_COUNTS = [
    max(np.max(source_df[name].values), np.max(pums_usa_df[name].values)) + 1
    for name in source_df.columns
]

for name, bins in zip(source_df.columns, BIN_COUNTS):
    print('{0:>{2}} : {1}'.format(name, bins, maxlen))

      Age : 9
Education : 22
   Income : 5
Insurance : 2
  RaceEth : 6
      Sex : 2


In [8]:
# largest minimum cell count to report on; the loop below covers 1..MAX_MIN_COUNT
MAX_MIN_COUNT = 10


def _occurring_values(cells, num_vars):
    """Collect, per variable position, the set of values seen in `cells`.

    Parameters
    ----------
    cells : iterable of (count, tuple) pairs
        Each tuple holds one value per variable, in column order.
    num_vars : int
        Number of variables (length of each tuple).

    Returns
    -------
    list of set
        Element `q` is the set of values that occur at position `q`.
    """
    seen = [set() for _ in range(num_vars)]
    for _count, tup in cells:
        for pos, val in enumerate(tup):
            seen[pos].add(val)
    return seen


def _category_row(present, num_bins):
    """Render one aligned row of category labels.

    Each category index 0..num_bins-1 owns a slot exactly as wide as its own
    decimal label. Slots of categories in `present` show the label; all other
    slots stay blank. Because slot widths depend only on the index, a source
    row and a target row rendered with the same `num_bins` line up column for
    column, whatever the number of digits in the labels.

    Parameters
    ----------
    present : iterable of int
        Category values that occur; each is used as an index into the slots.
    num_bins : int
        Total number of categories for the variable.

    Returns
    -------
    str
        The slots joined by single spaces.
    """
    slots = [' ' * len(str(idx)) for idx in range(num_bins)]
    for cat in present:
        slots[cat] = str(cat)
    return ' '.join(slots)


for min_count in range(1, MAX_MIN_COUNT + 1):
    # keep only the (count, tuple) pairs whose count is >= min_count
    sources = [cell for cell in source_data if cell[0] >= min_count]
    targets = [cell for cell in target_data if cell[0] >= min_count]
    print('\nMin cell count: {0}, num_sources: {1}, num_targets: {2}'.
          format(min_count, len(sources), len(targets)))

    # sets of occurring categorical values for each variable, built from
    # the surviving tuples
    source_sets = _occurring_values(sources, len(source_df.columns))
    target_sets = _occurring_values(targets, len(pums_usa_df.columns))

    # print the surviving categorical values of source and target, one
    # aligned pair of rows per variable
    for q, col in enumerate(source_df.columns):
        print('{0}: '.format(col))

        # number of categories for the variable at index q
        n = BIN_COUNTS[q]

        print('\tSource : {0}'.format(_category_row(source_sets[q], n)))
        print('\tTarget : {0}'.format(_category_row(target_sets[q], n)))


Min cell count: 1, num_sources: 5427, num_targets: 10875
Age: 
	Source : 0 1 2 3 4 5 6 7 8
	Target : 0 1 2 3 4 5 6 7 8
Education: 
	Source : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
	Target : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
Income: 
	Source : 0 1 2 3 4
	Target : 0 1 2 3 4
Insurance: 
	Source : 0 1
	Target : 0 1
RaceEth: 
	Source : 0 1 2 3 4 5
	Target : 0 1 2 3 4 5
Sex: 
	Source : 0 1
	Target : 0 1

Min cell count: 2, num_sources: 3621, num_targets: 9223
Age: 
	Source : 0 1 2 3 4 5 6 7 8
	Target : 0 1 2 3 4 5 6 7 8
Education: 
	Source : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
	Target : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
Income: 
	Source : 0 1 2 3 4
	Target : 0 1 2 3 4
Insurance: 
	Source : 0 1
	Target : 0 1
RaceEth: 
	Source : 0 1 2 3 4 5
	Target : 0 1 2 3 4 5
Sex: 
	Source : 0 1
	Target : 0 1

Min cell count: 3, num_sources: 2816, num_targets: 8408
Age: 
	Source : 0 1 2 3 4 5 6 7 8
	Target : 0 1 2 3 4 5 6 7 8
