In [1]:
# importing required libraries
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Notebook-wide defaults
# NOTE(review): this silences every warning category for the whole session,
# including deprecation notices from pandas/numpy.
warnings.simplefilter("ignore")
# Opt in to pandas copy-on-write semantics for all frames created below.
pd.options.mode.copy_on_write = True

In [2]:
# loading the data
# Build the path from the current user's home directory instead of a
# hardcoded "/Users/<name>/..." prefix so the notebook runs on other machines.
data_path = Path.home() / "Downloads" / "ipums_06.csv"
df = pd.read_csv(data_path)
# Fixed seed: the preview shows the same five rows on every Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,YEAR,SERIAL,STRATA,PSU,NHISHID,HHWEIGHT,URBRRL,PERNUM,NHISPID,HHX,...,SMOKESTATUS3,CIGSDAYCSFS,CIGSDAYCSFS2,CIGSLONGFS,MORTSTAT,MORTDODY,MORTUCOD,MORTUCODLD,MORTWT,MORTWTSA
1619096,2005,35356,5090,1,2005060490,1297.0,,2,20050604900102,60490.0,...,,,,0.0,9.0,9999.0,,96.0,0.0,0.0
121751,1991,837,3013,4,19911057078108,1878.0,,1,1991105707810801,,...,7.0,,,,2.0,9999.0,999.0,96.0,2972.0,
2238928,2012,47267,6107,2,2012071638,3640.0,,3,20120716380103,71638.0,...,,,,,2.0,9999.0,,96.0,4749.0,0.0
95209,1990,38568,3011,1,19904181057802,1890.0,,4,1990418105780204,,...,,96.0,96.0,,9.0,9999.0,999.0,96.0,0.0,
887104,1998,8971,5315,2,1998030326,2491.0,,1,19980303260101,30326.0,...,7.0,,,,9.0,9999.0,999.0,96.0,0.0,0.0


In [3]:
# printing the columns
# Convert the Index to a plain list first: pandas abbreviates the repr of a
# long Index, which can hide column names on a wide frame like this one.
print(df.columns.to_list())

Index(['YEAR', 'SERIAL', 'STRATA', 'PSU', 'NHISHID', 'HHWEIGHT', 'URBRRL',
       'PERNUM', 'NHISPID', 'HHX', 'FMX', 'PX', 'PERWEIGHT', 'SAMPWEIGHT',
       'LONGWEIGHT', 'PARTWEIGHT', 'FWEIGHT', 'SUPP2WT', 'ASTATFLG',
       'CSTATFLG', 'AGE', 'SEX', 'BIRTHYR', 'RACENEW', 'OCC', 'POORYN',
       'INCFAM97ON2', 'INCFAM07ON', 'FAMTOTINC', 'EARNIMP1', 'HINOTCOVE',
       'HICOVSTAT', 'HIOTHGOVYR', 'HINOLAPYMO', 'HINOLASTCOV', 'HINOUNEMPR',
       'HINOEMPR', 'HINOFAMR', 'HINOAGER', 'HINOCOSTR', 'HINOREFUSER',
       'HINOTHER', 'HINOWANT', 'HINOELIG', 'HINOCONF', 'HINOMEET', 'HINOWAIT',
       'HINOMISS', 'HISTOP1', 'HISTOP3', 'HISTOP4', 'HISTOP8', 'HISTOP11',
       'HISTOP12', 'HISTOP13', 'HISTOP14', 'HISTOP15', 'HISTOP16', 'HISTOP17',
       'HISTOP18', 'HISTOP19', 'HISTOP20', 'HISTOP22', 'HISTOP23', 'HISTOP24',
       'HISTOP25', 'HIBUYNOCOND', 'HIBUYNOCOST', 'HIBUYNODENY', 'HIBUYNOSORC',
       'HIBUYNOOTHR', 'HISTOP5A', 'CNLUNG', 'CNLUNGAG', 'CANHICHANGE',
       'SMOKEV', 'CIGDAYM

- HINOTCOVE (Health Insurance coverage status)
- HICOVSTAT (Health insurance coverage status)
- HIOTHGOVYR (Had other government program coverage in the past 12 months)
- HINOLAPYMO (Months without any health insurance, past 12 months, covered at time of survey)
- HINOLASTCOV (When last covered by health insurance)
- HINOUNEMPR (Reasons for no insurance: Unemployment)
- HINOEMPR (Reasons for no insurance: Employment-related reason)
- HINOFAMR (Reasons for no insurance: Family-related)
- HINOAGER (Reasons for no insurance: Aged out of family plan)
- HINOCOSTR (Reasons for no insurance: Too expensive)
- HINOREFUSER (Reasons for no insurance: Poor health/refused coverage)
- HINOTHER (Reasons for no insurance: Other)
- HINOWANT (Reasons for no insurance: Do not want or need coverage)
- HINOELIG (Reasons for no insurance: Not eligible)
- HINOCONF (Reasons for no insurance: Too difficult or confusing)
- HINOMEET (Reasons for no insurance: Plans don't meet needs)
- HINOWAIT (Reasons for no insurance: Coverage has not started yet)
- HINOMISS (Reasons for no insurance: Missed Deadline)

In [4]:
# Columns kept for the analysis, in three groups: the survey year, the
# coverage status/history variables, and the "reason for no insurance" flags.
required_columns = (
    ["YEAR"]
    + [
        "HINOTCOVE",
        "HICOVSTAT",
        "HIOTHGOVYR",
        "HINOLAPYMO",
        "HINOLASTCOV",
    ]
    + [
        "HINOUNEMPR",
        "HINOEMPR",
        "HINOFAMR",
        "HINOAGER",
        "HINOCOSTR",
        "HINOREFUSER",
        "HINOTHER",
        "HINOWANT",
        "HINOELIG",
        "HINOCONF",
        "HINOMEET",
        "HINOWAIT",
        "HINOMISS",
    ]
)

In [5]:
# Narrow the raw frame to the selected columns (all rows kept).
df_analysis = df.loc[:, required_columns]

In [6]:
# (rows, columns) of the column subset
df_analysis.shape

(2973180, 19)

In [7]:
# percentage of missing values in each column (nothing is dropped here)
# The mean of the boolean NaN mask is the missing fraction; scale to percent.
df_analysis.isna().mean() * 100

YEAR            0.000000
HINOTCOVE      25.554928
HICOVSTAT      86.824915
HIOTHGOVYR     73.815914
HINOLAPYMO     86.824915
HINOLASTCOV    86.824915
HINOUNEMPR     12.379842
HINOEMPR       17.472235
HINOFAMR       17.472235
HINOAGER       17.472235
HINOCOSTR      12.379842
HINOREFUSER    17.472235
HINOTHER       12.379842
HINOWANT       94.907607
HINOELIG       94.907607
HINOCONF       94.907607
HINOMEET       94.907607
HINOWAIT       94.907607
HINOMISS       94.907607
dtype: float64

In [8]:
# keep only the rows for people without health insurance (HINOTCOVE code 2)
df_analysis = df_analysis.loc[df_analysis["HINOTCOVE"].eq(2)]

In [9]:
# percentage of missing values per column after the HINOTCOVE filter
df_analysis.isna().mean() * 100

YEAR             0.000000
HINOTCOVE        0.000000
HICOVSTAT      100.000000
HIOTHGOVYR      70.270428
HINOLAPYMO     100.000000
HINOLASTCOV    100.000000
HINOUNEMPR       0.000000
HINOEMPR         3.396505
HINOFAMR         3.396505
HINOAGER         3.396505
HINOCOSTR        0.000000
HINOREFUSER      3.396505
HINOTHER         0.000000
HINOWANT        96.603495
HINOELIG        96.603495
HINOCONF        96.603495
HINOMEET        96.603495
HINOWAIT        96.603495
HINOMISS        96.603495
dtype: float64

In [None]:
# Survey year plus a subset of the "reason for no insurance" flags.
# NOTE(review): these look like the columns with little or no missing data in
# the filtered summary — confirm that is the intended selection rule.
level_2 = ["YEAR"] + [
    "HINOUNEMPR",
    "HINOEMPR",
    "HINOFAMR",
    "HINOAGER",
    "HINOCOSTR",
    "HINOREFUSER",
    "HINOTHER",
]