# Data Import and Cleaning 

#### Dataset from here:
#### Column definition from here: https://www2.census.gov/programs-surveys/sipp/data/datasets/2008/l08puw1.sas

In [5]:
import numpy as np
import pandas as pd

In [4]:
input_dict_core_w1 = np.loadtxt('input_core_w1.txt', dtype = str)
input_dict_core_w1

array([['SSUSEQ', '1-5'],
       ['SSUID', '6-17'],
       ['SPANEL', '18-21'],
       ...,
       ['LGTKEY', '2328-2335'],
       ['LGTMON', '2336-2337'],
       ['FILLER', '2338-2340']], dtype='<U9')

In [40]:
column_lst_core_w1 = input_dict_core_w1[:, 0].tolist()

In [41]:
colspecs_core_w1 = []
for string in input_dict_core_w1[:, 1]:
    colspecs_core_w1.append(tuple(map(int, string.split('-'))))

colspecs_core_w1[0:4]

[(1, 5), (6, 17), (18, 21), (22, 23)]

# ToDo: 
- separetly import core and topical module wave 1 and wave 2. each with own column specifications
> - subselect columns needed into new df
> - drop original file since it is too large
- merge core and topical module
- Rowbind  wave 1 and wave 2
- compare to houshold dataframe and individual dataframe
- implement the cleaning steps from the other notebook

In [42]:
core_w1 = pd.read_fwf('/Users/maxweber/Desktop/DataMasterThesis/l08puw1.dat', names=column_lst_core_w1, colspecs=colspecs_core_w1) # this is core
print(core_w1.shape)
core_w1.head()

(421911, 1023)


Unnamed: 0,SSUSEQ,SSUID,SPANEL,SWAVE,SROTATON,SREFMON,RHCALMN,RHCALYR,SHHADID,GVARSTR,...,AHINOMTH,EHIEVRCV,AHIEVRCV,THICVYR,AHICVYR,EHICVMTH,AHICVMTH,LGTKEY,LGTMON,FILLER
0,1,19128000276,8,1,,,6,8,11,97,...,,1,,1,,1,,1001,1,0
1,1,19128000276,8,1,,,7,8,11,97,...,,1,,1,,1,,1001,2,0
2,1,19128000276,8,1,,,8,8,11,97,...,,1,,1,,1,,1001,3,0
3,1,19128000276,8,1,,,9,8,11,97,...,,1,,1,,1,,1001,4,0
4,1,19128000276,8,1,,,6,8,11,97,...,,1,,1,,1,,1002,1,0


In [79]:
# Subset dataframe to make handling easier. 
cols_join = ['SSUID', 'SPANEL', 'SWAVE', 'SREFMON', 'TFIPSST', 'EOUTCOME', 'SHHADID', 'RFID', 'RFID2', 'EENTAID','EPPPNUM', 'EPOPSTAT', 'EPPINTVW', 'ESEX', 'ERACE', 'EORIGIN',
            'WPFINWGT', 'ERRP', 'EMS', 'EPNMOM', 'EPNDAD', 'EPNGUARD', 'EPNSPOUS', 'RDESGPNT', 'TAGE', 'EEDUCATE']

cols_general = ['EHREFPER', 'RHCALMN', 'RHCALYR', 'TMOVRFLG', 'EHHNUMPP', 'EFSPOUSE', 'RFNKIDS', 'EBORNUS', 'ECITIZEN', 'RENROLL']

# make hist for all income categories and combine the small ones into a new 'other' or substract all interesting income streams from total and take the residual.
cols_income = ['TFEARN', 'TFTOTINC', 'TPEARN', 'TPTOTINC', 'THTRNINC', 'THOTHINC', 'THPRPINC', 'ETENURE', 'THPNDIST', 'THSOCSEC', 'THVETS', 'THLUMPSM', 'THAFDC']

core_w1_subset = core_w1[cols_join + cols_general + cols_income]
core_w1_subset.describe()
del core_w1



In [84]:
# Export as csv
core_w1_subset.to_csv('/Users/maxweber/Desktop/MasterThesis/core_w1_subset.csv')

In [85]:
core_w1_subset[['SSUID', 'EPPPNUM', 'EHREFPER']].describe()
len(core_w1_subset.columns)

48

In [87]:
# sas input statement needs to be cleaned by hand: remove all whitespaces and $-signs e.g. replace ' -' with '-'
# columns that should be kept, need to be defined by hand as well
def import_sipp_data(sas_input_statement, dat_file, columns_keep):
    input_dict = np.loadtxt(sas_input_statement, dtype = str)
    column_lst = input_dict[:, 0].tolist()
    
    colspecs = []
    for string in input_dict[:, 1]:
        colspecs.append(tuple(map(int, string.split('-'))))
    
    df_raw = pd.read_fwf(dat_file, names=column_lst, colspecs=colspecs)

    df = df_raw[columns_keep]
    del df_raw
    return df

In [9]:
# columns core dataset
cols_join = ['SSUID', 'SPANEL', 'SWAVE', 'TFIPSST', 'EOUTCOME', 'SHHADID', 'RFID', 'RFID2', 'EENTAID','EPPPNUM', 'EPOPSTAT', 'EPPINTVW', 'ESEX', 
            'ERACE', 'EORIGIN', 'WPFINWGT', 'ERRP', 'EMS', 'EPNMOM', 'EPNDAD', 'EPNGUARD', 'EPNSPOUS', 'RDESGPNT', 'TAGE', 'EEDUCATE']

cols_general = ['SREFMON', 'EHREFPER', 'RHCALMN', 'RHCALYR', 'TMOVRFLG', 'EHHNUMPP', 'EFSPOUSE', 'RFNKIDS', 'EBORNUS', 'ECITIZEN', 'RENROLL']

# make hist for all income categories and combine the small ones into a new 'other' or substract all interesting income streams from total and take the residual.
cols_income = ['TFEARN', 'TFTOTINC', 'TPEARN', 'TPTOTINC', 'THTRNINC', 'THOTHINC', 'ETENURE', 'THPNDIST', 'THSOCSEC', 'THVETS', 'THLUMPSM', 'THAFDC', 
                'EAST1C', 'THPRPINC', 'TINTINC', 'TDIVINC', 'EAST3B', 'EAST3C', 'EAST3D', 'EAST3E', 'EAST4A' ]
# moonlit
columns_labor = ['EPAYHR1', 'EPAYHR2', 'TPYRATE1', 'TPYRATE2', 'EJBHRS1', 'EJBHRS2', 'EUNION1', 'EUNION2', 'TMLMSUM', 'EDISABL', 'EDISPREV', 
                'EAWOP', 'EABRE', 'ERSNOWRK', 'EPTRESN', 'EBUSCNTR', 'EJOBCNTR', 'RMWKSAB', 'EPDJBTHN', 'ELKWRK', 'ELAYOFF', 'EHRSALL', 'RMHRSWK', 'EPTWRK', 
                'RMESR', 'RMWKWJB', 'RWKSPERM', 'EBNO1', 'EBNO2']

columns_core = cols_join + cols_general + cols_income + columns_labor
print('length core columns: ', len(columns_core))

# columns topical dataset
cols_join

cols_rebate = ['EREBATE', 'ERBAMTH', 'ERBATAMT', 'ERBATTYP', 'EREBATOC']

columns_topical = cols_join + cols_rebate
print('length topical columns: ', len(columns_topical))



length core columns:  78
length topical columns:  30


In [None]:
# 