# Data Import and Cleaning 

#### Dataset from here:
#### Column definition from here: https://www2.census.gov/programs-surveys/sipp/data/datasets/2008/l08puw1.sas

In [5]:
import numpy as np
import pandas as pd

In [14]:
# sas input statement needs to be cleaned by hand: remove all whitespaces and $-signs e.g. replace ' -' with '-'
# columns that should be kept, need to be defined by hand as well
def import_sipp_data(sas_input_statement, dat_file, columns_keep):
    input_dict = np.loadtxt(sas_input_statement, dtype = str)
    column_lst = input_dict[:, 0].tolist()
    
    colspecs = []
    for string in input_dict[:, 1]:
        colspecs.append(tuple(map(int, string.split('-'))))
    
    df_raw = pd.read_fwf(dat_file, names=column_lst, colspecs=colspecs)

    df = df_raw[columns_keep]
    del df_raw
    return df

In [22]:
# columns core dataset
clmn_join = ['SSUID', 'SPANEL', 'SWAVE', 'TFIPSST', 'EOUTCOME', 'SHHADID', 'RFID', 'RFID2', 'EENTAID','EPPPNUM', 'EPOPSTAT', 'EPPINTVW', 'ESEX', 
            'ERACE', 'EORIGIN', 'WPFINWGT', 'ERRP', 'EMS', 'EPNMOM', 'EPNDAD', 'EPNGUARD', 'EPNSPOUS', 'RDESGPNT', 'TAGE', 'EEDUCATE']

clmn_general = ['SREFMON', 'EHREFPER', 'RHCALMN', 'RHCALYR', 'TMOVRFLG', 'EHHNUMPP', 'EFSPOUSE', 'RFNKIDS', 'EBORNUS', 'ECITIZEN', 'RENROLL']

# make hist for all income categories and combine the small ones into a new 'other' or substract all interesting income streams from total and take the residual.
clmn_income = ['TFEARN', 'TFTOTINC', 'TPEARN', 'TPTOTINC', 'THTRNINC', 'THOTHINC', 'ETENURE', 'THPNDIST', 'THSOCSEC', 'THVETS', 'THLUMPSM', 'THAFDC', 
                'EAST1C', 'THPRPINC', 'TINTINC', 'TDIVINC', 'EAST3B', 'EAST3C', 'EAST3D', 'EAST3E', 'EAST4A' ]
# moonlit
clmn_labor = ['EPAYHR1', 'EPAYHR2', 'TPYRATE1', 'TPYRATE2', 'EJBHRS1', 'EJBHRS2', 'EUNION1', 'EUNION2', 'TMLMSUM', 'EDISABL', 'EDISPREV', 
                'EAWOP', 'EABRE', 'ERSNOWRK', 'EPTRESN', 'EBUSCNTR', 'EJOBCNTR', 'RMWKSAB', 'EPDJBTHN', 'ELKWRK', 'ELAYOFF', 'EHRSALL', 'RMHRSWK', 'EPTWRK', 
                'RMESR', 'RMWKWJB', 'RWKSPERM', 'EBNO1', 'EBNO2']

columns_core = clmn_join + clmn_general + clmn_income + clmn_labor
print('length core columns: ', len(columns_core))

# columns topical dataset
clmn_join

clmn_rebate = ['EREBATE', 'ERBAMTH', 'ERBATAMT', 'ERBATTYP', 'EREBATOC']

columns_topical = clmn_join + clmn_rebate
print('length topical columns: ', len(columns_topical))

length core columns:  86
length topical columns:  30


### Import SIPP data

In [18]:
w1_core = import_sipp_data('input_core.txt', '/Users/maxweber/Desktop/DataMasterThesis/l08puw1.dat', columns_core)
print('w1 core imported')
w1_core.to_csv('/Users/maxweber/Desktop/DataMasterThesis/core_w1.csv') 
print('w1 core exported to csv')

w2_core = import_sipp_data('input_core.txt', '/Users/maxweber/Desktop/DataMasterThesis/l08puw2.dat', columns_core)
print('w2 core imported')
w2_core.to_csv('/Users/maxweber/Desktop/DataMasterThesis/core_w2.csv') 
print('w2 core exported to csv')

w1 core imported
w1 core exported to csv
w2 core imported
w2 core exported to csv


In [16]:
w1_topical = import_sipp_data('input_topical.txt', '/Users/maxweber/Desktop/DataMasterThesis/p08putm1.dat', columns_topical)
print('w1 topical imported')
w1_topical.to_csv('/Users/maxweber/Desktop/DataMasterThesis/topical_w1.csv') 
print('w1 topical exported to csv')

w2_topical = import_sipp_data('input_topical.txt', '/Users/maxweber/Desktop/DataMasterThesis/p08putm2.dat', columns_topical)
print('w2 topical imported')
w2_topical.to_csv('/Users/maxweber/Desktop/DataMasterThesis/topical_w2.csv') 
print('w2 topical exported to csv')

w1 topical imported
w1 topical exported to csv
w2 topical imported
w2 topical exported to csv


# ToDo: 

- merge core and topical module
- Rowbind  wave 1 and wave 2
- compare to houshold dataframe and individual dataframe
- implement the cleaning steps from the other notebook

In [24]:
print(w1_core.shape)
print(w1_topical.shape)
w1_full = w1_core.merge(w1_topical, how = 'left', on = clmn_join)
w1_full.shape

(421911, 86)
(105663, 30)


(421911, 91)

In [25]:
print(w2_core.shape)
print(w2_topical.shape)
w2_full = w2_core.merge(w2_topical, how = 'left', on = clmn_join)
w2_full.shape

(392702, 86)
(98504, 30)


(392702, 91)

In [28]:
df = pd.concat([w1_full, w2_full])
print(df.shape)
df.head()

(814613, 91)


Unnamed: 0,SSUID,SPANEL,SWAVE,TFIPSST,EOUTCOME,SHHADID,RFID,RFID2,EENTAID,EPPPNUM,...,RMESR,RMWKWJB,RWKSPERM,EBNO1,EBNO2,EREBATE,ERBAMTH,ERBATAMT,ERBATTYP,EREBATOC
0,19128000276,8,1,2,1,11,1,1,11,101,...,8,0,4,1,1,,,,,
1,19128000276,8,1,2,1,11,2,2,11,101,...,8,0,5,1,1,,,,,
2,19128000276,8,1,2,1,11,3,3,11,101,...,8,0,4,1,1,,,,,
3,19128000276,8,1,2,1,11,4,4,11,101,...,7,0,4,1,1,2.0,1.0,0.0,1.0,1.0
4,19128000276,8,1,2,1,11,1,1,11,102,...,8,0,4,1,1,,,,,
