In [1]:
import pandas as pd
import numpy as np

In [2]:
# --- notebook-wide configuration -------------------------------------------

# simulation years covered by the yearly files: 2011 through 2016 inclusive
year_list = range(2011, 2016 + 1)

# six values, presumably one price-index figure per entry of year_list
# NOTE(review): the last two entries are identical - confirm this is intended
cpi = [
    114.51,
    117.76,
    119.07,
    119.48,
    120.15,
    120.15,
]

# income components; each one is looked up later as column "ils_<name>_<year>"
income_list = "earns dispy tax sicer sicee sicse ben".split()

# directory prefix (with trailing slash) prepended to every input file name
data_path = "data/"

In [3]:
# Load the base file, then attach the "ils_*" columns of every yearly
# output file to it, one year at a time.

silc = pd.read_csv(data_path + "be_2012_a4.txt", sep='\t')

for year in year_list:
    # tab-separated output file for this year
    df = pd.read_csv(data_path + "be_%d_std.txt" % year, sep='\t')

    # only the person id and the columns whose name starts with "ils_" are kept
    ils_columns = [name for name in df.columns if name.startswith("ils_")]
    df = df[["idperson"] + ils_columns]

    # tag every ils column with the year so the years can sit side by side;
    # idperson stays untouched because it is the merge key
    df = df.rename(columns={name: name + "_%d" % year for name in ils_columns})

    # default (inner) merge: only persons present in both frames survive
    silc = silc.merge(df, on="idperson")

In [4]:
## GENERATE ADDITIONAL VARIABLES AT INDIVIDUAL LEVEL

# Gross income used later to pick the household head (the person with the
# highest value of tmp_gr_inc in the household).
#   * persons under 18 get a 1000 penalty so that a minor does not become head
#   * people in the same household can have identical gross income, so a very
#     small random amount is added to break ties
# The noise comes from a dedicated, seeded generator: with the unseeded global
# np.random state the head chosen in tied households changed on every run.
# NOTE(review): the penalty is a fixed 1000, so a minor whose income exceeds
# every adult's by more than 1000 would still be selected - confirm acceptable.
tie_break_rng = np.random.RandomState(0)
silc['tmp_gr_inc'] = (
    -1000 * (silc['dag'] < 18)
    + silc['ils_origy_2011']
    + 0.0001 * tie_break_rng.uniform(size=silc.shape[0])
)

# indicator for underage
silc['ch'] = silc['dag'] < 18

# socio-economic status; the four flags are built so that each later flag
# excludes everyone already captured by an earlier one
# working: adult with yem + yse above 200
silc['working'] = (silc['dag'] >= 18) & ((silc['yem'] + silc['yse']) > 200)
# pension: aged 40+, poa larger than yem + yse + bun, and not working
silc['pension'] = (
    (silc['dag'] >= 40)
    & (silc['poa'] > (silc['yem'] + silc['yse'] + silc['bun']))
    & ~silc['working']
)
# unempl: adult with positive bun or les == 5, neither working nor pension
silc['unempl'] = (
    (silc['dag'] >= 18)
    & ((silc['bun'] > 0) | (silc['les'] == 5))
    & ~silc['working']
    & ~silc['pension']
)
# inactive: remaining adults with les != 6
silc['inactive'] = (
    (silc['dag'] >= 18)
    & (silc['les'] != 6)
    & ~silc['unempl']
    & ~silc['working']
    & ~silc['pension']
)

In [5]:
## AGGREGATIONS AT HOUSEHOLD LEVEL

# Two flat dictionaries keyed by the individual-level source column:
#   hh_agg_dict : source column -> aggregation function
#   hh_rename   : source column -> name of the household-level result column
# The nested {column: {new_name: func}} form used previously is rejected by
# pandas >= 1.0 ("nested renamer is not supported"); aggregating with a flat
# dict and renaming afterwards gives the same columns on old and new pandas.
hh_agg_dict = {}
hh_rename = {}

# every income component of every year is summed over the household
for year in year_list:
    for var in income_list:
        source = "ils_%s_%s" % (var, year)
        hh_agg_dict[source] = "sum"
        hh_rename[source] = "hh_%s_%s" % (var, year)

# household size: number of (non-null) person ids in the household
hh_agg_dict['idperson'] = 'count'
hh_rename['idperson'] = 'hh_size'

# number of minors in the household
hh_agg_dict['ch'] = 'sum'
hh_rename['ch'] = 'hh_ch'

# highest (penalised, tie-broken) gross income, used to find the household head
hh_agg_dict['tmp_gr_inc'] = 'max'
hh_rename['tmp_gr_inc'] = 'tmp_gr_max'

# create household measures: one row per idhh, columns named hh_* / tmp_gr_max
hh_data = silc.groupby('idhh').agg(hh_agg_dict).rename(columns=hh_rename)

# merge household measures back to individual measures
# NOTE: re-running this cell on an already merged silc duplicates the
# household columns (pandas adds _x/_y suffixes); re-run from the load cell.
silc = silc.merge(hh_data, left_on='idhh', right_index=True)

In [6]:
# Household head flag: True for the person whose tmp_gr_inc is the household
# maximum (tmp_gr_max was merged onto every member of the household).
silc['hhh'] = silc['tmp_gr_inc'].eq(silc['tmp_gr_max'])

In [7]:
## EQUIVALENCE

# Per-person equivalence weight:
#   1.0 for the household head (flag hhh),
#   0.5 for every other person aged 14 or over,
#   0.3 for every other person under 14.
# (These weights coincide with the OECD-modified equivalence scale.)
is_head = silc['hhh'] == 1

# Start from a float Series. The previous `1 * silc['hhh']==1` parsed as
# `(1 * silc['hhh']) == 1`, i.e. a boolean Series, and writing 0.5 / 0.3 into
# it relied on silent dtype upcasting that newer pandas warns about/rejects.
tmp = is_head.astype(float)
tmp[~is_head & (silc['dag'] >= 14)] = 0.5
tmp[~is_head & (silc['dag'] < 14)] = 0.3

# keep the per-person weight on the frame, as before
silc['tmp'] = tmp

# household equivalence scale = sum of the members' weights, broadcast back to
# every member. transform('sum') replaces the former merge so that re-running
# this cell overwrites eq_scale instead of producing eq_scale_x / eq_scale_y.
silc['eq_scale'] = silc.groupby('idhh')['tmp'].transform('sum')

# equivalent disposable income: household disposable income / household scale
for year in year_list:
    silc['eq_dispy_%s' % year] = silc['hh_dispy_%s' % year] / silc['eq_scale']

In [8]:
# Household-size-corrected weight: dwt times the number of household members,
# cast to int (the cast drops the fractional part).
silc['w_hh'] = silc['dwt'].mul(silc['hh_size']).astype(int)

In [9]:
# Checkpoint the enriched person-level frame. The target is built from
# data_path, like the input files, so the output follows the data directory
# if it is ever moved (with data_path = "data/" the file name is unchanged).
silc.to_pickle(data_path + "be_em_2011_2016")

In [10]:
# Reload the checkpoint written by the cell above, so the analysis can be
# resumed from here without redoing the merges. The path is built from
# data_path for consistency with the other file accesses.
# NOTE: unpickling executes code embedded in the file - only load pickles
# produced by this notebook, never ones from an untrusted source.
silc = pd.read_pickle(data_path + "be_em_2011_2016")

In [None]:
## Create subgroups (deciles, ventiles, etc)

In [None]:
## CHART