### Read data

In [2]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from copy import deepcopy

# dataset
dftrain = pd.read_csv('../data/train.csv')
dftest  = pd.read_csv('../data/test.csv')
df      = pd.concat([dftrain, dftest], ignore_index=True)

# features
index = 'Id'
target = 'earn_over_4k_euros_per_year'
predictors = [col for col in dftrain.columns if col not in [index, target]]
continuous_predictors = ['age','income_from_investment_sources',\
                         'losses_from_investment_sources',\
                         'number_of_years_of_education',\
                         'working_hours_per_week']
categorical_predictors = [col for col in predictors if col not in continuous_predictors]

### Normalize data

In [1]:
def reintroduce_nan(df, index='Id', target='earn_over_4k_euros_per_year'):
    predictors = [col for col in df.columns if col not in [index, target]]
    nan_colmns = [col for col in predictors if col.endswith('_nan')]
    nan_colmns_headers = [col[:-4] for col in nan_colmns]

    for headers in nan_colmns_headers:
        nan_col = headers + '_nan'
        feats = [col for col in predictors if col.startswith(headers) and (not col.endswith('_nan'))]
        nan_idx = df[nan_col].index[df[nan_col].apply(lambda x : x==1)]
        df.ix[nan_idx, feats] = df.ix[nan_idx, feats].apply(lambda row : np.nan, axis=1)
    
    df = df.drop(nan_colmns, axis=1)
    
    return df

In [14]:
from sklearn import preprocessing

# init
dfcomplete_norm = pd.concat([dftrain, dftest], ignore_index=True)

# normalize continuous data
dfcomplete_norm[continuous_predictors] = dfcomplete_norm[continuous_predictors].apply(
    lambda x: (x - np.min(x)) / np.max(x))
# mas = preprocessing.MaxAbsScaler()
# dfcomplete_norm[continuous_predictors] = mas.fit_transform(dfcomplete_norm[continuous_predictors])

# categorize categorical data
for col in categorical_predictors:
    dfcomplete_norm[col] = dfcomplete_norm[col].astype('category')
    
# get dummies
dfcomplete_norm = pd.get_dummies(dfcomplete_norm, dummy_na=True)

# re-introduce NaN for categorical data
dfcomplete_norm = reintroduce_nan(dfcomplete_norm)

# split dftrain and dftest
dftrain_norm = dfcomplete_norm[dfcomplete_norm[target]!=-1]
dftest_norm  = dfcomplete_norm[dfcomplete_norm[target]==-1]

In [19]:
dftest_norm.head()

Unnamed: 0,Id,age,income_from_investment_sources,losses_from_investment_sources,number_of_years_of_education,working_hours_per_week,earn_over_4k_euros_per_year,education_2.0,education_3.0,education_4.0,...,sex_37.0,sex_59.0,work_class_36.0,work_class_57.0,work_class_66.0,work_class_80.0,work_class_87.0,work_class_88.0,work_class_92.0,work_class_104.0
10500,1,0.077775,0.0,0.0,0.562496,0.171716,-1,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10501,2,0.377782,0.0,0.561049,0.624993,0.383832,-1,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10502,3,0.49999,0.0,0.0,0.499998,,-1,0.0,0.0,0.0,...,,,,,,,,,,
10503,4,0.033332,0.0,,0.562496,0.111109,-1,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10504,5,0.277775,0.0,0.0,0.624993,0.313131,-1,,,,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [105]:
dftrain_norm.shape, dftest_norm.shape

((10500, 106), (38342, 106))

In [16]:
# save summies
dftrain_norm.to_csv('../data/train.norm.csv', index=False)
dftest_norm.to_csv('../data/test.norm.csv', index=False)

### Convert category data to binary vectors

In [4]:
 # Binarize category data
# init
dftrain_bin = deepcopy(dftrain)
dftest_bin  = deepcopy(dftest)

lb = preprocessing.LabelBinarizer()

# binarization
for col in categorical_predictors:
    # use both train and test for indexing
    categories = np.unique(df[df[col].notnull()][col])
    # fit categories
    lb.fit(categories)
    
    # binarize train
    dftrain_bin[col] = dftrain[col].apply(lambda x : lb.transform([x]).flatten() if pd.notnull(x) else x)
    
dftrain_bin

Unnamed: 0,Id,age,education,income_from_investment_sources,losses_from_investment_sources,marital_status,native_country,number_of_years_of_education,occupation,race,relationship,sex,work_class,working_hours_per_week,earn_over_4k_euros_per_year
0,1,12.4140,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",692.01,,"[0, 0, 0, 0, 1, 0, 0]",,,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1]","[0, 1, 0, 0, 0, 0]",[1],"[0, 0, 0, 0, 0, 0, 1, 0]",,0
1,2,,,0.00,0.00,"[0, 0, 1, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,,"[0, 0, 0, 0, 1]","[1, 0, 0, 0, 0, 0]",[1],"[0, 0, 0, 0, 0, 1, 0, 0]",4.1380,0
2,3,12.0960,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",0.00,0.00,"[1, 0, 0, 0, 0, 0, 0]",,,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1]","[0, 1, 0, 0, 0, 0]",,"[0, 0, 0, 1, 0, 0, 0, 0]",12.7320,0
3,4,,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",,0.00,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",,"[1, 0, 0, 0, 0, 0]",[1],"[0, 0, 0, 1, 0, 0, 0, 0]",12.7320,0
4,5,8.9127,,,0.00,"[0, 0, 1, 0, 0, 0, 0]","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4.1380,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]","[0, 0, 1, 0, 0]","[0, 0, 0, 0, 0, 1]",,"[0, 0, 0, 1, 0, 0, 0, 0]",12.7320,0
5,6,11.7770,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",0.00,0.00,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4.4563,,"[0, 0, 0, 0, 1]","[0, 0, 0, 0, 0, 1]",[0],,12.7320,0
6,7,15.5970,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.00,0.00,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.5915,,"[0, 0, 1, 0, 0]","[0, 1, 0, 0, 0, 0]",[0],"[0, 0, 0, 1, 0, 0, 0, 0]",,0
7,8,7.3211,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",0.00,0.00,"[0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4.1380,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1]","[0, 0, 0, 1, 0, 0]",,"[0, 0, 0, 1, 0, 0, 0, 0]",9.5493,0
8,9,10.1860,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",0.00,0.00,"[0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3.8197,,"[0, 0, 1, 0, 0]","[0, 1, 0, 0, 0, 0]",[1],"[0, 0, 0, 1, 0, 0, 0, 0]",15.9150,0
9,10,10.8230,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.00,0.00,"[0, 0, 1, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.2732,,"[1, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0]",,"[0, 0, 0, 1, 0, 0, 0, 0]",14.3240,0


In [5]:
dftrain_bin.to_csv('../data/train.bin.csv', index=False)
dftest_bin.to_csv('../data/test.bin.csv', index=False)

dftrain_bin = pd.read_csv('../data/train.bin.csv')
dftest_bin = pd.read_csv('../data/test.bin.csv')

### Get dummies

In [34]:
# categorize categorical data

# init
dfcomplete = pd.concat([dftrain, dftest], ignore_index=True)

# categorize categorical data
for col in categorical_predictors:
    dfcomplete[col] = dfcomplete[col].astype('category')
    
# get dummies
dfcomplete = pd.get_dummies(dfcomplete, dummy_na=True)

# re-introduce NaN for categorical data
dfcomplete = reintroduce_nan(dfcomplete)

# split dftrain and dftest
dftrain_dum = dfcomplete[dfcomplete[target]!=-1]
dftest_dum  = dfcomplete[dfcomplete[target]==-1]

# save to file
dftrain_dum.to_csv('../data/train.dum.csv', index=False)
dftest_dum.to_csv('../data/test.dum.csv', index=False)

In [37]:
dftest_dum.head(10)

Unnamed: 0,Id,age,income_from_investment_sources,losses_from_investment_sources,number_of_years_of_education,working_hours_per_week,earn_over_4k_euros_per_year,education_2.0,education_3.0,education_4.0,...,sex_37.0,sex_59.0,work_class_36.0,work_class_57.0,work_class_66.0,work_class_80.0,work_class_87.0,work_class_88.0,work_class_92.0,work_class_104.0
10500,1,7.6394,0.0,0.0,3.1831,5.7296,-1,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10501,2,16.234,0.0,777.95,3.5014,12.414,-1,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10502,3,19.735,0.0,0.0,2.8648,,-1,0.0,0.0,0.0,...,,,,,,,,,,
10503,4,6.3662,0.0,,3.1831,3.8197,-1,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10504,5,13.369,0.0,0.0,3.5014,10.186,-1,,,,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10505,6,11.777,0.0,0.0,1.2732,15.915,-1,,,,...,0.0,1.0,,,,,,,,
10506,7,10.504,0.0,,2.2282,12.732,-1,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10507,8,15.915,0.0,0.0,3.1831,12.732,-1,0.0,0.0,0.0,...,0.0,1.0,,,,,,,,
10508,9,11.459,0.0,0.0,4.138,17.507,-1,,,,...,1.0,0.0,,,,,,,,
10509,10,13.369,0.0,0.0,2.8648,12.732,-1,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### get mcar dummies

In [43]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from copy import deepcopy

# dataset
dftrain = pd.read_csv('../data/train.mcar.csv')
dftest  = pd.read_csv('../data/test.mcar.csv')
df      = pd.concat([dftrain, dftest])

# features
index = 'Id'
target = 'earn_over_4k_euros_per_year'
predictors = [col for col in dftrain.columns if col not in [index, target]]
continuous_predictors = ['age','income_from_investment_sources',\
                         'losses_from_investment_sources',\
                         'number_of_years_of_education',\
                         'working_hours_per_week']
categorical_predictors = [col for col in predictors if col not in continuous_predictors]

# init
dfcomplete = pd.concat([dftrain, dftest], ignore_index=True)

# categorize categorical data
for col in categorical_predictors:
    dfcomplete[col] = dfcomplete[col].astype('category')
    
# get dummies
dfcomplete = pd.get_dummies(dfcomplete, dummy_na=True)

# re-introduce NaN for categorical data
dfcomplete = reintroduce_nan(dfcomplete)

# split dftrain and dftest
dftrain_dum = dfcomplete[dfcomplete[target]!=-1]
dftest_dum  = dfcomplete[dfcomplete[target]==-1]

# save to file
dftrain_dum.to_csv('../data/train.mcar.dum.csv', index=False)
dftest_dum.to_csv('../data/test.mcar.dum.csv', index=False)

### get mice dummies

In [3]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from copy import deepcopy

# dataset
dftrain = pd.read_csv('../data/train.mice.csv')
dftest  = pd.read_csv('../data/test.mice.csv')
df      = pd.concat([dftrain, dftest])



# features
index = 'Id'
target = 'earn_over_4k_euros_per_year'
predictors = [col for col in dftrain.columns if col not in [index, target]]
continuous_predictors = ['age','income_from_investment_sources',\
                         'losses_from_investment_sources',\
                         'number_of_years_of_education',\
                         'working_hours_per_week']
categorical_predictors = [col for col in predictors if col not in continuous_predictors]

# dftrain[dftrain<0]=0
# dftest[dftest<0]=0

# init
dfcomplete = pd.concat([dftrain, dftest], ignore_index=True)

# categorize categorical data
for col in categorical_predictors:
    dfcomplete[col] = dfcomplete[col].astype('category')
    
# get dummies
dfcomplete = pd.get_dummies(dfcomplete, dummy_na=True)

# re-introduce NaN for categorical data
dfcomplete = reintroduce_nan(dfcomplete)

# split dftrain and dftest
dftrain_dum = dfcomplete[dfcomplete[target]!=-1]
dftest_dum  = dfcomplete[dfcomplete[target]==-1]

# save to file
dftrain_dum.to_csv('../data/train.mice.dum.csv', index=False)
dftest_dum.to_csv('../data/test.mice.dum.csv', index=False)