### Read data

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from copy import deepcopy

# dataset
dftrain = pd.read_csv('../data/train.csv')
dftest  = pd.read_csv('../data/test.csv')
df      = pd.concat([dftrain, dftest])

# features
index = 'Id'
target = 'earn_over_4k_euros_per_year'
predictors = [col for col in dftrain.columns if col not in [index, target]]
continuous_predictors = ['age','income_from_investment_sources',\
                         'losses_from_investment_sources',\
                         'number_of_years_of_education',\
                         'working_hours_per_week']
categorical_predictors = [col for col in predictors if col not in continuous_predictors]

### Normalize data

In [73]:
from sklearn import preprocessing

# init
dftrain_norm = deepcopy(dftrain)
dftest_norm  = deepcopy(dftest)


# normalize continuous data
dftrain_norm[continuous_predictors] = dftrain[continuous_predictors].apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))
dftest_norm[continuous_predictors]  = dftest[continuous_predictors].apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))

# dftrain_norm[continuous_predictors] = dftrain[continuous_predictors].apply(lambda x: (x - np.min(x)) / np.max(x))
# dftest_norm[continuous_predictors]  = dftest[continuous_predictors].apply(lambda x: (x - np.min(x)) / np.max(x))


# categorical data encoding
le = preprocessing.LabelEncoder()
df = pd.concat(dftrain, dftest)

# encode
for col in categorical_predictors:
    # use both train and test for indexing
    categories = np.unique(df[df[col].notnull()][col])
    # fit categories
    lb.fit(categories)
    
    # binarize train
    dftrain_bin[col] = dftrain[col].apply(lambda x : lb.transform([x]).flatten() if pd.notnull(x) else x)
    
# categorize categorical data
for col in categorical_predictors:
    dftrain_norm[col] = dftrain_norm[col].astype('category')
    dftest_norm[col]  = dftest_norm[col].astype('category')

# get dummies
dftrain_norm_dum = pd.get_dummies(dftrain_norm, dummy_na=False)
dftest_norm_dum  = pd.get_dummies(dftest_norm, dummy_na=False)

# # re-introduce NaN for categorical data
# dftrain_norm_dum = reintroduce_nan(dftrain_norm_dum)
# dftest_norm_dum  = reintroduce_nan(dftest_norm_dum)

In [70]:
def reintroduce_nan(df, index='Id', target='earn_over_4k_euros_per_year'):
    predictors = [col for col in df.columns if col not in [index, target]]
    nan_colmns = [col for col in predictors_norm_dum if col.endswith('_nan')]
    nan_colmns_headers = [col[:-4] for col in nan_colmns]

    for headers in nan_colmns_headers:
        nan_col = headers + '_nan'
        feats = [col for col in predictors_norm_dum if col.startswith(headers) and (not col.endswith('_nan'))]
        nan_idx = df[nan_col].index[df[nan_col].apply(lambda x : x==1)]
        df.ix[nan_idx, feats] = df.ix[nan_idx, feats].apply(lambda row : np.nan, axis=1)
    
    df_final = df.drop(nan_colmns, axis=1)
    
    return df_final

In [74]:
dftrain_norm_dum.shape, dftest_norm_dum.shape

((10500, 104), (38342, 106))

In [3]:
# save summies
dftrain_norm_dum.to_csv('../data/train.norm.csv', index=False)
dftest_norm_dum.to_csv('../data/test.norm.csv', index=False)

### Convert category data to binary

In [4]:
 # Binarize category data
# init
dftrain_bin = deepcopy(dftrain)
dftest_bin  = deepcopy(dftest)

lb = preprocessing.LabelBinarizer()

# binarization
for col in categorical_predictors:
    # use both train and test for indexing
    categories = np.unique(df[df[col].notnull()][col])
    # fit categories
    lb.fit(categories)
    
    # binarize train
    dftrain_bin[col] = dftrain[col].apply(lambda x : lb.transform([x]).flatten() if pd.notnull(x) else x)
    
dftrain_bin

Unnamed: 0,Id,age,education,income_from_investment_sources,losses_from_investment_sources,marital_status,native_country,number_of_years_of_education,occupation,race,relationship,sex,work_class,working_hours_per_week,earn_over_4k_euros_per_year
0,1,12.4140,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",692.01,,"[0, 0, 0, 0, 1, 0, 0]",,,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1]","[0, 1, 0, 0, 0, 0]",[1],"[0, 0, 0, 0, 0, 0, 1, 0]",,0
1,2,,,0.00,0.00,"[0, 0, 1, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,,"[0, 0, 0, 0, 1]","[1, 0, 0, 0, 0, 0]",[1],"[0, 0, 0, 0, 0, 1, 0, 0]",4.1380,0
2,3,12.0960,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",0.00,0.00,"[1, 0, 0, 0, 0, 0, 0]",,,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1]","[0, 1, 0, 0, 0, 0]",,"[0, 0, 0, 1, 0, 0, 0, 0]",12.7320,0
3,4,,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",,0.00,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",,"[1, 0, 0, 0, 0, 0]",[1],"[0, 0, 0, 1, 0, 0, 0, 0]",12.7320,0
4,5,8.9127,,,0.00,"[0, 0, 1, 0, 0, 0, 0]","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4.1380,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]","[0, 0, 1, 0, 0]","[0, 0, 0, 0, 0, 1]",,"[0, 0, 0, 1, 0, 0, 0, 0]",12.7320,0
5,6,11.7770,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",0.00,0.00,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4.4563,,"[0, 0, 0, 0, 1]","[0, 0, 0, 0, 0, 1]",[0],,12.7320,0
6,7,15.5970,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.00,0.00,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.5915,,"[0, 0, 1, 0, 0]","[0, 1, 0, 0, 0, 0]",[0],"[0, 0, 0, 1, 0, 0, 0, 0]",,0
7,8,7.3211,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",0.00,0.00,"[0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4.1380,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1]","[0, 0, 0, 1, 0, 0]",,"[0, 0, 0, 1, 0, 0, 0, 0]",9.5493,0
8,9,10.1860,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",0.00,0.00,"[0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3.8197,,"[0, 0, 1, 0, 0]","[0, 1, 0, 0, 0, 0]",[1],"[0, 0, 0, 1, 0, 0, 0, 0]",15.9150,0
9,10,10.8230,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.00,0.00,"[0, 0, 1, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.2732,,"[1, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0]",,"[0, 0, 0, 1, 0, 0, 0, 0]",14.3240,0


In [5]:
dftrain_bin.to_csv('../data/train.bin.csv', index=False)
dftest_bin.to_csv('../data/test.bin.csv', index=False)

dftrain_bin = pd.read_csv('../data/train.bin.csv')
dftest_bin = pd.read_csv('../data/test.bin.csv')

### Get dummies

In [12]:
# categorize categorical data

for col in categorical_predictors:
    dftrain[col] = dftrain[col].astype('category')
    dftest[col] = dftrain[col].astype('category')

# get dummies
dftrain_dum = pd.get_dummies(dftrain)
dftest_dum  = pd.get_dummies(dftest)

# save summies
dftrain_dum.to_csv('../data/train.dum.csv', index=False)
dftest_dum.to_csv('../data/test.dum.csv',   index=False)

### Manage Columns

In [7]:
print predictors
for col in predictors:
    print dftrain_bin[col].dtype.name

['age', 'education', 'income_from_investment_sources', 'losses_from_investment_sources', 'marital_status', 'native_country', 'number_of_years_of_education', 'occupation', 'race', 'relationship', 'sex', 'work_class', 'working_hours_per_week']
float64
object
float64
float64
object
object
float64
object
object
object
object
object
float64
