In [145]:
# Import dataset
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
## Define path data
COLUMNS = ['age','workclass', 'fnlwgt', 'education', 'education_num', 'marital',
           'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
           'hours_week', 'native_country', 'label']
### Define continuous list
CONTI_FEATURES  = ['age', 'fnlwgt','capital_gain', 'education_num', 'capital_loss', 'hours_week']
### Define categorical list
CATE_FEATURES = ['workclass', 'education', 'marital', 'occupation', 'relationship', 'race', 'sex', 'native_country']

## Prepare the data
features = ['age','workclass', 'fnlwgt', 'education', 'education_num', 'marital',
           'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
           'hours_week', 'native_country']

PATH = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

df_train = pd.read_csv(PATH, skipinitialspace=True, names = COLUMNS, index_col=False)
df_train[CONTI_FEATURES] =df_train[CONTI_FEATURES].astype('float64')
## Drop Netherland, because only one row
df_train = df_train[df_train.native_country != "Holand-Netherlands"]
## Get the column index of the categorical features
conti_features = []
for i in CONTI_FEATURES:
    position = df_train.columns.get_loc(i)
    conti_features.append(position)
    ## Get the column index of the categorical features
categorical_features = []
for i in CATE_FEATURES:
    position = df_train.columns.get_loc(i)
    categorical_features.append(position)

In [62]:
df_train['age'][0] = np.nan
df_train['workclass'][0] = np.nan

In [63]:
df_train

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital,occupation,relationship,race,sex,capital_gain,capital_loss,hours_week,native_country,label
0,,,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K
5,37.0,Private,284582.0,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,40.0,United-States,<=50K
6,49.0,Private,160187.0,9th,5.0,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0.0,0.0,16.0,Jamaica,<=50K
7,52.0,Self-emp-not-inc,209642.0,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,45.0,United-States,>50K
8,31.0,Private,45781.0,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Female,14084.0,0.0,50.0,United-States,>50K
9,42.0,Private,159449.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178.0,0.0,40.0,United-States,>50K


In [189]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_train[features],
                                                    df_train.label,
                                                    test_size = 0.2,
                                                    random_state=0)

In [200]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier

In [201]:
from sklearn.base import TransformerMixin

class add_cluster(TransformerMixin):

    def __init__(self, model):
        self.model = model

    def fit(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        return pd.concat([pd.DataFrame(X),pd.DataFrame(self.model.predict(X))],axis=1,ignore_index=True)
#add = add_cluster(KMeans(n_clusters=2))
#add.fit_transform(x_train_pre)
#add.transform(x_test_pre)

In [214]:
preprocess = ColumnTransformer([
    ('num', Pipeline(steps=[('num_scalar',StandardScaler()),
                            ('num_impute',SimpleImputer(missing_values=np.nan,strategy='mean'))
                           ]),conti_features),
    ### Need to be numeric not string to specify columns name 
    ('cat', Pipeline(steps=[('num_impute',SimpleImputer(missing_values=np.nan,strategy='most_frequent')),
                            ('cat_encode',OneHotEncoder(sparse=False))
                           ]),categorical_features)
])
generation = Pipeline([('cluster_2',add_cluster(KMeans(n_clusters=2))),
                       ('cluster_3',add_cluster(KMeans(n_clusters=3))),
                       ('cluster_4',add_cluster(KMeans(n_clusters=4)))
    
])
pipe_pre_gen = Pipeline(steps=[('preprocess',preprocess),
                       ('generation',generation)
])
x_train_pre_gen = pipe_pre_gen.fit_transform(X_train)
x_test_pre_gen = pipe_pre_gen.transform(X_test)

In [215]:
x_test_pre_gen

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,100,101,102,103,104,105,106,107,108,109
0,0.106610,0.037382,-0.146071,-0.027789,-0.217040,-0.035384,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,3
1,-0.848434,0.255720,-0.146071,1.138454,-0.217040,-0.035384,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,2
2,-0.995364,0.634077,-0.146071,0.749706,-0.217040,-0.035384,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,1
3,0.547399,-0.399010,-0.146071,-2.749023,4.477405,-0.035384,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,2,0
4,0.473934,-0.159510,-0.146071,-1.194032,6.753052,2.904039,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,2,0
5,-0.701504,0.134135,-0.146071,1.138454,-0.217040,2.822388,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,1
6,0.253540,0.346443,-0.146071,-0.416537,-0.217040,1.597629,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,2
7,-0.334180,-0.657503,-0.146071,-0.027789,-0.217040,0.781123,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,2
8,1.575908,-0.841546,-0.146071,0.360959,-0.217040,-0.035384,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,1
9,3.779855,-0.221130,-0.146071,-3.137771,-0.217040,-0.035384,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,3


In [216]:
pipe = Pipeline(steps=[('pre_gen',pipe_pre_gen),
                       ('lr',ExtraTreesClassifier())
                      ])

In [217]:
pipe.fit(X_train,y_train)



Pipeline(memory=None,
     steps=[('pre_gen', Pipeline(memory=None,
     steps=[('preprocess', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('num_scalar', StandardScaler(copy=True, with_mean=True, with...ators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])

In [218]:
pipe.predict(X_test)

array(['<=50K', '<=50K', '<=50K', ..., '<=50K', '>50K', '<=50K'],
      dtype=object)

In [219]:
pipe.score(X_test,y_test)

0.8296990171990172

In [222]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
# Construct the parameter grid
params = {
    'lr__n_estimators': [100, 200, 300, 400],
    }
# Train the model
skf = StratifiedKFold(n_splits=3,
                      shuffle = True,
                      random_state = 1001)

random_search = RandomizedSearchCV(pipe,
                                   param_distributions=params,
                                   n_iter=4,
                                   scoring='accuracy',
                                   n_jobs=1,
                                   cv=skf.split(X_train, y_train),
                                   verbose=3,
                                   random_state=1001)
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] lr__n_estimators=100 ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ... lr__n_estimators=100, score=0.8332373603593228, total=  16.3s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   17.2s remaining:    0.0s


[CV] lr__n_estimators=100 ............................................
[CV] ... lr__n_estimators=100, score=0.8323160198088219, total=  15.4s
[CV] lr__n_estimators=100 ............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   35.1s remaining:    0.0s


[CV] ... lr__n_estimators=100, score=0.8320663441603318, total=  16.5s
[CV] lr__n_estimators=200 ............................................
[CV] ... lr__n_estimators=200, score=0.8318553495335713, total=  22.2s
[CV] lr__n_estimators=200 ............................................
[CV] ... lr__n_estimators=200, score=0.8343890360474491, total=  21.7s
[CV] lr__n_estimators=200 ............................................
[CV] ... lr__n_estimators=200, score=0.8316056208246948, total=  20.6s
[CV] lr__n_estimators=300 ............................................
[CV] ... lr__n_estimators=300, score=0.8342738684786364, total=  25.7s
[CV] lr__n_estimators=300 ............................................
[CV] .... lr__n_estimators=300, score=0.833813198203386, total=  26.7s
[CV] lr__n_estimators=300 ............................................
[CV] ... lr__n_estimators=300, score=0.8333333333333334, total=  24.8s
[CV] lr__n_estimators=400 ............................................
[CV] .

[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  5.3min finished


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x000000000BB9F410>,
          error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('pre_gen', Pipeline(memory=None,
     steps=[('preprocess', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('num_scalar', StandardScaler(copy=True, with_mean=True, with...ators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
          fit_params=None, iid='warn', n_iter=4, n_jobs=1,
          param_distributions={'lr__n_estimators': [100, 200, 300, 400]},
          pre_dispatch='2*n_jobs', random_state=1001, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=3)

In [223]:
print("best logistic regression from grid search: %f" % random_search.best_estimator_.score(X_test, y_test))

best logistic regression from grid search: 0.836149
