In [27]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the income data set from the working directory and preview the first rows.
# NOTE(review): na_values='?' appears to have no effect here -- the isna() check
# further down reports zero missing values and the fitted one-hot encoder later
# lists ' ?' (with a leading space) as a regular category, so the raw fields seem
# to be space-prefixed and never equal the bare '?'. Passing skipinitialspace=True
# would make the marker match, but the pipeline would then need an explicit
# imputation step -- TODO confirm against the raw CSV before changing.
df = pd.read_csv('income_evaluation.csv', na_values= '?')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# (n_rows, n_columns) of the loaded frame.
df.shape

(32561, 15)

In [4]:
# Count of values pandas parsed as missing, per column.
df.isna().sum()

age                0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
 income            0
dtype: int64

In [5]:
# Inspect the raw column labels; every label except 'age' comes back with a
# leading space, which is why they are stripped in the next step.
df.columns

Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' income'],
      dtype='object')

In [6]:
# Normalise the header: remove surrounding whitespace from every column label
# so columns can be addressed as 'income', 'workclass', ... without the
# leading space present in the raw file.
df = df.rename(columns=str.strip)

In [7]:
# Separate predictors from the label, then hold out 20% of the rows for
# evaluation. The fixed seed keeps the split identical across re-runs.
features = df.drop(columns='income')
target = df['income']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

In [8]:
from sklearn.pipeline import Pipeline

In [10]:
from sklearn.compose import ColumnTransformer

In [23]:
# Names of the object-dtype (string-valued) predictor columns, in frame order.
cat_cols = X_train.select_dtypes(include='object').columns.tolist()
cat_cols

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [24]:
# Column groups, one per preprocessing strategy.
robust_scaled_cols = ['age', 'fnlwgt', 'hours-per-week']
standard_scaled_cols = ['capital-gain', 'capital-loss']
one_hot_cols = [
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Dense one-hot output; categories unseen during fit are encoded as all zeros
# instead of raising at predict time.
one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Route each column group through its own transformer.
# NOTE(review): remainder='drop' discards every column not listed above, which
# here means both 'education' and 'education-num' -- confirm that excluding
# education from the model is intentional.
ct = ColumnTransformer(
    transformers=[
        ('step 1', RobustScaler(), robust_scaled_cols),
        ('step 2', StandardScaler(), standard_scaled_cols),
        ('step 3', one_hot_encoder, one_hot_cols),
    ],
    remainder='drop',
)

In [34]:
# End-to-end estimator: column-wise preprocessing followed by a decision tree.
# Keeping both stages in one Pipeline means fit/predict/score apply the same
# transformations to every data set that is passed in.
P = Pipeline([
    ('column Transformer', ct),
    # random_state is pinned because the tree permutes the candidate features
    # at every split; without a seed, ties between equally good splits are
    # broken differently on each run and the reported test score is not
    # reproducible.
    ('Model', DecisionTreeClassifier(random_state=0)),
])

In [37]:
# Fit the preprocessing steps and the tree on the training split only; the
# test split is never seen while learning scaler statistics or categories.
P.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('column Transformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('step 1',
                                                  RobustScaler(copy=True,
                                                               quantile_range=(25.0,
                                                                               75.0),
                                                               with_centering=True,
                                                               with_scaling=True),
                                                  ['age', 'fnlwgt',
                                                   'hours-per-week']),
                                                 ('step 2',
                                                  StandardScaler(copy=True,
        

In [38]:
# Predicted income label for each held-out row (raw rows are transformed by
# the fitted pipeline before reaching the tree).
P.predict(X_test)

array([' <=50K', ' <=50K', ' >50K', ..., ' >50K', ' <=50K', ' >50K'],
      dtype=object)

In [39]:
# Mean accuracy of the fitted pipeline on the held-out test split.
P.score(X_test, y_test)

0.8000921234454169

In [40]:
# Mapping of step name -> step object, used below to reach the fitted
# transformers inside the pipeline.
P.named_steps

{'column Transformer': ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                   transformer_weights=None,
                   transformers=[('step 1',
                                  RobustScaler(copy=True,
                                               quantile_range=(25.0, 75.0),
                                               with_centering=True,
                                               with_scaling=True),
                                  ['age', 'fnlwgt', 'hours-per-week']),
                                 ('step 2',
                                  StandardScaler(copy=True, with_mean=True,
                                                 with_std=True),
                                  ['capital-gain', 'capital-loss']),
                                 ('step 3',
                                  OneHotEncoder(categories='auto', drop=None,
                                                dtype=<class 'numpy.float64'>,
                     

In [46]:
# Fetch the fitted one-hot encoder by its transformer name ('step 3') and list
# the generated indicator columns; the x0..x6 prefixes follow the order of the
# categorical columns handed to the encoder.
# NOTE(review): newer scikit-learn releases replace get_feature_names with
# get_feature_names_out -- switch when the environment is upgraded.
fitted_encoder = P.named_steps['column Transformer'].named_transformers_['step 3']
fitted_encoder.get_feature_names()


array(['x0_ ?', 'x0_ Federal-gov', 'x0_ Local-gov', 'x0_ Never-worked',
       'x0_ Private', 'x0_ Self-emp-inc', 'x0_ Self-emp-not-inc',
       'x0_ State-gov', 'x0_ Without-pay', 'x1_ Divorced',
       'x1_ Married-AF-spouse', 'x1_ Married-civ-spouse',
       'x1_ Married-spouse-absent', 'x1_ Never-married', 'x1_ Separated',
       'x1_ Widowed', 'x2_ ?', 'x2_ Adm-clerical', 'x2_ Armed-Forces',
       'x2_ Craft-repair', 'x2_ Exec-managerial', 'x2_ Farming-fishing',
       'x2_ Handlers-cleaners', 'x2_ Machine-op-inspct',
       'x2_ Other-service', 'x2_ Priv-house-serv', 'x2_ Prof-specialty',
       'x2_ Protective-serv', 'x2_ Sales', 'x2_ Tech-support',
       'x2_ Transport-moving', 'x3_ Husband', 'x3_ Not-in-family',
       'x3_ Other-relative', 'x3_ Own-child', 'x3_ Unmarried', 'x3_ Wife',
       'x4_ Amer-Indian-Eskimo', 'x4_ Asian-Pac-Islander', 'x4_ Black',
       'x4_ Other', 'x4_ White', 'x5_ Female', 'x5_ Male', 'x6_ ?',
       'x6_ Cambodia', 'x6_ Canada', 'x6_ China', 'x

In [52]:
# Fetch the fitted RobustScaler by its transformer name ('step 1') and show the
# per-column centre values it learned for age, fnlwgt and hours-per-week.
fitted_robust_scaler = P.named_steps['column Transformer'].named_transformers_['step 1']
fitted_robust_scaler.center_

array([3.700000e+01, 1.779985e+05, 4.000000e+01])