In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier

In [8]:
# Load the income data set. The literal ' ?' (including its leading space)
# is the marker read in as NaN.
df = pd.read_csv(
    filepath_or_buffer='income_evaluation.csv',
    na_values=[' ?'],
)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [9]:
# Inspect the column labels exactly as loaded. In the output every label
# after 'age' starts with a space.
df.columns

Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' income'],
      dtype='object')

In [10]:
# Trim surrounding whitespace from every column label so columns can be
# addressed by their plain names (e.g. 'income' rather than ' income').
df.columns = df.columns.map(str.strip)
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [11]:
# Count missing values per column.
df.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64

# Injecting missing values into numerical columns as well.

In [12]:
# View the row labels as a NumPy array; the cells below sample from df.index.
np.array(df.index)

array([    0,     1,     2, ..., 32558, 32559, 32560], dtype=int64)

In [13]:
# Seed NumPy's global RNG so the same rows are picked on every run.
np.random.seed(15)

# Pick 40 distinct row labels (replace=False) and blank out 'age' for them.
r = np.random.choice(df.index, size = 40, replace=False)
df.loc[r, 'age'] = np.nan

In [14]:
# Re-seed NumPy's global RNG with a different value so this draw is
# repeatable and independent of the seed used for 'age'.
np.random.seed(25)

# Pick 40 distinct row labels (replace=False) and blank out 'hours-per-week'.
r = np.random.choice(df.index, size = 40, replace=False)
df.loc[r, 'hours-per-week'] = np.nan

In [15]:
# Re-count missing values per column after the NaN injection above.
df.isna().sum()

age                 40
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week      40
native-country     583
income               0
dtype: int64

In [16]:
# Separate predictors from the 'income' target, then hold out 20% of the
# rows for testing. random_state pins the shuffle so the split is repeatable.
features = df.drop(columns='income')
target = df.income
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

In [18]:
# Numeric feature names: every training column whose dtype is not object.
num_cols = [
    name
    for name, dtype in X_train.dtypes.items()
    if dtype != 'O'
]
num_cols

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [20]:
# Categorical feature names: every object-dtype training column except
# 'education'.
# NOTE(review): presumably 'education' is skipped because 'education-num'
# already carries the same information -- confirm.
cat_cols = [
    name
    for name, dtype in X_train.dtypes.items()
    if dtype == 'O' and name != 'education'
]
cat_cols

['workclass',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

# Non-scalable approach.

In [27]:
from sklearn.impute import SimpleImputer

In [28]:
# Stage 1 of the two-stage approach: imputation only.
# Numeric columns are filled with the median, categorical columns with the
# constant string 'missing'. add_indicator=True asks each imputer to also
# emit missing-value indicator columns.
num_imputer = SimpleImputer(strategy='median', add_indicator=True)
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing', add_indicator=True)

ct1 = ColumnTransformer(
    transformers=[
        ('Simple_Imputer_num', num_imputer, num_cols),
        ('Simple_imputer_cat', cat_imputer, cat_cols),
    ],
    remainder='drop',  # columns in neither list (e.g. 'education') are discarded
)

In [30]:
# Fit the imputation stage on the training split and preview its output.
# The result has no column names, so the frame is labelled by position.
pd.DataFrame(ct1.fit_transform(X_train)).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,36,174308,7,0,0,40,0,0,Private,Divorced,Transport-moving,Not-in-family,White,Male,United-States,False,False,False
1,35,198202,9,0,0,54,0,0,Private,Never-married,Exec-managerial,Not-in-family,White,Female,United-States,False,False,False
2,38,52963,13,0,0,50,0,0,Private,Never-married,Adm-clerical,Not-in-family,White,Female,United-States,False,False,False
3,50,138270,9,0,0,40,0,0,Private,Married-civ-spouse,Sales,Wife,Black,Female,United-States,False,False,False
4,68,116903,11,0,2149,40,0,0,Self-emp-not-inc,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,False,False,False


In [32]:
# Stage 2 of the two-stage approach: scale and encode the output of ct1.
# That output is an unlabeled array, so columns must be addressed by
# position rather than by name.
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to
# `sparse_output`, and 1.4 removed the old name. Try the new keyword first
# and fall back to the old one so the cell runs on both.
try:
    ohe_dense = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe_dense = OneHotEncoder(sparse=False, handle_unknown='ignore')

# NOTE(review): positions 6-7 are in neither list, so the default
# remainder='drop' discards them, and positions 15-17 (boolean-looking in
# the ct1 preview) are one-hot encoded. These appear to be the
# missing-value indicator columns -- confirm this is intended.
ct2 = ColumnTransformer([
    ('rob_num', RobustScaler(), list(range(6))),
    ('ohe_cat', ohe_dense, list(range(8, 18))),
])

In [33]:
# Fit the imputation stage on the training split and keep its output as
# the input for the second stage (ct2).
xtf = ct1.fit_transform(X_train)

In [34]:
# Fit the scaling/encoding stage on the already-imputed array and show the result.
ct2.fit_transform(xtf)

array([[-0.05263158, -0.03097116, -1.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.10526316,  0.16955041, -0.33333333, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.05263158, -1.04931426,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.73684211, -1.11348049,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.42105263,  0.31775544,  2.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.63157895,  0.07491235,  0.        , ...,  0.        ,
         1.        ,  0.        ]])

# Scalable Approach.

In [51]:
# Numeric branch: fill gaps with the column median, then apply RobustScaler.
# No indicator columns are requested here (add_indicator=False).
pipe_num = Pipeline(
    steps=[
        ('si_num', SimpleImputer(add_indicator=False, strategy='median')),
        ('robust_scale_num', RobustScaler()),
    ]
)

In [52]:
# Categorical branch: fill gaps with the constant 'missing', then one-hot
# encode. handle_unknown='ignore' keeps transform() from failing on
# categories that were not seen during fit.
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to
# `sparse_output`, and 1.4 removed the old name. Try the new keyword first
# and fall back to the old one so the cell runs on both.
try:
    ohe_cat = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe_cat = OneHotEncoder(sparse=False, handle_unknown='ignore')

pipe_cat= Pipeline([
    ('si_cat', SimpleImputer(strategy= 'constant', fill_value= 'missing', add_indicator= False)),
    ('ohe_cat', ohe_cat)
])

In [53]:
# Route each group of columns, selected by name, through its own
# impute-then-transform pipeline. Columns in neither list are dropped.
ct = ColumnTransformer(
    transformers=[
        ('step_num', pipe_num, num_cols),
        ('step_cat', pipe_cat, cat_cols),
    ]
)

In [54]:
# Fit the combined preprocessor on the training split and preview its output.
pd.DataFrame(ct.fit_transform(X_train)).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,82,83,84,85,86,87,88,89,90,91
0,-0.052632,-0.030971,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.105263,0.16955,-0.333333,0.0,0.0,2.8,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.052632,-1.049314,1.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.684211,-0.333407,-0.333333,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.631579,-0.512721,0.333333,0.0,2149.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


# Feed into a Machine Learning Model.

In [55]:
# End-to-end estimator: preprocessing followed by a decision tree, so that
# fit/predict/score accept the raw feature frame directly.
#
# random_state is pinned because a decision tree is not deterministic
# without it. Left unset, the fitted tree and the reported score depend on
# whatever the global NumPy RNG state is when the cell runs.
Final_Pipe= Pipeline([
    ('Pre_processor', ct),
    ('model', DecisionTreeClassifier(random_state=0))
])

In [56]:
# Fit the preprocessing steps and the tree together on the training split.
Final_Pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('Pre_processor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('step_num',
                                                  Pipeline(memory=None,
                                                           steps=[('si_num',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                        

In [57]:
# Predict labels for the held-out rows; the raw frame goes in and the
# pipeline applies its fitted preprocessing first. In the output the labels
# keep a leading space (e.g. ' <=50K').
Final_Pipe.predict(X_test)

array([' <=50K', ' <=50K', ' >50K', ..., ' >50K', ' <=50K', ' >50K'],
      dtype=object)

In [58]:
# Score the fitted pipeline on the held-out 20% split (accuracy for a classifier).
Final_Pipe.score(X_test, y_test)

0.8114540150468295

In [66]:
# Drill into the fitted pipeline by step name to reach the fitted
# RobustScaler: pipeline -> preprocessor -> numeric branch -> scaler.
fitted_preprocessor = Final_Pipe.named_steps['Pre_processor']
fitted_num_branch = fitted_preprocessor.named_transformers_['step_num']
fitted_num_branch.named_steps['robust_scale_num']

RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True)