In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv('income_evaluation.csv', na_values=' ?')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
df.isna().sum()

age                   0
 workclass         1836
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1843
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week       0
 native-country     583
 income               0
dtype: int64

In [4]:
df.shape

(32561, 15)

In [5]:
df.dropna(inplace=True)
df.shape

(30162, 15)

In [6]:
df.isna().sum()

age                0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
 income            0
dtype: int64

In [7]:
df.columns

Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' income'],
      dtype='object')

In [8]:
#We can see, that there are leaving spaces in the column names, hence well change it!
df.columns = df.columns.str.strip()
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('income', axis=1), df.income,
                                                   test_size=0.2, random_state=0)

In [10]:
#Let's extract columns which are numeric!
num_cols = [col for col in X_train.columns if X_train[col].dtypes!='O']
num_cols

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [11]:
#Let's extract columns which are categorical!
cat_cols = [col for col in X_train.columns if X_train[col].dtypes=='O']
cat_cols

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [13]:
#Here we can see that, education and education-num columns are same!
df[['education', 'education-num']].head()
#just label encoded!

Unnamed: 0,education,education-num
0,Bachelors,13
1,Bachelors,13
2,HS-grad,9
3,11th,7
4,Bachelors,13


In [14]:
#Now we will create a Column transform pipeline, in which we'll scale our numeric and Non-numeric columns
ct = ColumnTransformer([
    ('step1', RobustScaler(), ['age', 'fnlwgt', 'hours-per-week']),
    ('step2', StandardScaler(), ['capital-gain', 'capital-loss', 'education-num']),
    ('step3', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['workclass', 
                                                                     'marital-status', 'occupation',
                                                                     'relationship', 'race', 
                                                                     'sex', 'native-country'])
    #One hot encoding all the categorical values!
], remainder='drop')

### Pipeline Use Case-1 - With an 'Estimator' as Final step!

In [15]:
p = Pipeline([
    ('coltf_step', ct), #This will go to our column transformer and execute the column transformations!
    ('model', DecisionTreeClassifier()), #And our second step will be applying model to the estimator!
])

In [16]:
p.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('coltf_step',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('step1',
                                                  RobustScaler(copy=True,
                                                               quantile_range=(25.0,
                                                                               75.0),
                                                               with_centering=True,
                                                               with_scaling=True),
                                                  ['age', 'fnlwgt',
                                                   'hours-per-week']),
                                                 ('step2',
                                                  StandardScaler(copy=True,
                  

In [17]:
p.predict(X_test)

array([' <=50K', ' <=50K', ' <=50K', ..., ' >50K', ' <=50K', ' <=50K'],
      dtype=object)

In [18]:
p.score(X_test, y_test)

0.8073926736283773

In [19]:
p.named_steps

{'coltf_step': ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                   transformer_weights=None,
                   transformers=[('step1',
                                  RobustScaler(copy=True,
                                               quantile_range=(25.0, 75.0),
                                               with_centering=True,
                                               with_scaling=True),
                                  ['age', 'fnlwgt', 'hours-per-week']),
                                 ('step2',
                                  StandardScaler(copy=True, with_mean=True,
                                                 with_std=True),
                                  ['capital-gain', 'capital-loss',
                                   'education-num']),
                                 ('step3',
                                  OneHotEncoder(categories='auto', drop=None,
                                                dtype=<clas

### Pipeline Use Case-2 - Without an 'Estimator' as Final step!

In [20]:
p1 = Pipeline([
    ('coltf_step', ct),
    ('minmax', MinMaxScaler())
#     ('model', DecisionTreeClassifier()),
])

In [21]:
p1.fit(X_train)

Pipeline(memory=None,
         steps=[('coltf_step',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('step1',
                                                  RobustScaler(copy=True,
                                                               quantile_range=(25.0,
                                                                               75.0),
                                                               with_centering=True,
                                                               with_scaling=True),
                                                  ['age', 'fnlwgt',
                                                   'hours-per-week']),
                                                 ('step2',
                                                  StandardScaler(copy=True,
                  

In [22]:
p1.transform(X_test)

array([[0.36986301, 0.04628617, 0.39795918, ..., 1.        , 0.        ,
        0.        ],
       [0.05479452, 0.1987476 , 0.19387755, ..., 1.        , 0.        ,
        0.        ],
       [0.26027397, 0.11716417, 0.39795918, ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.38356164, 0.17738365, 0.60204082, ..., 1.        , 0.        ,
        0.        ],
       [0.32876712, 0.23260631, 0.65306122, ..., 1.        , 0.        ,
        0.        ],
       [0.10958904, 0.09400613, 0.39795918, ..., 0.        , 0.        ,
        0.        ]])