In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [3]:
# Load the raw income dataset and preview its first rows.
# NOTE(review): the path is relative, so 'income.csv' has to sit in the
# notebook's working directory for this cell to run.
df = pd.read_csv('income.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Count the missing values in every column.
df.isna().sum()

age                0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
 income            0
dtype: int64

In [5]:
# Dataset dimensions as (rows, columns).
df.shape

(32561, 15)

In [6]:
# Remove surrounding whitespace from every column label, then show the
# cleaned header.
df = df.rename(columns=str.strip)
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

# Train/test split


In [7]:
# Separate the predictors from the 'income' target, then hold out 20% of the
# rows for testing. The fixed random_state makes the split repeatable.
features = df.drop(columns='income')
target = df['income']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

# Step 1: Scale the numerical columns

# Step 2: One-hot encode the categorical columns

In [8]:
# Print each column's dtype and non-null count; the dtypes are what the next
# cells use to tell numeric columns from text ones.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [9]:
# Names of the numeric feature columns, in their original column order.
# Selecting the "number" dtype family (rather than "anything that is not
# object") keeps category, boolean, datetime or dedicated string columns from
# being mistaken for numeric features and handed to the scaler.
num_cols = X_train.select_dtypes(include='number').columns.tolist()
num_cols

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [10]:
# Names of the object-dtype (text) feature columns, in their original order.
cat_cols = [
    name
    for name, dtype in X_train.dtypes.items()
    if dtype == 'object'
]
cat_cols

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [11]:
# Peek at the numeric training features before they are scaled.
X_train.loc[:, num_cols].head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
15282,36,174308,7,0,0,40
24870,35,198202,9,0,0,54
18822,38,52963,13,0,0,50
26404,50,138270,9,0,0,40
7842,68,116903,11,0,2149,40


In [12]:
# Fit the scaler on the training rows only, so nothing from the test split
# influences the learned scaling statistics.
r = RobustScaler().fit(X_train[num_cols])
r

RobustScaler()

In [13]:
# Scale the numeric training features with the already-fitted scaler.
X_train_num_scaled = r.transform(X_train[num_cols])

In [14]:
# Apply the same train-fitted scaler to the test features (transform only,
# no refit on test data).
X_test_num_scaled = r.transform(X_test[num_cols])

In [15]:
# Categorical columns: preview them, then one-hot encode

In [16]:
# Peek at the categorical training features before they are encoded.
X_train.loc[:, cat_cols].head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
15282,Private,11th,Divorced,Transport-moving,Not-in-family,White,Male,United-States
24870,Private,HS-grad,Never-married,Exec-managerial,Not-in-family,White,Female,United-States
18822,Private,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Female,United-States
26404,Private,HS-grad,Married-civ-spouse,Sales,Wife,Black,Female,United-States
7842,Self-emp-not-inc,Assoc-voc,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States


In [17]:
# Dense one-hot encoder fitted on the training categories only. A category
# first seen at transform time is encoded as all zeros for that feature
# (handle_unknown='ignore') instead of raising an error.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current spelling first and fall back for older installs.
try:
    o = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    o = OneHotEncoder(sparse=False, handle_unknown='ignore')
o.fit(X_train[cat_cols])

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [18]:
# One-hot encode the categorical training features with the fitted encoder.
# (The variable name's spelling is kept because a later cell refers to it.)
X_train_cat_ecoded = o.transform(X_train[cat_cols])

In [19]:
# Encode the test features with the same train-fitted encoder (no refit).
X_test_cat_ecoded = o.transform(X_test[cat_cols])

In [20]:
# Place the scaled numeric block and the one-hot block side by side
# (column-wise) and show the combined training matrix as a DataFrame.
train_combined = np.hstack([X_train_num_scaled, X_train_cat_ecoded])
pd.DataFrame(train_combined)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,107
0,-0.052632,-0.030971,-1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-0.105263,0.169550,-0.333333,0.0,0.0,2.8,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.052632,-1.049314,1.000000,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.684211,-0.333407,-0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.631579,-0.512721,0.333333,0.0,2149.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26043,0.315789,-1.214069,1.333333,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
26044,0.368421,-0.667447,-1.333333,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
26045,-0.736842,-1.113480,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
26046,0.421053,0.317755,2.000000,7688.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Enter ColumnTransformer: scaling and encoding in one step

In [21]:
# Show the raw frame again as a reminder of the columns before building the
# combined transformer.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [22]:
# Compare the text label with its numeric counterpart. In the rows displayed,
# each education label pairs with a single education-num value
# (e.g. Bachelors with 13).
df[['education','education-num']].head()

Unnamed: 0,education,education-num
0,Bachelors,13
1,Bachelors,13
2,HS-grad,9
3,11th,7
4,Bachelors,13


In [23]:
# Recall the categorical column names collected earlier.
cat_cols

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [42]:
# One transformer that applies both preprocessing steps:
#   step1 - robust-scale the numeric columns;
#   step2 - one-hot encode every categorical column except 'education'.
# NOTE(review): 'education' is not listed in any step, so
# remainder='passthrough' forwards it untouched as text and the transformed
# array comes out with dtype=object. Before feeding this to an estimator,
# either encode 'education' or drop it (education-num looks like its numeric
# counterpart - confirm).
onehot_cols = [col for col in cat_cols if col != 'education']

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current spelling first and fall back for older installs.
try:
    onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    onehot = OneHotEncoder(sparse=False, handle_unknown='ignore')

ct = ColumnTransformer([
    ('step1', RobustScaler(), num_cols),
    ('step2', onehot, onehot_cols),
], remainder='passthrough')

In [43]:
# Fit every step of the column transformer on the training features only.
ct.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('step1', RobustScaler(),
                                 ['age', 'fnlwgt', 'education-num',
                                  'capital-gain', 'capital-loss',
                                  'hours-per-week']),
                                ('step2',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 ['workclass', 'marital-status', 'occupation',
                                  'relationship', 'race', 'sex',
                                  'native-country'])])

In [47]:
# Transform the training features. The result is an object-dtype array because
# the passed-through 'education' strings sit next to the numeric columns.
ct.transform(X_train)

array([[-0.05263157894736842, -0.030971158344820063, -1.0, ..., 0.0, 0.0,
        ' 11th'],
       [-0.10526315789473684, 0.1695504125781255, -0.3333333333333333,
        ..., 0.0, 0.0, ' HS-grad'],
       [0.05263157894736842, -1.0493142580202544, 1.0, ..., 0.0, 0.0,
        ' Bachelors'],
       ...,
       [-0.7368421052631579, -1.113480489345141, 0.0, ..., 0.0, 0.0,
        ' Some-college'],
       [0.42105263157894735, 0.3177554407232338, 2.0, ..., 0.0, 0.0,
        ' Doctorate'],
       [-0.631578947368421, 0.07491235468501187, 0.0, ..., 0.0, 0.0,
        ' Some-college']], dtype=object)

In [48]:
# Transform the test features with the train-fitted transformer (no refit).
ct.transform(X_test)

array([[-0.5263157894736842, -0.007380878949808764, 0.0, ..., 0.0, 0.0,
        ' Some-college'],
       [-0.5263157894736842, 0.32295016962594175, 1.0, ..., 0.0, 0.0,
        ' Bachelors'],
       [-0.631578947368421, 0.6568059130952906, 0.6666666666666666, ...,
        0.0, 0.0, ' Assoc-acdm'],
       ...,
       [0.3684210526315789, -0.0922588888399348, 1.0, ..., 0.0, 0.0,
        ' Bachelors'],
       [0.8421052631578947, 0.6708795162775865, -0.3333333333333333, ...,
        0.0, 0.0, ' HS-grad'],
       [0.5263157894736842, -0.4856316232269001, -0.3333333333333333,
        ..., 0.0, 0.0, ' HS-grad']], dtype=object)

In [49]:
# Wrap the transformed training array in a DataFrame for easier inspection;
# the last column holds the passed-through 'education' text.
pd.DataFrame(ct.transform(X_train))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,83,84,85,86,87,88,89,90,91,92
0,-0.052632,-0.030971,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,11th
1,-0.105263,0.16955,-0.333333,0.0,0.0,2.8,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,HS-grad
2,0.052632,-1.049314,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Bachelors
3,0.684211,-0.333407,-0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,HS-grad
4,1.631579,-0.512721,0.333333,0.0,2149.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Assoc-voc
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26043,0.315789,-1.214069,1.333333,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Masters
26044,0.368421,-0.667447,-1.333333,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,10th
26045,-0.736842,-1.11348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Some-college
26046,0.421053,0.317755,2.0,7688.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Doctorate


In [50]:
# List the fitted steps. The extra 'remainder' entry reports the columns that
# no step claimed, by position (index 3 is 'education').
ct.transformers_

[('step1',
  RobustScaler(),
  ['age',
   'fnlwgt',
   'education-num',
   'capital-gain',
   'capital-loss',
   'hours-per-week']),
 ('step2',
  OneHotEncoder(handle_unknown='ignore', sparse=False),
  ['workclass',
   'marital-status',
   'occupation',
   'relationship',
   'race',
   'sex',
   'native-country']),
 ('remainder', 'passthrough', [3])]