## scikit-learn tip #1

Use ColumnTransformer to apply different preprocessing to different columns:

- select from DataFrame columns by name
- pass through or drop (the default) any columns you don't specify

In [1]:
import pandas as pd

In [2]:
# Fetch the CSV behind the short link, keeping only the first six rows:
# a tiny sample keeps every intermediate result small enough to read in full.
df = pd.read_csv(
    'http://bit.ly/kaggletrain',
    nrows=6,
)

In [3]:
# Preview the first five rows of the loaded sample to see which columns exist.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Features used in this tip:
#   Fare            - numeric, needs no preprocessing
#   Embarked, Sex   - categorical, need encoding
#   Age             - numeric, has a missing value in this sample
cols = ['Fare', 'Embarked', 'Sex', 'Age']
X = df.loc[:, cols]

In [5]:
# Show the selected features; note the missing Age in the last row (index 5).
X

Unnamed: 0,Fare,Embarked,Sex,Age
0,7.25,S,male,22.0
1,71.2833,C,female,38.0
2,7.925,S,female,26.0
3,53.1,S,female,35.0
4,8.05,S,male,35.0
5,8.4583,Q,male,


In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer

In [7]:
# Default one-hot encoder for the categorical columns. The imputer's strategy
# is spelled out (it is the library default) so the fill value is easy to trace.
ohe = OneHotEncoder()
imp = SimpleImputer(strategy='mean')

In [8]:
# Route each group of columns to its own transformer. Columns not listed in
# any (transformer, columns) pair are governed by `remainder`.
ct = make_column_transformer(
    (ohe, ['Embarked', 'Sex']),  # one-hot encode the categorical columns
    (imp, ['Age']),              # fill in the missing Age value
    remainder='passthrough',     # keep the remaining column (Fare) unchanged
)

In [9]:
# Output columns follow the order of the transformers, with passthrough
# columns appended last:
#   Embarked (3 columns: C, Q, S), Sex (2 columns: female, male),
#   Age (1 column), Fare (1 column)
# The missing Age in the last row is filled with 31.2, the mean of the five
# observed ages.
ct.fit_transform(X)

array([[ 0.    ,  0.    ,  1.    ,  0.    ,  1.    , 22.    ,  7.25  ],
       [ 1.    ,  0.    ,  0.    ,  1.    ,  0.    , 38.    , 71.2833],
       [ 0.    ,  0.    ,  1.    ,  1.    ,  0.    , 26.    ,  7.925 ],
       [ 0.    ,  0.    ,  1.    ,  1.    ,  0.    , 35.    , 53.1   ],
       [ 0.    ,  0.    ,  1.    ,  0.    ,  1.    , 35.    ,  8.05  ],
       [ 0.    ,  1.    ,  0.    ,  0.    ,  1.    , 31.2   ,  8.4583]])