# Column Transformer 


## Columntransformer allows you to apply your transformers to the columns you have selected.

In [41]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
import seaborn as sns

In [42]:
titanic=sns.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [43]:
y=titanic['survived']
x=titanic.drop('survived',axis=1)

# StandardScaler on the columns selected 

### We have decided to apply the transformations only to the “age” column and the “price” column.

In [44]:
from sklearn.compose import make_column_transformer
transformer=make_column_transformer((StandardScaler(),['age','fare']))

###  We can see the new scaling as follow.

In [45]:
transformer.fit_transform(x)

array([[-0.53037664, -0.50244517],
       [ 0.57183099,  0.78684529],
       [-0.25482473, -0.48885426],
       ...,
       [        nan, -0.17626324],
       [-0.25482473, -0.04438104],
       [ 0.15850313, -0.49237783]])

# Categorical and Numerical Pipeline in series


In [143]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder , Binarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.impute import SimpleImputer
import seaborn as sns

In [144]:
titanic = sns.load_dataset('titanic')
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [145]:
y = titanic['survived']
X = titanic.drop('survived', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


## Method 1 : We choose our columns

In [146]:
numerical_features = ['pclass', 'age', 'fare']
categorical_features = ['sex', 'deck', 'alone']


### Create a pipeline for categorical functionalities 
### Create pipeline for numerical functionalities

In [150]:
numerical_pipeline = make_pipeline(SimpleImputer(missing_values=np.nan,strategy='mean'), StandardScaler())
categorical_pipeline = make_pipeline(SimpleImputer(missing_values=np.nan,strategy='most_frequent'), OneHotEncoder())

In [151]:
preprocessor = make_column_transformer((numerical_pipeline, numerical_features),
                                   (categorical_pipeline, categorical_features))

In [152]:

model = make_pipeline(preprocessor, SGDClassifier())

model.fit(X_train, y_train)
model.score(X_test, y_test)

0.7937219730941704

## Method 2 : We use a colomn Selector by type
- make_column_transformer

In [153]:
titanic = sns.load_dataset('titanic')
titanic
y = titanic['survived']
X = titanic.drop('survived', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

###  dtype_include=np.number will choose only numerical columns.
### dtype_exclude=np.number will exclude all numerical columns and so we left with only categorical columns.

In [154]:
from sklearn.compose import make_column_selector


numerical_features =make_column_selector(dtype_include=np.number)
categorical_features = make_column_selector(dtype_exclude=np.number)

In [155]:
preprocessor = make_column_transformer((numerical_pipeline, numerical_features),
                                   (categorical_pipeline, categorical_features))

In [156]:
model = make_pipeline(preprocessor, SGDClassifier())

model.fit(X_train, y_train)
model.score(X_test, y_test)

1.0

# Categorical and Numerical Pipeline in parallel 
# Use Two transformation in one parallel pipeline.
 
- make_union () allows you to create parallel pipelines.
- Results are concatenated at the exit of the transformer.
- One table with de Standard Scaler Transformation and one table with the Binary Transformation .
- Both Transformation method will giving us a table with Age and Fare. 
- We did the transformation in parralel so tables will concatenate.


In [173]:
titanic = sns.load_dataset('titanic')
titanic
y = titanic['survived']
X = titanic.drop('survived', axis=1)

X.dropna(inplace=True)

numerical_features =X[['age', 'fare']]
numerical_features



Unnamed: 0,age,fare
1,38.0,71.2833
3,35.0,53.1000
6,54.0,51.8625
10,4.0,16.7000
11,58.0,26.5500
...,...,...
871,47.0,52.5542
872,33.0,5.0000
879,56.0,83.1583
887,19.0,30.0000


In [174]:
from sklearn.pipeline import make_union

In [178]:
pipeline=make_union(StandardScaler(),Binarizer())

In [179]:
# Both Transformation method gave us a table with age and price.

# We did the transformation in parallel so the tables so here are the two concatenated tables.

# Age, price, age, price

#Two fisrt Age, Fare, with StandardScaler () and the other two with Binarizer ().

pipeline.fit_transform(numerical_features)

array([[ 1.52081957e-01, -1.00110129e-01,  1.00000000e+00,
         1.00000000e+00],
       [-3.98750185e-02, -3.38484771e-01,  1.00000000e+00,
         1.00000000e+00],
       [ 1.17585249e+00, -3.54707823e-01,  1.00000000e+00,
         1.00000000e+00],
       [-2.02343043e+00, -8.15671915e-01,  1.00000000e+00,
         1.00000000e+00],
       [ 1.43179512e+00, -6.86542977e-01,  1.00000000e+00,
         1.00000000e+00],
       [-1.03860677e-01, -8.64177202e-01,  1.00000000e+00,
         1.00000000e+00],
       [-4.87774627e-01, -5.69212621e-01,  1.00000000e+00,
         1.00000000e+00],
       [-1.06364555e+00,  2.41320703e+00,  1.00000000e+00,
         1.00000000e+00],
       [ 8.55924198e-01, -2.87169016e-02,  1.00000000e+00,
         1.00000000e+00],
       [ 1.87969473e+00, -2.22082571e-01,  1.00000000e+00,
         1.00000000e+00],
       [ 5.99981565e-01,  5.97174125e-02,  1.00000000e+00,
         1.00000000e+00],
       [-4.23788969e-01, -8.96951044e-01,  1.00000000e+00,
      

In [180]:
pipeline.fit_transform(numerical_features).shape

(182, 4)