In [3]:
import pandas as pd

from sklearn import set_config
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler, PowerTransformer, OrdinalEncoder

set_config(display='diagram')

In [4]:
DF_PATH = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'

df = pd.read_csv(DF_PATH, index_col='PassengerId')
df['personal_titles'] = df['Name'].str.extract('.*, (.+?)\.', expand=False)
df = df.drop(columns=['Name', 'Ticket', 'Cabin']).dropna()

df_features = df.drop(columns='Survived')
target = df['Survived']

df_features.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,personal_titles
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,male,22.0,1,0,7.25,S,Mr
2,1,female,38.0,1,0,71.2833,C,Mrs
3,3,female,26.0,0,0,7.925,S,Miss
4,1,female,35.0,1,0,53.1,S,Mrs
5,3,male,35.0,0,0,8.05,S,Mr


In [10]:
numerical_features = df_features.select_dtypes('number').columns
categorical_features = df_features.select_dtypes(exclude='number').columns
print(numerical_features, categorical_features, sep='\n')

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')
Index(['Sex', 'Embarked', 'personal_titles'], dtype='object')


In [13]:
preprocessing_pipeline = make_column_transformer(
  [StandardScaler(), numerical_features],
  [OrdinalEncoder(), categorical_features]
)
preprocessing_pipeline

In [14]:
preprocessing_pipeline.fit(df_features)

In [15]:
pd.DataFrame(
  data=preprocessing_pipeline.transform(df_features),
  columns=preprocessing_pipeline.get_feature_names_out()
)

Unnamed: 0,standardscaler__Pclass,standardscaler__Age,standardscaler__SibSp,standardscaler__Parch,standardscaler__Fare,ordinalencoder__Sex,ordinalencoder__Embarked,ordinalencoder__personal_titles
0,0.908600,-0.527669,0.522511,-0.506787,-0.516380,1.0,2.0,11.0
1,-1.482983,0.577094,0.522511,-0.506787,0.694046,0.0,0.0,12.0
2,0.908600,-0.251478,-0.552714,-0.506787,-0.503620,0.0,2.0,8.0
3,-1.482983,0.369951,0.522511,-0.506787,0.350326,0.0,2.0,12.0
4,0.908600,0.369951,-0.552714,-0.506787,-0.501257,1.0,2.0,11.0
...,...,...,...,...,...,...,...,...
707,0.908600,0.646142,-0.552714,5.350885,-0.102875,0.0,1.0,12.0
708,-0.287191,-0.182430,-0.552714,-0.506787,-0.407687,1.0,2.0,14.0
709,-1.482983,-0.734812,-0.552714,-0.506787,-0.086335,0.0,2.0,8.0
710,-1.482983,-0.251478,-0.552714,-0.506787,-0.086335,1.0,0.0,11.0
