## Import

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config

from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ParameterGrid

from sklearn.metrics import accuracy_score, f1_score

## Data

In [2]:
# Fetch version 1 of the OpenML 'titanic' dataset as a pandas feature frame (X) and target (y).
X, y = fetch_openml(name='titanic', version=1, return_X_y=True, as_frame=True)
# Cast the target to integers (presumably it arrives as non-integer class labels — confirm).
y = y.astype('int')

In [3]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [4]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Pipeline

In [5]:
# Feature columns, grouped by the preprocessing they receive.
num_cols = ['age', 'sibsp', 'parch', 'fare']
cat_cols = ['sex', 'pclass']

# Numerical preprocessing: fill missing values with the column mean, then standardize.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical preprocessing: one-hot (dummy-variable) encoding.
# handle_unknown='ignore' is set so categories not seen during fit do not raise at transform time.
cat_transformer = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer; only the columns listed above are used.
# NOTE(review): the step name 'num_transformr' is misspelled. It is kept as-is because
# the name is part of the nested parameter path (preprocessor__num_transformr__...),
# so renaming it would break any code that refers to it by name.
preprocessor = ColumnTransformer([
    ('num_transformr', num_transformer, num_cols),
    ('cat_transformer', cat_transformer, cat_cols),
])

# Full pipeline: preprocessing followed by a logistic-regression classifier.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

# Switch estimator display to the HTML diagram and show the pipeline as the cell output.
set_config(display='diagram')
pipeline

## Learning & Prediction

In [6]:
# Training
# Switch estimator display back to plain text before showing the fitted pipeline.
# The documented values for `display` are 'text' and 'diagram'; the string 'None'
# is not one of them and only produced text output because it is not 'diagram'.
set_config(display='text')
# Fit the preprocessing steps and the classifier on the training split; the fitted
# pipeline is the last expression, so it is shown as the cell output.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num_transformr',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'sibsp', 'parch',
                                                   'fare']),
                                                 ('cat_transformer',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['sex', 'pclass'])])),
                ('classifier', LogisticRegression())])

In [7]:
# Prediction: label the held-out test rows with the fitted pipeline, then report the F1 score.
y_test_pred = pipeline.predict(X_test)
print(f'F1_score: {f1_score(y_true=y_test, y_pred=y_test_pred)}')

F1_score: 0.7039106145251396
