## Import

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    MinMaxScaler,
    RobustScaler,
)

from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config

from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ParameterGrid

from sklearn.metrics import accuracy_score, f1_score

## Data

In [2]:
# Load the "titanic" dataset (version 1) via OpenML as pandas objects:
# X holds the features, y the target.
X, y = fetch_openml(name="titanic", version=1, return_X_y=True, as_frame=True)
# Cast the target to an integer dtype so the metrics below work on numeric labels.
y = y.astype("int")

In [3]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [4]:
# Hold out 20% of the rows as a test set; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Pipeline

In [5]:
# Columns grouped by the preprocessing branch they are routed to.
numeric_features = ["age", "sibsp", "parch", "fare"]
categorical_features = ["sex", "pclass"]

# Numeric branch: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: create dummy (one-hot) variables.
categorical_transformer = Pipeline(
    [
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Apply each branch to its own set of columns.
preprocessor = ColumnTransformer(
    [
        ("num_transform", numeric_transformer, numeric_features),
        ("cat_transform", categorical_transformer, categorical_features),
    ]
)

# Full pipeline: preprocessing followed by a logistic-regression classifier.
# The step names ("preprocessor", "classifier", ...) are what parameter
# search keys refer to, so they must stay stable.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)

# Render the (unfitted) pipeline as a diagram in the cell output.
set_config(display="diagram")
pipeline

## Learn & Predict

In [6]:
# Switch estimator display back to the plain-text repr before fitting.
# "text" and "diagram" are the documented values for `display`; the previous
# value "None" (a string) only produced text output because it was not
# equal to "diagram".
set_config(display="text")

# Fit the preprocessing steps and the classifier on the training split.
# The fitted pipeline is the cell's last expression, so its repr is shown.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num_transform',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'sibsp', 'parch',
                                                   'fare']),
                                                 ('cat_transform',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['sex', 'pclass'])])),
                ('classifier', LogisticRegression())])

In [7]:
# Predict on the held-out test split with the fitted baseline pipeline and
# report accuracy and F1 against the true labels.
y_test_pred = pipeline.predict(X_test)
print(f"accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"F1_score: {f1_score(y_test, y_test_pred)}")

accuracy: 0.7977099236641222
F1_score: 0.7039106145251396


## Grid search

In [8]:
# Search space for the grid search.
# Nested parameters are addressed as "<step name>__<parameter>": there must be
# two underscores between an estimator's name and its parameters, at every
# level of nesting.
param_grid = [
    {
        # Classifier step and its regularization parameter C.
        "classifier": [LogisticRegression()],
        "classifier__C": [0.1, 1.0, 10.0, 100.0],
        # Imputation strategy used in the numeric branch.
        "preprocessor__num_transform__imputer__strategy": ["mean", "median"],
        # The whole "scaler" step is swapped for each candidate scaler.
        "preprocessor__num_transform__scaler": [
            StandardScaler(),
            MinMaxScaler(),
            RobustScaler(),
        ],
    }
]

# 10-fold cross-validated search over the grid, using all available cores
# (n_jobs=-1) and verbose progress output.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    verbose=3,
    n_jobs=-1,
)

In [9]:
# Run the cross-validated search on the training split only; the test split
# is kept aside for the final evaluation.
grid_search.fit(X=X_train, y=y_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num_transform',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['age',
                                                                          'sibsp',
                                                                          'parch',
                                                                          'fare']),
                                                                        ('cat_trans

In [10]:
# Show the best parameter combination found by the search, followed by its
# best cross-validation score, one per line.
print(grid_search.best_params_, grid_search.best_score_, sep="\n")

{'classifier': LogisticRegression(C=0.1), 'classifier__C': 0.1, 'preprocessor__num_transform__imputer__strategy': 'median', 'preprocessor__num_transform__scaler': StandardScaler()}
0.7918406593406593


In [11]:
# Predict on the held-out test split through the fitted search object and
# report the same two metrics as for the baseline pipeline.
y_test_pred_gs = grid_search.predict(X_test)
print(f"accuracy: {accuracy_score(y_test, y_test_pred_gs)}")
print(f"f1_score: {f1_score(y_test, y_test_pred_gs)}")

accuracy: 0.7900763358778626
f1_score: 0.6892655367231638


## References
- https://www.salesanalytics.co.jp/datascience/datascience007/#i-14