# In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.model_selection import RepeatedKFold,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Load the dataset and evaluate a target-encoding + logistic-regression
# pipeline with repeated k-fold cross-validation.
df = pd.read_csv("spaceship.csv")

# The PassengerId and Name columns are not needed, so drop them.
df = df.loc[:, ~df.columns.isin(["Name", "PassengerId"])]

# The target "Transported" is boolean; convert it to int.
df["Transported"] = df["Transported"].astype("int")

# Split into feature matrix and target vector.
y = df["Transported"]
X = df.drop(columns="Transported")

# Partition the feature columns by dtype: numeric vs. everything else.
num_cols = X.select_dtypes(include=np.number).columns.tolist()
cat_cols = X.select_dtypes(exclude=np.number).columns.tolist()

# Missing values: mode for non-numeric columns, median for numeric columns.
cat_imputer = SimpleImputer(strategy="most_frequent")
num_imputer = SimpleImputer(strategy="median")

# Emit a DataFrame rather than an ndarray from the imputation step
# (presumably so the downstream encoder can pick out the categorical
# columns by dtype -- TODO confirm).
ct = ColumnTransformer(
    [
        ("cat_imputer", cat_imputer, cat_cols),
        ("num_imputer", num_imputer, num_cols),
    ]
).set_output(transform="pandas")

# Repeated k-fold: k splits, repeated n_repeats times, with a fixed seed.
k = 5
n_repeats = 3
cv = RepeatedKFold(n_splits=k, n_repeats=n_repeats, random_state=0)

# Impute -> target-encode -> standardize -> logistic regression.
# Every step lives inside the pipeline, so each is refit on the training
# portion of every fold during cross-validation.
pipe_te = Pipeline(
    steps=[
        ("ct", ct),
        ("encoder", TargetEncoder()),
        ("scaler", StandardScaler()),
        ("model", LogisticRegression()),
    ]
)

# Mean accuracy across all k * n_repeats folds.
score = cross_val_score(pipe_te, X, y, cv=cv, scoring="accuracy").mean()