In [57]:
import numpy as np
import pandas as pd
from typing import Optional, Union

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

In [6]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [18]:
df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [None]:



class CabinSplitter(TransformerMixin, BaseEstimator):
    def __init__(self):
        return
    
    def fit(self, X: np.ndarray, y: Optional[np.ndarray]=None):
        return self
    
    def transform(self, X: Union[np.ndarray, pd.DataFrame, pd.Series]):
        if isinstance(X, pd.DataFrame):
            s = X.iloc[:, 0]
        elif isinstance(X, np.ndarray):
            s = pd.Series(X.ravel())
        else:
            s = pd.Series(X)

        s = s.astype(str)
        parts = s.str.split("/", n=2, expand=True)  # (n, 3)
        return parts.values

class PassengerIdSplitter(TransformerMixin, BaseEstimator):
    def __init__(self):
        return
    
    def fit(self, X: np.ndarray, y: Optional[np.ndarray]=None):
        return self
    
    def transform(self, X: Union[np.ndarray, pd.DataFrame, pd.Series]):
        if isinstance(X, pd.DataFrame):
            s = X.iloc[:, 0]
        elif isinstance(X, np.ndarray):
            s = pd.Series(X.ravel())
        else:
            s = pd.Series(X)

        s = s.astype(str)
        parts = s.str.split("_", n=1, expand=True)  # (n, 3)
        return parts.values

class TotalSpend(TransformerMixin, BaseEstimator):
    def __init__(self):
        return
    
    def fit(self, X: np.ndarray, y: Optional[np.ndarray]=None):
        return self
    
    def transform(self, X: Union[np.ndarray, pd.DataFrame, pd.Series]):
        s: pd.DataFrame
        if isinstance(X, pd.DataFrame):
            s = X
        elif isinstance(X, np.ndarray):
            s = pd.DataFrame(X)
        else:
            s = pd.DataFrame(X)
        return s.sum(axis=1).to_numpy().reshape(-1, 1)


num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
bin_cols = ['VIP', 'CryoSleep']              # строковые True/False/Yes/No/0/1 и т.п.
cat_cols = ['HomePlanet', 'Destination']     # обычные категориальные
cabin_col = ['Cabin']
passanger_id_col = ['PassengerId']

num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

bin_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    # drop='if_binary' оставит 1 столбец вместо 2 для строго бинарных,
    ('ohe', OneHotEncoder(handle_unknown='ignore', drop='if_binary')),
])

cat_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

cabin_pipe = Pipeline(steps=[
    ('split', CabinSplitter()),
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

passanger_id_pipe = Pipeline(steps=[
    ('split', CabinSplitter()),
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

total_spend_pipe = Pipeline(steps=[
    ('calc', TotalSpend()),
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

preprocess = ColumnTransformer(
    transformers=[
        ('num',   num_pipe,  num_cols),
        ('bin',   bin_pipe,  bin_cols),
        ('cat',   cat_pipe,  cat_cols),
        ('cabin', cabin_pipe, cabin_col),
        ('passanger_id', passanger_id_pipe, passanger_id_col),
        ('total_spend', total_spend_pipe, ["ShoppingMall", "FoodCourt", "RoomService", "Spa", "VRDeck"]),
    ],
    remainder='drop'
)

In [61]:
y = df_train['Transported'].astype(int)
X = df_train.drop(columns=['Transported'])
kf = KFold(n_splits=4)

scores = []
for i, (train_index, test_index) in enumerate(kf.split(X)):
	x_train = X.iloc[train_index]
	y_train = y.iloc[train_index]
	x_test = X.iloc[test_index]
	y_test = y.iloc[test_index]

	clf = Pipeline(steps=[
		('prep', preprocess),
		('model', LogisticRegression(max_iter=200))
    ])
	clf.fit(x_train, y_train)
	y_pred = clf.predict(x_test)
	# y_proba = clf.predict_proba(x_test)[:, 1]
	scores.append(f1_score(y_test, y_pred))
scores

[0.7533302710151585,
 0.8021154693697664,
 0.7840560665790627,
 0.7985414767547858]

NotFittedError: Pipeline is not fitted yet.