In [77]:
import numpy as np
import pandas as pd
from typing import Optional, Union

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

In [6]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [143]:
class CabinSplitter(TransformerMixin, BaseEstimator):
    def __init__(self):
        return
    
    def fit(self, X: np.ndarray, y: Optional[np.ndarray]=None):
        return self
    
    def transform(self, X: Union[np.ndarray, pd.DataFrame, pd.Series]):
        if isinstance(X, pd.DataFrame):
            s = X.iloc[:, 0]
        elif isinstance(X, np.ndarray):
            s = pd.Series(X.ravel())
        else:
            s = pd.Series(X)

        s = s.astype(str)
        parts = s.str.split("/", n=2, expand=True)  # (n, 3)
        return parts.iloc[:, [0, 2]]
    def get_feature_names_out(self, input_features=None):
        return np.array(["cabin_1", "cabin_3"])


class PassengerIdSplitter(TransformerMixin, BaseEstimator):
    def __init__(self):
        return
    
    def fit(self, X: np.ndarray, y: Optional[np.ndarray]=None):
        return self
    
    def transform(self, X: Union[np.ndarray, pd.DataFrame, pd.Series]):
        if isinstance(X, pd.DataFrame):
            s = X.iloc[:, 0]
        elif isinstance(X, np.ndarray):
            s = pd.Series(X.ravel())
        else:
            s = pd.Series(X)

        s = s.astype(str)
        parts = s.str.split("_", n=1, expand=True)  # (n, 3)
        print(parts)
        return parts.values

class TotalSpend(TransformerMixin, BaseEstimator):
    def __init__(self):
        return
    
    def fit(self, X: np.ndarray, y: Optional[np.ndarray]=None):
        return self
    
    def transform(self, X: Union[np.ndarray, pd.DataFrame, pd.Series]):
        s: pd.DataFrame
        if isinstance(X, pd.DataFrame):
            s = X
        elif isinstance(X, np.ndarray):
            s = pd.DataFrame(X)
        else:
            s = pd.DataFrame(X)
        return s.sum(axis=1).to_numpy().reshape(-1, 1)


num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
bin_cols = ['VIP', 'CryoSleep']              # строковые True/False/Yes/No/0/1 и т.п.
cat_cols = ['HomePlanet', 'Destination']     # обычные категориальные
cabin_col = ['Cabin']
passanger_id_col = ['PassengerId']

num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

bin_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', drop='if_binary')),
])

cat_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

cabin_pipe = Pipeline(steps=[
    ('split', CabinSplitter()),
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

passanger_id_pipe = Pipeline(steps=[
    ('split', PassengerIdSplitter()),
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

total_spend_pipe = Pipeline(steps=[
    ('calc', TotalSpend()),
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

preprocess = ColumnTransformer(
    transformers=[
        ('num',   num_pipe,  num_cols),
        ('bin',   bin_pipe,  bin_cols),
        ('cat',   cat_pipe,  cat_cols),
        ('cabin', cabin_pipe, cabin_col),
        # ('passanger_id', passanger_id_pipe, passanger_id_col),
        ('total_spend', total_spend_pipe, ["ShoppingMall", "FoodCourt", "RoomService", "Spa", "VRDeck"]),
    ],
    sparse_threshold=0.0,
    remainder='drop'
)

clf = Pipeline(steps=[
		('prep', preprocess),
        # ('model', LogisticRegression(max_iter=200))
    ])
pd.DataFrame(clf.fit_transform(X))


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,0.711945,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.514066
1,-0.334037,-0.168073,-0.275387,-0.241771,0.217158,-0.224205,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.251479
2,2.036857,-0.268001,1.959998,-0.283579,5.695623,-0.219796,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.190333
3,0.293552,-0.333105,0.523010,0.336851,2.687176,-0.092818,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.332604
4,-0.891895,0.125652,-0.237159,-0.031059,0.231374,-0.261240,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.124824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.851410,-0.333105,3.992336,-0.283579,1.189173,-0.197751,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.531369
8689,-0.752431,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.514066
8690,-0.194573,-0.333105,-0.281027,2.846999,-0.269737,-0.263003,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.154175
8691,0.223820,-0.333105,0.376365,-0.283579,0.043013,2.589576,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.140302


In [153]:
y = df_train['Transported'].astype(int)
X = df_train.drop(columns=['Transported'])
kf = KFold(n_splits=4)

scores = []
for i, (train_index, test_index) in enumerate(kf.split(X)):
	x_train = X.iloc[train_index]
	y_train = y.iloc[train_index]
	x_test = X.iloc[test_index]
	y_test = y.iloc[test_index]

	clf = Pipeline(steps=[
		('prep', preprocess),
		('model', LogisticRegression(max_iter=200))
    ])
	clf.fit(x_train, y_train)
	y_pred = clf.predict(x_test)
	# y_proba = clf.predict_proba(x_test)[:, 1]
	scores.append(f1_score(y_test, y_pred))
scores

[0.7659380692167578,
 0.8108108108108109,
 0.7930283224400871,
 0.8032861706983113]

In [154]:
from sklearn.ensemble import GradientBoostingClassifier


y = df_train['Transported'].astype(int)
X = df_train.drop(columns=['Transported'])
kf = KFold(n_splits=4)

scores = []
for i, (train_index, test_index) in enumerate(kf.split(X)):
	x_train = X.iloc[train_index]
	y_train = y.iloc[train_index]
	x_test = X.iloc[test_index]
	y_test = y.iloc[test_index]

	clf = Pipeline(steps=[
		('prep', preprocess),
		('model', GradientBoostingClassifier())
    ])
	clf.fit(x_train, y_train)
	y_pred = clf.predict(x_test)
	# y_proba = clf.predict_proba(x_test)[:, 1]
	scores.append(f1_score(y_test, y_pred))
scores

[0.7769718948322756,
 0.8117081695063346,
 0.8113695090439277,
 0.8136363636363636]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.711945,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,-0.334037,-0.168073,-0.275387,-0.241771,0.217158,-0.224205,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,2.036857,-0.268001,1.959998,-0.283579,5.695623,-0.219796,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.293552,-0.333105,0.523010,0.336851,2.687176,-0.092818,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,-0.891895,0.125652,-0.237159,-0.031059,0.231374,-0.261240,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.851410,-0.333105,3.992336,-0.283579,1.189173,-0.197751,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
8689,-0.752431,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
8690,-0.194573,-0.333105,-0.281027,2.846999,-0.269737,-0.263003,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
8691,0.223820,-0.333105,0.376365,-0.283579,0.043013,2.589576,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
