In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Load the train and test splits from CSV files under ../data.
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [30]:
# Summarise column dtypes and non-null counts of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [None]:
def fillna(primary_df: pd.DataFrame, transform_df: pd.DataFrame) -> pd.DataFrame:
	"""Return a copy of ``transform_df`` with missing values filled.

	Fill values are computed from ``primary_df`` so that statistics learned on
	one frame (e.g. train) can be applied to another (e.g. test):

	- ``Age`` and the five spend columns use the column mean of ``primary_df``.
	- ``VIP`` and ``CryoSleep`` use the most frequent value of ``primary_df``.
	- ``Cabin`` becomes ``"Undefined/Undefined/Undefined"``; ``Destination``
	  and ``Name`` become ``"Undefined"``.
	- ``HomePlanet`` is not filled (the original fill was commented out).

	Args:
		primary_df: Frame the fill statistics are computed from.
		transform_df: Frame whose missing values are filled; it is not mutated.

	Returns:
		A new DataFrame with the fills applied.
	"""
	mean_filled_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
	mode_filled_cols = ["VIP", "CryoSleep"]

	fill_values = {col: primary_df[col].mean() for col in mean_filled_cols}
	fill_values.update(
		{col: primary_df[col].value_counts().idxmax() for col in mode_filled_cols}
	)
	fill_values.update(
		{
			"Cabin": "Undefined/Undefined/Undefined",
			"Destination": "Undefined",
			"Name": "Undefined",
		}
	)

	# One frame-level, non-inplace fillna: `df[col].fillna(..., inplace=True)`
	# mutates a temporary Series and does not update the frame under pandas
	# Copy-on-Write.
	return transform_df.fillna(value=fill_values)


def transform_features(df: pd.DataFrame) -> pd.DataFrame:
	"""Derive model features from an already-imputed passenger frame.

	Steps, in order:

	- Split ``Cabin`` on "/" into ``Cabin_1``, ``Cabin_2``, ``Cabin_3``
	  (kept as strings) and drop ``Cabin``.
	- One-hot encode ``HomePlanet`` and ``Destination`` as integer columns.
	- Cast ``VIP`` / ``CryoSleep`` to int, and ``Transported`` too when the
	  column is present (it is absent on the test split).
	- Add ``TotalSpend`` as the row-wise sum of the five spend columns.
	- Split ``PassengerId`` on "_" into integer ``GroupId`` / ``GroupSubId``.

	Args:
		df: Input frame; it is not mutated.

	Returns:
		A new DataFrame with the same index as ``df``.
	"""
	spent_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
	categorical_cols = ["HomePlanet", "Destination"]

	df_new = df.copy()

	# `.str.split(expand=True)` keeps the original index, so the concat below
	# stays row-aligned even when `df` does not have a default RangeIndex.
	# `reindex` guarantees three columns even if no value has all three parts.
	cabin_parts = (
		df_new["Cabin"]
		.astype(str)
		.str.split("/", n=2, expand=True)
		.reindex(columns=range(3))
	)
	cabin_parts.columns = ["Cabin_1", "Cabin_2", "Cabin_3"]
	df_new = pd.concat([df_new.drop(columns="Cabin"), cabin_parts], axis=1)

	df_new = pd.get_dummies(df_new, columns=categorical_cols, dtype=int)

	# The target only exists on the training split.
	if "Transported" in df_new.columns:
		df_new["Transported"] = df_new["Transported"].astype(int)
	df_new["VIP"] = df_new["VIP"].astype(int)
	df_new["CryoSleep"] = df_new["CryoSleep"].astype(int)

	df_new['TotalSpend'] = df_new[spent_cols].sum(axis=1)

	id_parts = df_new['PassengerId'].str.split('_', expand=True)
	df_new['GroupId'] = id_parts[0].astype(int)
	df_new['GroupSubId'] = id_parts[1].astype(int)
	return df_new

# Bind the engineered frame to a new name instead of overwriting `df_train`:
# the FeatureGenerator pipeline cell further down reads raw columns such as
# `Cabin`, `HomePlanet` and `Destination` from `df_train`, and re-running this
# cell on an already-transformed frame would fail because `Cabin` is dropped.
df_train_features = transform_features(fillna(df_train, df_train))

In [16]:
from sklearn.preprocessing import FunctionTransformer

def custom_fillna(X):
    """Return ``X`` with every missing value replaced by the string ``'NONE'``.

    ``X`` must expose a pandas-style ``fillna`` method (DataFrame or Series).
    """
    return X.fillna('NONE')

# Wrap custom_fillna as a scikit-learn transformer object.
custom_imputer = FunctionTransformer(custom_fillna)

In [18]:
# Display the training frame.
df_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [42]:
import numpy as np
import pandas as pd
from typing import List, Dict, Any

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression


# --- 1) Feature generator with imputation from train-set statistics ---
class FeatureGenerator(BaseEstimator, TransformerMixin):
    """
    Impute missing values and derive features from the raw passenger frame.

    Statistics are learned in ``fit`` and re-applied in ``transform``:

    - ``Age`` and the spend columns are filled with their fitted medians.
    - ``HomePlanet`` / ``Destination`` / ``VIP`` / ``CryoSleep`` are filled
      with their fitted modes.
    - ``Cabin`` is split into ``Cabin_1`` (deck), ``Cabin_2`` (num, int) and
      ``Cabin_3`` (side); ``Cabin`` itself is dropped.
    - ``PassengerId`` is split into integer ``GroupId`` / ``GroupSubId``.
    - ``TotalSpend`` is the row-wise sum of the (imputed) spend columns.
    - ``NameLen`` is the length of ``Name`` (missing names become "Undefined").
    - ``VIP`` / ``CryoSleep`` are encoded as 0/1.
    - ``Transported`` (the target) is dropped if it is present.
    """
    numeric_spend = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    cat_cols_basic = ['HomePlanet', 'Destination']
    bool_like = ['VIP', 'CryoSleep']

    @staticmethod
    def _mode_or_default(s: pd.Series, default="Missing"):
        """Return the first mode of ``s``, or ``default`` if it has no non-null values."""
        modes = s.mode(dropna=True)
        return default if modes.empty else modes.iloc[0]

    @staticmethod
    def _to01(s: pd.Series, fill) -> pd.Series:
        """Encode a bool-like column as int 0/1, imputing missing entries with ``fill``.

        Non-missing values whose lower-cased text is "true"/"false" map to 1/0;
        any other non-missing value maps to 0. Missing entries take the encoding
        of ``fill`` (0 when ``fill`` is itself neither true nor false).
        """
        lookup = {'true': 1, 'false': 0}
        encoded = s.astype(str).str.lower().map(lookup).fillna(0).astype(int)
        # Impute on the integer column rather than calling fillna on the object
        # column: filling object data with a bool triggers pandas' deprecated
        # silent downcast (the FutureWarning seen in this notebook's output).
        encoded.loc[s.isna()] = lookup.get(str(fill).lower(), 0)
        return encoded

    def fit(self, X: pd.DataFrame, y=None):
        """Learn imputation statistics from ``X``; ``y`` is ignored. Returns ``self``."""
        # Medians for numeric imputation.
        self.median_age_ = float(X['Age'].median())
        self.median_spend_ = {c: float(X[c].median()) for c in self.numeric_spend}

        # Modes for categorical / bool-like columns.
        self.mode_cat_ = {
            c: self._mode_or_default(X[c])
            for c in self.cat_cols_basic + self.bool_like
        }

        # Fixed defaults used when Cabin / Name are missing.
        self.def_cabin_ = "Undefined/0/Undefined"
        self.def_name_ = "Undefined"
        # Not read by transform(); kept so the fitted attribute remains available.
        self.def_dest_ = self.mode_cat_.get('Destination', 'Missing')

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a new frame with imputed values and derived features; ``X`` is not mutated."""
        df = X.copy()

        # --- Numeric imputation ---
        df['Age'] = df['Age'].fillna(self.median_age_)
        for c in self.numeric_spend:
            df[c] = df[c].fillna(self.median_spend_[c])

        # --- Categorical imputation ---
        for c in self.cat_cols_basic:
            df[c] = df[c].fillna(self.mode_cat_[c])

        # --- Bool-like columns: impute with the fitted mode and encode as 0/1 ---
        for c in self.bool_like:
            df[c] = self._to01(df[c], self.mode_cat_[c])

        # --- Cabin split ---
        # Example value: "B/45/P" -> deck/num/side.
        # reindex guarantees all three columns even when no value contains "/".
        cab = (
            df['Cabin']
            .fillna(self.def_cabin_)
            .astype(str)
            .str.split('/', n=2, expand=True)
            .reindex(columns=range(3))
            .rename(columns={0: 'Cabin_1', 1: 'Cabin_2', 2: 'Cabin_3'})
        )
        # Cabin_2 is numeric; the other two parts stay categorical.
        cab['Cabin_2'] = pd.to_numeric(cab['Cabin_2'], errors='coerce').fillna(0).astype(int)
        df = df.drop(columns=['Cabin']).join(cab)

        # --- PassengerId split ---
        # Example: "1234_02". Unparseable or absent parts become 0.
        pid = (
            df['PassengerId']
            .astype(str)
            .str.split('_', n=1, expand=True)
            .reindex(columns=range(2))
        )
        df['GroupId'] = pd.to_numeric(pid[0], errors='coerce').fillna(0).astype(int)
        df['GroupSubId'] = pd.to_numeric(pid[1], errors='coerce').fillna(0).astype(int)

        # --- TotalSpend ---
        df['TotalSpend'] = df[self.numeric_spend].sum(axis=1)

        # --- Name: fill and expose its length as a feature ---
        df['Name'] = df['Name'].fillna(self.def_name_)
        df['NameLen'] = df['Name'].astype(str).str.len()

        # HomePlanet/Destination are left as categorical text for later encoding.
        # Transported is the target; drop it from X if it is present.
        if 'Transported' in df.columns:
            df = df.drop(columns=['Transported'])

        return df


# --- 2) Feature groups as they exist after FeatureGenerator ---
num_cols = [
    'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'Cabin_2', 'GroupId', 'GroupSubId', 'TotalSpend', 'NameLen'
]
cat_cols = [
    'HomePlanet', 'Destination', 'Cabin_1', 'Cabin_3'
]
bin_cols = ['VIP', 'CryoSleep']  # already 0/1 after FeatureGenerator

# --- 3) Per-column-group pipelines ---
num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),   # just in case
    ('scale', StandardScaler())
])

cat_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

bin_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent'))
    # no scaling/encoding: the values are already 0/1
])

# Columns not listed in any group (e.g. PassengerId, Name) are dropped.
preprocess = ColumnTransformer(
    transformers=[
        ('num', num_pipe, num_cols),
        ('cat', cat_pipe, cat_cols),
        ('bin', bin_pipe, bin_cols),
    ],
    remainder='drop'
)

# --- 4) Final end-to-end Pipeline (feature generation -> preprocessing -> model) ---
clf = Pipeline(steps=[
    ('featgen', FeatureGenerator()),
    ('prep', preprocess),
    ('model', LogisticRegression(max_iter=200, n_jobs=None))  # can be swapped for XGB/LightGBM/RandomForest
])


# ===== Example usage =====
# Split into X, y
y = df_train['Transported'].astype(int)
X = df_train.drop(columns=['Transported'])

# Fit
clf.fit(X, y)

# Predictions
# NOTE(review): these are made on the same X the pipeline was just fitted on.
y_pred = clf.predict(X)
y_proba = clf.predict_proba(X)[:, 1]

  df[c] = df[c].fillna(self.mode_cat_[c])
  df[c] = df[c].fillna(self.mode_cat_[c])
  df[c] = df[c].fillna(self.mode_cat_[c])


In [43]:
from sklearn.metrics import f1_score

# NOTE(review): X is the data clf was fitted on, so this macro-F1 is a
# training-set score rather than a held-out estimate.
y_pred = clf.predict(X)
f1_score(y, y_pred, average='macro')

  df[c] = df[c].fillna(self.mode_cat_[c])


0.7923300634439132