In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

In [6]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [30]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [None]:
def fillna(primary_df: pd.DataFrame, transform_df: pd.DataFrame) -> pd.DataFrame:
	new_df = transform_df.copy()

	mean_age = primary_df["Age"].mean()
	mean_vip = primary_df['VIP'].value_counts().idxmax()
	mean_CryoSleep = primary_df['CryoSleep'].value_counts().idxmax()
	mean_RoomService = primary_df["RoomService"].mean()
	mean_FoodCourt = primary_df["FoodCourt"].mean()
	mean_ShoppingMall = primary_df["ShoppingMall"].mean()
	mean_Spa = primary_df["Spa"].mean()
	mean_VRDeck = primary_df["VRDeck"].mean()
	# new_df["HomePlanet"].fillna("Undefined", inplace=True)
	new_df["CryoSleep"].fillna(mean_CryoSleep, inplace=True)
	new_df["Cabin"].fillna("Undefined/Undefined/Undefined", inplace=True)
	new_df["Destination"].fillna("Undefined", inplace=True)
	new_df["Age"].fillna(mean_age, inplace=True)
	new_df["VIP"].fillna(mean_vip, inplace=True)
	new_df["RoomService"].fillna(mean_RoomService, inplace=True)
	new_df["FoodCourt"].fillna(mean_FoodCourt, inplace=True)
	new_df["ShoppingMall"].fillna(mean_ShoppingMall, inplace=True)
	new_df["Spa"].fillna(mean_Spa, inplace=True)
	new_df["VRDeck"].fillna(mean_VRDeck, inplace=True)
	new_df["Name"].fillna("Undefined", inplace=True)
	
	return new_df


def transform_features(df: pd.DataFrame) -> pd.DataFrame:
	df_new = df.copy()
	cabins = df_new["Cabin"].astype(str).apply(lambda x: x.split("/"))
	cabins = pd.DataFrame(cabins.tolist(), columns=["Cabin_1", "Cabin_2", "Cabin_3"])
	df_new = pd.concat([df_new, cabins], axis=1)
	df_new = df_new.drop("Cabin", axis=1)
	categorical_cols = ["HomePlanet", "Destination"]
	df_new = pd.get_dummies(df_new, columns=categorical_cols, dtype=int)
	df_new["Transported"] = df_new["Transported"].astype(int)
	df_new["VIP"] = df_new["VIP"].astype(int)
	df_new["CryoSleep"] = df_new["CryoSleep"].astype(int)
	spent_cols = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
	df_new['TotalSpend'] = df[spent_cols].sum(1)
	df_new['GroupId'] = df['PassengerId'].str.split('_').str[0].astype(int)
	df_new['GroupSubId'] = df['PassengerId'].str.split('_').str[1].astype(int)
	return df_new

df_train = transform_features(fillna(df_train, df_train))

In [17]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [18]:
df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [None]:
import numpy as np
import pandas as pd
from typing import List, Dict, Any, Optional, Union

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


class CabinSplitter(TransformerMixin, BaseEstimator):
    def __init__(self):
        return
    
    def fit(self, X: np.ndarray, y: Optional[np.ndarray]=None):
        return self
    
    def transform(self, X: Union[np.ndarray, pd.DataFrame, pd.Series]):
        if isinstance(X, pd.DataFrame):
            s = X.iloc[:, 0]
        elif isinstance(X, np.ndarray):
            s = pd.Series(X.ravel())
        else:
            s = pd.Series(X)

        s = s.astype(str)
        parts = s.str.split("/", n=2, expand=True)  # (n, 3)
        return parts.values

class PassengerIdSplitter(TransformerMixin, BaseEstimator):
    def __init__(self):
        return
    
    def fit(self, X: np.ndarray, y: Optional[np.ndarray]=None):
        return self
    
    def transform(self, X: Union[np.ndarray, pd.DataFrame, pd.Series]):
        if isinstance(X, pd.DataFrame):
            s = X.iloc[:, 0]
        elif isinstance(X, np.ndarray):
            s = pd.Series(X.ravel())
        else:
            s = pd.Series(X)

        s = s.astype(str)
        parts = s.str.split("_", n=1, expand=True)  # (n, 3)
        return parts.values

class TotalSpend(TransformerMixin, BaseEstimator):
    def __init__(self):
        return
    
    def fit(self, X: np.ndarray, y: Optional[np.ndarray]=None):
        return self
    
    def transform(self, X: Union[np.ndarray, pd.DataFrame, pd.Series]):
        s: pd.DataFrame
        if isinstance(X, pd.DataFrame):
            s = X
        elif isinstance(X, np.ndarray):
            s = pd.DataFrame(X)
        else:
            s = pd.DataFrame(X)
        return s.sum(axis=1).to_numpy().reshape(-1, 1)


num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
bin_cols = ['VIP', 'CryoSleep']              # строковые True/False/Yes/No/0/1 и т.п.
cat_cols = ['HomePlanet', 'Destination']     # обычные категориальные
cabin_col = ['Cabin']
passanger_id_col = ['PassengerId']

num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

bin_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    # drop='if_binary' оставит 1 столбец вместо 2 для строго бинарных,
    ('ohe', OneHotEncoder(handle_unknown='ignore', drop='if_binary')),
])

cat_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

cabin_pipe = Pipeline(steps=[
    ('split', CabinSplitter()),
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

passanger_id_pipe = Pipeline(steps=[
    ('split', CabinSplitter()),
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

total_spend_pipe = Pipeline(steps=[
    ('calc', TotalSpend()),
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

preprocess = ColumnTransformer(
    transformers=[
        ('num',   num_pipe,  num_cols),
        ('bin',   bin_pipe,  bin_cols),
        ('cat',   cat_pipe,  cat_cols),
        ('cabin', cabin_pipe, cabin_col),
        ('passanger_id', passanger_id_pipe, passanger_id_col),
        ('total_spend', total_spend_pipe, ["ShoppingMall", "FoodCourt", "RoomService", "Spa", "VRDeck"]),
    ],
    remainder='drop'
)

clf = Pipeline(steps=[
    ('prep', preprocess),
    ('model', LogisticRegression(max_iter=200))
])

y = df_train['Transported'].astype(int)
X = df_train.drop(columns=['Transported'])

clf.fit(X, y)
y_pred = clf.predict(X)
y_proba = clf.predict_proba(X)[:, 1]
f1_score(y, y_pred, average='macro')

0.8997883944561547