In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings("ignore")

In [2]:
from catboost import CatBoostClassifier
import optuna
from optuna.integration import CatBoostPruningCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Read the competition's train and test CSV files.
train = pd.read_csv('/kaggle/input/hits-2024-spaceship-titanic-1st-module/train.csv')
X_test = pd.read_csv('/kaggle/input/hits-2024-spaceship-titanic-1st-module/test.csv')

# Every column but the last is a feature; the last column is the target,
# kept as a one-column DataFrame and encoded as 0/1 integers.
X_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1:].astype(bool).astype(int)

In [4]:
def split_feature(df, feature, new_features, sep):
    """Split a string column on ``sep`` and store the pieces as new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    feature : str
        Name of the string column to split.
    new_features : list of str
        Column names that receive the pieces, in order.
    sep : str
        Separator passed to ``Series.str.split``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    pieces = df[feature].str.split(sep, expand=True)
    df[new_features] = pieces
    return df

In [5]:
def drop_features(df, features):
    """Remove the columns named in ``features`` from ``df`` in place.

    Returns the same ``df`` object so the call can be used in an assignment.
    """
    df.drop(columns=features, inplace=True)
    return df

In [6]:
def cast_feature(df, feature, cast):
    """Convert column ``feature`` of ``df`` to dtype ``cast``.

    The column is replaced on ``df`` itself, and the same object is returned.
    """
    converted = df[feature].astype(cast)
    df[feature] = converted
    return df

In [7]:
def _derive_group_and_cabin_features(frame):
    """Split PassengerId and Cabin into parts, drop unused columns, cast GroupId to float."""
    frame = split_feature(frame, 'PassengerId', ['GroupId', 'IdWithinGroup'], '_')
    frame = split_feature(frame, 'Cabin', ['Deck', 'Num', 'Side'], '/')
    frame = drop_features(frame, ['Name', 'PassengerId', 'Cabin', 'VIP', 'Num'])
    return cast_feature(frame, 'GroupId', 'float')


# The same four steps are applied to both frames.
X_test = _derive_group_and_cabin_features(X_test)
X_train = _derive_group_and_cabin_features(X_train)

In [7]:
# Missing values per training column after the feature split.
X_train.isna().sum()

HomePlanet       201
CryoSleep        217
Destination      182
Age              179
RoomService      181
FoodCourt        183
ShoppingMall     208
Spa              183
VRDeck           188
GroupId            0
IdWithinGroup      0
Deck             199
Side             199
dtype: int64

In [8]:
# Passenger counts per (HomePlanet, Deck) pair: planets as rows, decks as
# columns, and 0 for combinations that never occur.
home_planet_deck = (
    X_train
    .groupby(['HomePlanet', 'Deck'])
    .size()
    .unstack()
    .fillna(0)
)
home_planet_deck

Deck,A,B,C,D,E,F,G,T
HomePlanet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Earth,0.0,0.0,0.0,0.0,395.0,1614.0,2498.0,0.0
Europa,252.0,766.0,734.0,186.0,128.0,0.0,0.0,4.0
Mars,0.0,0.0,0.0,282.0,330.0,1110.0,0.0,0.0


In [8]:
def impute_cryo_sleep(df):
    """Fill missing ``CryoSleep`` values from the five spending columns.

    A row whose ``CryoSleep`` is missing becomes ``True`` when every spending
    column is either 0 or missing, and ``False`` when at least one spending
    column is positive. Rows that already have a value are left untouched.

    The frame is modified in place and also returned.
    """
    spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    spend = df[spend_columns]

    nothing_spent = (spend.eq(0.0) | spend.isnull()).all(axis=1)
    df.loc[nothing_spent & df['CryoSleep'].isnull(), 'CryoSleep'] = True

    something_spent = spend.gt(0.0).any(axis=1)
    # The null mask is re-evaluated because the assignment above already filled some rows.
    df.loc[something_spent & df['CryoSleep'].isnull(), 'CryoSleep'] = False
    return df

# Fill missing CryoSleep values in both frames; impute_cryo_sleep edits its
# argument in place and returns that same object.
X_train = impute_cryo_sleep(X_train)
X_test = impute_cryo_sleep(X_test)

In [9]:
def impute_home_planet_by_deck(df, deck_to_planet=None):
    """Fill missing ``HomePlanet`` values from the passenger's deck.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Deck`` and ``HomePlanet`` columns; modified in place.
    deck_to_planet : dict, optional
        Maps a deck label to the planet given to passengers on that deck whose
        ``HomePlanet`` is missing. When omitted, the original fixed rule is
        used: G -> Earth; A, B, C, T -> Europa; F -> Mars.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Rows that already have a ``HomePlanet``, and
        rows whose deck is missing or absent from the mapping, are unchanged.

    Notes
    -----
    NOTE(review): the planet/deck count table printed earlier in this notebook
    shows deck F with 1614 Earth and 1110 Mars passengers, so the default
    F -> Mars rule assigns the less common planet. Confirm that this is
    intended, or pass ``deck_to_planet`` to override it.
    """
    if deck_to_planet is None:
        deck_to_planet = {
            'G': 'Earth',
            'A': 'Europa', 'B': 'Europa', 'C': 'Europa', 'T': 'Europa',
            'F': 'Mars',
        }

    # Only rows that are both missing a planet and on a mapped deck are written.
    fillable = df['HomePlanet'].isnull() & df['Deck'].isin(list(deck_to_planet))
    if fillable.any():
        # to_numpy() makes the assignment positional, so a non-unique index cannot break alignment.
        df.loc[fillable, 'HomePlanet'] = df.loc[fillable, 'Deck'].map(deck_to_planet).to_numpy()

    return df

# Fill missing HomePlanet values from Deck in both frames; the function edits
# its argument in place and returns that same object.
X_train = impute_home_planet_by_deck(X_train)
X_test = impute_home_planet_by_deck(X_test)

In [10]:
# Passenger counts per (HomePlanet, Deck) pair, recomputed after the
# HomePlanet imputation; combinations that never occur are 0.
home_planet_deck = X_train.groupby(['HomePlanet', 'Deck']).size().unstack().fillna(0)

# Deck labels are read from the table's own columns, so every probability
# vector below lines up with ``deck_values`` position by position. They were
# previously derived separately from X_train['Deck'].unique().
deck_values = list(home_planet_deck.columns)  # e.g. ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']

# Empirical deck distribution for each home planet (each list sums to 1).
earth = home_planet_deck.loc['Earth']
earth_proba = list(earth / earth.sum())

europa = home_planet_deck.loc['Europa']
europa_proba = list(europa / europa.sum())

mars = home_planet_deck.loc['Mars']
mars_proba = list(mars / mars.sum())

# Key order is kept as Earth, Mars, Europa: the deck imputation iterates this
# dict while drawing random numbers, so the order affects the sampled values.
planet_proba = {'Earth': earth_proba, 'Mars': mars_proba, 'Europa': europa_proba}

In [11]:
# Seed NumPy's global random generator; the deck imputation below samples with np.random.choice.
np.random.seed(240304)

In [12]:
def impute_deck_by_home_planet(df):
    """Fill missing ``Deck`` values by sampling from each planet's deck distribution.

    For every planet in the module-level ``planet_proba`` dict, rows from that
    planet with a missing ``Deck`` receive labels drawn from ``deck_values``
    with that planet's probabilities, using NumPy's global random state.
    Rows whose ``HomePlanet`` is missing are not touched.

    The frame is modified in place and also returned.
    """
    for planet, deck_probabilities in planet_proba.items():
        needs_deck = (df['HomePlanet'] == planet) & df['Deck'].isnull()
        n_missing = df.loc[needs_deck, 'Deck'].shape[0]
        df.loc[needs_deck, 'Deck'] = np.random.choice(deck_values, n_missing, p=deck_probabilities)
    return df


X_train = impute_deck_by_home_planet(X_train)
X_test = impute_deck_by_home_planet(X_test)

In [13]:
def impute_age_by_planet(df):
    """Fill missing ``Age`` values with the median age of the passenger's home planet.

    Medians are computed from ``df`` itself, separately for Europa, Earth and
    Mars. Rows whose ``HomePlanet`` is missing keep their missing ``Age``.

    The frame is modified in place and also returned.
    """
    age_missing = df['Age'].isnull()
    for planet in ('Europa', 'Earth', 'Mars'):
        from_planet = df['HomePlanet'] == planet
        median_age = df.loc[from_planet, 'Age'].median()
        df.loc[age_missing & from_planet, 'Age'] = median_age
    return df

# Each frame is filled with medians computed from that same frame, so the
# test set uses its own per-planet medians rather than the training ones.
X_train = impute_age_by_planet(X_train)
X_test = impute_age_by_planet(X_test)

In [15]:
# Missing values per training column at this point in the pipeline.
X_train.isna().sum()

HomePlanet        39
CryoSleep          0
Destination      182
Age                0
RoomService      181
FoodCourt        183
ShoppingMall     208
Spa              183
VRDeck           188
GroupId            0
IdWithinGroup      0
Deck               6
Side             199
dtype: int64

In [14]:
def impute_usluga_by_age(df):
    """Fill missing spending values with the median spend of passengers of the same age.

    For each of ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa`` and
    ``VRDeck``, a missing value is replaced by the median of that column over
    all rows in ``df`` sharing the row's ``Age``. A value stays missing when
    the row's ``Age`` is missing or when no row of that age has a value.

    The per-age medians come from one vectorised group-by per column instead
    of a Python loop over every unique age, which gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Age`` and the five spending columns; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    service_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for service in service_columns:
        # transform() returns one median per row, aligned to df's index; rows
        # with a missing Age get NaN and are therefore left as they are.
        median_for_age = df.groupby('Age')[service].transform('median')
        df[service] = df[service].fillna(median_for_age)
    return df

# Fill missing spending values in both frames; each frame is filled with
# per-age medians computed from that same frame.
X_train = impute_usluga_by_age(X_train)
X_test = impute_usluga_by_age(X_test)

In [17]:
# Missing values per training column after the spending columns were filled.
X_train.isna().sum()

HomePlanet        39
CryoSleep          0
Destination      182
Age                0
RoomService        0
FoodCourt          0
ShoppingMall       0
Spa                0
VRDeck             0
GroupId            0
IdWithinGroup      0
Deck               6
Side             199
dtype: int64

In [15]:
# describe() reports only the numeric columns of this frame, so its column
# index is the list of numeric features; everything else is treated as
# categorical. The categorical names form a set, so they have no fixed order.
numerical_columns = X_train.describe().columns
categorical_columns = set(X_train.columns).difference(numerical_columns)

In [16]:
# Fill any remaining numeric gaps with the column median. The imputer is
# fitted on the training column only and then reused for the test column, so
# both frames are filled with the same (training) median. Previously the
# imputer was re-fitted on the test set.
for col in numerical_columns:
    si = SimpleImputer(strategy='median')
    X_train[col] = si.fit_transform(X_train[[col]]).ravel()
    X_test[col] = si.transform(X_test[[col]]).ravel()

In [20]:
# Distinct HomePlanet values in the training frame; a nan entry means some rows are still missing.
X_train['HomePlanet'].unique()

array(['Europa', 'Earth', 'Mars', nan], dtype=object)

In [17]:
# Fill the remaining categorical gaps with the most frequent training value.
# The test frame is transformed with the imputer fitted on the training
# column; previously X_test was never imputed here, so its missing
# categoricals stayed NaN. Sorting gives the set a fixed processing order.
for col in sorted(categorical_columns):
    si = SimpleImputer(strategy='most_frequent')
    X_train[[col]] = si.fit_transform(X_train[[col]])
    X_test[[col]] = si.transform(X_test[[col]])

In [18]:
def log_transform_data(df):
    """Replace selected numeric columns with ``log(1 + x)``.

    The columns are ``numerical_columns[1:-1]``: every numeric column except
    the first and the last of the module-level ``numerical_columns`` index.
    NOTE(review): this positional slice presumably targets the five spending
    columns (skipping Age and GroupId); confirm if the column order changes.

    The frame is modified in place and also returned.
    """
    for name in numerical_columns[1:-1]:
        shifted = df[name] + 1
        df[name] = np.log(shifted)
    return df


X_train = log_transform_data(X_train)
X_test = log_transform_data(X_test)

In [19]:
# One-hot encode the categorical columns of the training frame.
X_train = pd.get_dummies(X_train)

# Encode the test frame, then force it onto the training columns: a category
# seen only in train becomes an all-False column, a category seen only in
# test is dropped, and the column order matches the one the model is fitted
# on. This changes nothing when both encodings already agree.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=False)

In [24]:
# Display the encoded training frame.
X_train

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,GroupId,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S
0,39.0,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,False,True,False,...,False,True,False,False,False,False,False,False,True,False
1,24.0,4.700480,2.302585,3.258097,6.309918,3.806662,2.0,True,False,False,...,False,False,False,False,False,True,False,False,False,True
2,58.0,3.784190,8.182280,0.000000,8.812248,3.912023,3.0,False,True,False,...,True,False,False,False,False,False,False,False,False,True
3,33.0,0.000000,7.157735,5.918894,8.110728,5.267858,3.0,False,True,False,...,True,False,False,False,False,False,False,False,False,True
4,16.0,5.717028,4.262680,5.023881,6.338594,1.098612,4.0,True,False,False,...,False,False,False,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,41.0,0.000000,8.827615,0.000000,7.404888,4.317488,9276.0,False,True,False,...,True,False,False,False,False,False,False,False,True,False
8689,18.0,0.000000,0.000000,0.000000,0.000000,0.000000,9278.0,True,False,False,...,False,False,False,False,False,False,True,False,False,True
8690,26.0,0.000000,0.000000,7.535297,0.693147,0.000000,9279.0,True,False,False,...,False,False,False,False,False,False,True,False,False,True
8691,32.0,0.000000,6.956545,0.000000,5.869297,8.082093,9280.0,False,True,False,...,False,False,False,False,True,False,False,False,False,True


In [20]:
# Turn off Optuna's default log handler so trial messages are not printed.
optuna.logging.disable_default_handler()


def objective(trial: optuna.Trial) -> float:
    """Optuna objective: validation accuracy of one CatBoost configuration.

    Samples a set of CatBoost hyper-parameters from ``trial``, fits a
    classifier on 75% of the module-level ``X_train`` / ``y_train`` and
    returns its accuracy on the remaining 25%.

    The hold-out split uses a fixed ``random_state`` so that every trial is
    scored on the same validation rows; without it each trial saw a different
    split, which made trial values (and pruning decisions) hard to compare.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample hyper-parameters and to report pruning information.

    Returns
    -------
    float
        Accuracy on the hold-out split.

    Raises
    ------
    optuna.TrialPruned
        Presumably raised by ``pruning_callback.check_pruned()`` when the
        pruner stopped the trial; confirm against the Optuna integration docs.
    """
    _X_train, X_valid, _y_train, y_valid = train_test_split(
        X_train, y_train, test_size=0.25, random_state=21
    )

    params = {
        'objective': trial.suggest_categorical(
            'objective',
            ['Logloss', 'CrossEntropy']
        ),
        'colsample_bylevel': trial.suggest_float(
            'colsample_bylevel',
            0.01,
            0.1,
            log=True
        ),
        'depth': trial.suggest_int(
            'depth',
            1,
            12
        ),
        'boosting_type': trial.suggest_categorical(
            'boosting_type',
            ['Ordered', 'Plain']
        ),
        'bootstrap_type': trial.suggest_categorical(
            'bootstrap_type',
            ['Bayesian', 'Bernoulli', 'MVS']
        ),
        'used_ram_limit': '8gb',
        'eval_metric': 'Accuracy',
        'logging_level': 'Silent',
        'random_seed': 21
    }

    # bagging_temperature is only sampled for the Bayesian bootstrap.
    if params['bootstrap_type'] == 'Bayesian':
        params['bagging_temperature'] = trial.suggest_float(
            'bagging_temperature', 0, 10
        )

    clf = CatBoostClassifier(**params)
    pruning_callback = CatBoostPruningCallback(trial, 'Accuracy')

    clf.fit(
        _X_train,
        _y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=False,
        early_stopping_rounds=100,
        callbacks=[pruning_callback],
    )

    # Called after fit() because the pruning decision is checked here rather than inside the callback.
    pruning_callback.check_pruned()

    predictions = clf.predict(X_valid)
    prediction_labels = np.rint(predictions)
    accuracy = accuracy_score(y_valid, prediction_labels)

    return accuracy
# One row per study: the best trial's sampled parameters plus its score.
# NOTE(review): only the columns listed here are stored, so a conditionally
# sampled parameter such as `bagging_temperature` is presumably not recorded;
# verify before relying on the Bayesian bootstrap in the final model.
best_trials = pd.DataFrame(
    columns=[
        'objective',
        'colsample_bylevel',
        'depth',
        'boosting_type',
        'bootstrap_type',
        'best_value'
    ]
)

studies = 1
trials = 100
for n in range(studies):
    study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
        direction='maximize'
    )
    # The study stops after `trials` trials or 600 seconds, whichever comes first.
    study.optimize(objective, n_trials=trials, timeout=600)

    print(f'best value = {study.best_value}')

    trial = study.best_trial
    for k, v in trial.params.items():
        print(f'{k}: {v}')

    best_trials.loc[n] = trial.params
    # A single .loc[row, column] write replaces the chained
    # best_trials['best_value'].loc[n] assignment, which can write to a
    # temporary copy and leave the frame unchanged.
    best_trials.loc[n, 'best_value'] = study.best_value

best value = 0.8357865685372585
objective: CrossEntropy
colsample_bylevel: 0.08928075522096555
depth: 8
boosting_type: Ordered
bootstrap_type: MVS


In [22]:
# Rank the studies by validation accuracy and take the top row by position.
# `.loc[0]` selected the row labelled 0 (the first study) regardless of the
# sort order, which is only correct while there is a single study.
best_trial = best_trials.sort_values('best_value', ascending=False).iloc[0]

# Every field except the score is a CatBoost constructor argument; dropping
# the score by name does not depend on it being the last column.
best_params = best_trial.drop('best_value').to_dict()
clf = CatBoostClassifier(**best_params, logging_level='Silent', random_seed=240304)
clf.fit(X_train, y_train.astype(int))

predicted = clf.predict(X_test)

In [23]:
# Build the submission: passenger ids re-read from the raw test file, paired
# with the predictions converted to booleans, then written to CSV.
sub = pd.DataFrame({
    'PassengerId': pd.read_csv('/kaggle/input/hits-2024-spaceship-titanic-1st-module/test.csv')['PassengerId'],
    'Transported': pd.Series(predicted).astype(bool),
})
sub.to_csv('submission.csv', index=False)