In [1]:
import numpy as np 
import pandas as pd 
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, cross_val_predict
from IPython.display import display
import optuna
import joblib
import warnings
warnings.filterwarnings('ignore')

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/hits-2024-spaceship-titanic-1st-module/sample_submission.csv
/kaggle/input/hits-2024-spaceship-titanic-1st-module/train.csv
/kaggle/input/hits-2024-spaceship-titanic-1st-module/test.csv


In [2]:
out_of_range_value = -9999
target = 'Transported'

In [3]:
train_df_raw = pd.read_csv('../input/hits-2024-spaceship-titanic-1st-module/train.csv')
y = train_df_raw[target]
train_df_raw = train_df_raw.drop(columns=target)

<h1>Prepare the dataset</h1>
fix missing values

In [4]:
def fix_mis_val(data):
    data = data.drop(columns = 'Name') #drop useless feature

    data.RoomService = data.RoomService.fillna(0)
    data.FoodCourt = data.FoodCourt.fillna(0)
    data.ShoppingMall = data.ShoppingMall.fillna(0)
    data.Spa = data.Spa.fillna(0)
    data.VRDeck = data.VRDeck.fillna(0)

    data.VIP = data.VIP.fillna(data.VIP.mode()[0]) #test
    data.Destination = data.Destination.fillna(data.Destination.mode()[0])
    data.HomePlanet = data.HomePlanet.fillna(data.HomePlanet.mode()[0])

    
#     data.Cabin = data.Cabin.fillna(f'{out_of_range_value}/{out_of_range_value}/{out_of_range_value}')
#     data.Age = data.Age.fillna(data.Age.mean())
    
    data.Cabin = data.Cabin.fillna(f'{out_of_range_value}/{out_of_range_value}/{out_of_range_value}')
    data.Age = data.Age.fillna(out_of_range_value)

    data.loc[(data['CryoSleep'].isnull()) & (data['RoomService'] + data['FoodCourt'] + data['Spa'] + data['ShoppingMall'] + data['VRDeck']) > 0, 'CryoSleep'] = False
    data.loc[data['CryoSleep'].isnull(), 'CryoSleep'] = True
    data.CryoSleep = pd.to_numeric(data.CryoSleep)
    return data

unpack Cabin and PassengerId

In [5]:
def unpack_features(data):
#     data.PassengerId = data.PassengerId.map(lambda x: int(x.split('_')[0])) # get groups from PassengerId
#     data['in_group'] = data['PassengerId'].duplicated(keep=False) # feature indicated passangers in groups
    data = data.drop(columns='PassengerId')
    data['deck'] = data.Cabin.map(lambda x: x.split('/')[0]).map(lambda x: x if x != str(out_of_range_value) else 'Z')
    data['side'] = data.Cabin.map(lambda x: x.split('/')[2]).map(lambda x: x if x != str(out_of_range_value) else 'Z')
    data = data.drop(columns = 'Cabin')
    return data

replace range features with ints

In [6]:
def replace_range_features(data):
    mapdict = {val: i for i, val in enumerate(sorted(set(data['deck'].values)))}
    data['deck'] = data['deck'].map(mapdict)
    return data

In [7]:
def one_hot(data):
#     cat_features = ['HomePlanet', 'Destination']
    cat_features = ['HomePlanet', 'Destination', 'side']
#     cat_features = ['Destination', 'side']

    for cat_f in cat_features:
        col = data[cat_f]
        for val in col.dropna().unique():
    #         new_col_data = col == val
            new_col_data = col.map(lambda x: np.nan if pd.isna(x) else x == val)
            data[cat_f + '_' + val] = new_col_data
        data = data.drop(columns = cat_f)
    return data

In [8]:
def prepare_data(data):
    data = fix_mis_val(data)
    data = unpack_features(data)
    data = replace_range_features(data)
    data = one_hot(data)
    return data

In [9]:
def fit_predict(X, y, test_df, params):
    model = CatBoostClassifier(
    **params,
    random_seed=42,
    logging_level='Silent')
    
    categorical_features_indices = np.where(X.dtypes == object)[0]
    model.fit(X, y, cat_features=categorical_features_indices)
    predictions = model.predict(test_df)
    return predictions

In [10]:
def multithreaded_task(X, y, X_val, y_val, params):
    X = prepare_data(X)
    X_val = prepare_data(X_val)
    pred = (fit_predict(X, y, X_val, params) == 'True')
    acc = np.mean(pred == y_val)
    return acc

In [11]:
def accuracy_multithreaded_kfoldvalidation(k, X, y):
    kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=1311)
    
    results = joblib.Parallel(n_jobs=-1)(
        joblib.delayed(multithreaded_task)(
            X.iloc[train_index], y.iloc[train_index],
            X.iloc[test_index], y.iloc[test_index],
            {}
        )
        for train_index, test_index in kfold.split(X, y)
    )

    accuracy = np.mean(results)
    return accuracy
# accuracy_multithreaded_kfoldvalidation(7, train_df_raw, y)

In [19]:
params = {
    'learning_rate': 0.05464365018363548, 
    'depth': 5, 
    'iterations': 322, 
    'l2_leaf_reg': 0.937536434535921, 
    'random_strength': 0.8110316739919927, 
    'boost_from_average': False, 
    'subsample': 0.6665211600601834
}

In [21]:
drop_features = ['Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'Destination_55 Cancri e']
def accuracy_kfoldvalidation(k, X, y, params):
    kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=1311)
    accuracy_scores  = []
    for train_index, val_index in kfold.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
        X_train = prepare_data(X_train)
        X_val = prepare_data(X_val)
        X_train = X_train.drop(columns = drop_features)
        X_val = X_val.drop(columns = drop_features)
        
        pred = (fit_predict(X_train, y_train, X_val, params) == 'True')
        acc = np.mean(pred == y_val)
        accuracy_scores.append(acc)

    accuracy = np.mean(accuracy_scores)
    return accuracy
# accuracy_kfoldvalidation(7, train_df_raw, y, params)

0.8072008500309659

* base score not one hot: 0.8022544468783476
* drop HomePlanet: 0.8005286556419934
* drop 'in_group', 'side_Z': 0.8035192238499528
* drop  'side_Z': 0.8024838417929542
* drop 'in_group', 'side_Z', 'side_P', 'side_S': 0.8005286556419934
* drop 'in_group', 'side_Z', 'HomePlanet_Earth': 0.8031744363419381
* drop 'in_group', 'side_Z', 'VIP':  0.8022538907694637
* drop 'in_group', 'side_Z', 'Destination_PSO J318.5-22': 0.8045544205373234 
* drop 'in_group', 'side_Z', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'Destination_55 Cancri e': 0.803979867375446
* drop 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'Destination_55 Cancri e': 0.8047845569304422

Score after drop features:
*     CryoSleep: 0.8019093813158908
*     Age: 0.7978834310509326
*     VIP: 0.8022537053998358
*     RoomService: 0.7971925584475071
*     FoodCourt: 0.79834277698903
*     ShoppingMall: 0.8013338086210595
*     Spa: 0.7860343263769672
*     VRDeck: 0.7854587536821361
*     in_group: 0.8027137001316309
*     deck: 0.797998730959527
*     HomePlanet_Europa: 0.8029444853184474
*     HomePlanet_Earth: 0.8040946111751562
*     HomePlanet_Mars: 0.8038640113579678
*     Destination_TRAPPIST-1e: 0.802369561417314
*     Destination_PSO J318.5-22: 0.803174343657124
*     Destination_55 Cancri e: 0.801103672227941
*     side_P: 0.80156375964455
*     side_S: 0.8016785961290743
*     side_Z: 0.8024838417929542

In [13]:
def objective_iter_lr(trial):
    params = {
        "learning_rate": trial.suggest_float('learning_rate', 0.05, 0.3),
        "depth": 3,
        "iterations": trial.suggest_int('iterations', 250, 700),
        "l2_leaf_reg": 0.4179455283250788,
        "random_strength": 0.24904686336557558,
        "grow_policy": "Depthwise",
        "leaf_estimation_method": "Gradient",
        "feature_border_type": "MinEntropy",
        "boost_from_average": True,
        "subsample": 0.6599902174123564
    }
    return accuracy_kfoldvalidation(5, params)

def objectiveSome(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.2),
        'depth': 3,
        'iterations': trial.suggest_int('iterations', 250, 700),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 1),
        'random_strength': trial.suggest_float('random_strength', 0.1, 1.0),
        'grow_policy': 'Depthwise',
        'leaf_estimation_method': 'Gradient',
        'feature_border_type': 'MinEntropy',
        'boost_from_average': True, 
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
    }
    return accuracy_kfoldvalidation(7, train_df_raw, y, params)

def objectiveAll(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.3),
        'depth': trial.suggest_int('depth', 3, 5),
        'iterations': trial.suggest_int('iterations', 200, 500),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 1),
        'random_strength': trial.suggest_float('random_strength', 0.1, 1.0),
        'boost_from_average': trial.suggest_categorical('boost_from_average', [True, False]),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
    }
    return accuracy_kfoldvalidation(7, train_df_raw, y, params)

In [None]:
# study = optuna.create_study(direction="maximize")
# study.optimize(objectiveAll, n_trials=1000, n_jobs=-1)

# print("Best trial:")
# trial = study.best_trial
# print("Value: ", trial.value)
# print("Params: ")
# for key, value in trial.params.items():
#     print(f"    {key}: {value}")

In [None]:
features = ['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'in_group', 'deck', 'HomePlanet_Europa',
       'HomePlanet_Earth', 'HomePlanet_Mars', 'Destination_TRAPPIST-1e',
       'Destination_PSO J318.5-22', 'Destination_55 Cancri e', 'side_P',
       'side_S', 'side_Z']

# for f in features:
#     acc = accuracy_kfoldvalidation(7, train_df_raw, y, f)
#     print(f"    {f}: {acc}")

'learning_rate': 0.05464365018363548, 
'depth': 5, 
'iterations': 322, 
'l2_leaf_reg': 0.937536434535921, 
'random_strength': 0.8110316739919927, 
'boost_from_average': False, 
'subsample': 0.6665211600601834

In [22]:
def solve_out():
    features_to_drop = []

#     features_to_drop = [ 'Destination']
    features_to_drop = [ 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'Destination_55 Cancri e']

    
    train_df = pd.read_csv('../input/hits-2024-spaceship-titanic-1st-module/train.csv')
    y = train_df[target]
    train_df = train_df.drop(columns=target)

    test_df = pd.read_csv('../input/hits-2024-spaceship-titanic-1st-module/test.csv')
    PassengerIds = test_df.PassengerId

    X =  prepare_data(train_df)
    test_df = prepare_data(test_df)

    X = X.drop(columns=features_to_drop)
    test_df = test_df.drop(columns=features_to_drop)
    
    predictions = fit_predict(X, y, test_df, params)

    out =  pd.DataFrame(data={'PassengerId': PassengerIds.values, 'Transported': predictions})
    out.to_csv('/kaggle/working/outf.csv', index=False)
# solve_out()