<a href="https://colab.research.google.com/github/Mmkdstaz/kaggle/blob/main/SpaceshipTitanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier


In [None]:
pip install catboost


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

df = train.copy()
def split_cabin(df):
    df['Deck'] = df['Cabin'].str[0]
    df['Side'] = df['Cabin'].str[-1]
    df['CabinNum'] = df['Cabin'].str[2:-2].astype(float)
    return df

df = split_cabin(df)
test = split_cabin(test)


In [None]:
df['Group'] = df['PassengerId'].str.split('_').str[0].astype(int)
test['Group'] = test['PassengerId'].str.split('_').str[0].astype(int)


In [None]:
spend_cols = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

df['TotalSpend'] = df[spend_cols].sum(axis=1)
test['TotalSpend'] = test[spend_cols].sum(axis=1)

df['HasSpend'] = (df['TotalSpend'] > 0).astype(int)
test['HasSpend'] = (test['TotalSpend'] > 0).astype(int)


In [None]:
for col in spend_cols:
    df.loc[df['CryoSleep'] == True, col] = df[col].fillna(0)
    test.loc[test['CryoSleep'] == True, col] = test[col].fillna(0)


In [None]:
num_cols = ['Age', 'CabinNum', 'TotalSpend']

for col in num_cols:
    df[col] = df[col].fillna(df[col].median())
    test[col] = test[col].fillna(df[col].median())


In [None]:
features = [
    'HomePlanet', 'Destination', 'Deck', 'Side',
    'Age', 'CabinNum', 'Group',
    'TotalSpend', 'HasSpend'
] + spend_cols

X = df[features]
y = df['Transported']
X_test = test[features]

cat_cols = X.select_dtypes('object').columns.tolist()


In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracies = []

# Fill NaN values in categorical columns with a placeholder string
for col in cat_cols:
    X[col] = X[col].fillna('Unknown')
    X_test[col] = X_test[col].fillna('Unknown')

for train_idx, val_idx in skf.split(X, y):

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = CatBoostClassifier(
        depth=7,
        learning_rate=0.03,
        iterations=1500,
        loss_function='Logloss',
        eval_metric='Accuracy',
        verbose=0
    )

    model.fit(X_train, y_train, cat_features=cat_cols)
    preds = model.predict(X_val)

    accuracies.append(accuracy_score(y_val, preds))

accuracies, np.mean(accuracies)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].fillna('Unknown')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[col] = X_test[col].fillna('Unknown')


([0.8131109833237493,
  0.8136860264519838,
  0.8136860264519838,
  0.8216340621403913,
  0.8060989643268124],
 np.float64(0.8136432125389842))

In [None]:
final_model = CatBoostClassifier(
    depth=7,
    learning_rate=0.03,
    iterations=1500,
    loss_function='Logloss',
    verbose=200
)

final_model.fit(X, y, cat_features=cat_cols)

preds = final_model.predict(X_test)

submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Transported': preds.astype(bool)
})

submission.to_csv('submission_catboost.csv', index=False)
v

0:	learn: 0.6782224	total: 21.1ms	remaining: 31.7s
200:	learn: 0.3719648	total: 5.47s	remaining: 35.3s
400:	learn: 0.3347161	total: 9.61s	remaining: 26.3s
600:	learn: 0.3054285	total: 13.7s	remaining: 20.5s
800:	learn: 0.2822350	total: 19.3s	remaining: 16.8s
1000:	learn: 0.2623362	total: 23.5s	remaining: 11.7s
1200:	learn: 0.2441890	total: 28.7s	remaining: 7.13s
1400:	learn: 0.2304724	total: 33.4s	remaining: 2.36s
1499:	learn: 0.2242069	total: 35.5s	remaining: 0us


In [None]:
X

Unnamed: 0,HomePlanet,Destination,Deck,Side,Age,CabinNum,Group,TotalSpend,HasSpend,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Europa,TRAPPIST-1e,B,P,39.0,0.0,1,0.0,0,0.0,0.0,0.0,0.0,0.0
1,Earth,TRAPPIST-1e,F,S,24.0,0.0,2,736.0,1,109.0,9.0,25.0,549.0,44.0
2,Europa,TRAPPIST-1e,A,S,58.0,0.0,3,10383.0,1,43.0,3576.0,0.0,6715.0,49.0
3,Europa,TRAPPIST-1e,A,S,33.0,0.0,3,5176.0,1,0.0,1283.0,371.0,3329.0,193.0
4,Earth,TRAPPIST-1e,F,S,16.0,1.0,4,1091.0,1,303.0,70.0,151.0,565.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,55 Cancri e,A,P,41.0,98.0,9276,8536.0,1,0.0,6819.0,0.0,1643.0,74.0
8689,Earth,PSO J318.5-22,G,S,18.0,1499.0,9278,0.0,0,0.0,0.0,0.0,0.0,0.0
8690,Earth,TRAPPIST-1e,G,S,26.0,1500.0,9279,1873.0,1,0.0,0.0,1872.0,1.0,0.0
8691,Europa,55 Cancri e,E,S,32.0,608.0,9280,4637.0,1,0.0,1049.0,0.0,353.0,3235.0


In [None]:
!pip install optuna


Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [None]:
import optuna
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

def objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 800, 2000),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.15),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 12),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 2),
        "random_strength": trial.suggest_float("random_strength", 1, 20),
        "border_count": trial.suggest_int("border_count", 64, 255)
    }

    model = CatBoostClassifier(
        **params,
        loss_function="Logloss",
        verbose=0,
        task_type="CPU"
    )

    model.fit(X_train, y_train, cat_features=cat_cols)
    preds = model.predict(X_val)

    return accuracy_score(y_val, preds)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40)

print("Best parameters:", study.best_params)
print("Best accuracy:", study.best_value)


[I 2025-11-29 09:50:16,055] A new study created in memory with name: no-name-2cf89166-89c3-4dda-974b-9be13e60c6ec
[I 2025-11-29 09:50:43,410] Trial 0 finished with value: 0.81196089706728 and parameters: {'iterations': 1318, 'depth': 4, 'learning_rate': 0.06746117074391818, 'l2_leaf_reg': 11.065066350688292, 'bagging_temperature': 0.7267285160963084, 'random_strength': 2.0896375211122553, 'border_count': 226}. Best is trial 0 with value: 0.81196089706728.
[I 2025-11-29 09:51:01,677] Trial 1 finished with value: 0.8223116733755031 and parameters: {'iterations': 1566, 'depth': 4, 'learning_rate': 0.1487307643310685, 'l2_leaf_reg': 7.254366575156556, 'bagging_temperature': 0.517907085513934, 'random_strength': 13.542264539196923, 'border_count': 181}. Best is trial 1 with value: 0.8223116733755031.
[I 2025-11-29 09:51:31,418] Trial 2 finished with value: 0.828637147786084 and parameters: {'iterations': 1383, 'depth': 8, 'learning_rate': 0.11136890844816622, 'l2_leaf_reg': 7.50871698438413

Best parameters: {'iterations': 1642, 'depth': 6, 'learning_rate': 0.07069484268604255, 'l2_leaf_reg': 1.7847040625499062, 'bagging_temperature': 1.828361643439581, 'random_strength': 19.03145246899706, 'border_count': 77}
Best accuracy: 0.8309373202990225


In [None]:
best_params = study.best_params

final_model = CatBoostClassifier(
    **best_params,
    loss_function="Logloss",
    verbose=200
)

final_model.fit(X, y, cat_features=cat_cols)
preds = final_model.predict(X_test)

submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Transported': preds.astype(bool)
})
submission.to_csv("submission_optuna_catboost.csv", index=False)


0:	learn: 0.6836160	total: 12.3ms	remaining: 20.2s
200:	learn: 0.3786891	total: 2.57s	remaining: 18.4s
400:	learn: 0.3097865	total: 5.63s	remaining: 17.4s
600:	learn: 0.2717685	total: 10.1s	remaining: 17.4s
800:	learn: 0.2444942	total: 13.2s	remaining: 13.9s
1000:	learn: 0.2210725	total: 16.5s	remaining: 10.6s
1200:	learn: 0.2030872	total: 19.7s	remaining: 7.24s
1400:	learn: 0.1879306	total: 24.3s	remaining: 4.18s
1600:	learn: 0.1738842	total: 27.5s	remaining: 704ms
1641:	learn: 0.1711860	total: 28.2s	remaining: 0us


In [None]:
import pickle

with open('catboost.pkl', 'wb') as f:
    pickle.dump(final_model, f)


In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
import joblib
import optuna
import warnings
warnings.filterwarnings('ignore')


train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

_train = train.copy()
_test = test.copy()



def split_cabin(df):
    df['Cabin'] = df['Cabin'].astype(str).replace('nan', np.nan)
    df['Deck'] = df['Cabin'].str.split('/').str[0]
    df['CabinNum'] = df['Cabin'].str.extract(r'/([0-9]+)')
    df['CabinNum'] = pd.to_numeric(df['CabinNum'], errors='coerce')
    df['Side'] = df['Cabin'].str.split('/').str[-1]
    return df


def extract_group(df):
    df['Group'] = df['PassengerId'].str.split('_').str[0]
    df['Group'] = df['Group'].astype(int)
    return df


def add_spend_features(df):
    spend_cols = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
    df[spend_cols] = df[spend_cols].fillna(0)
    df['TotalSpend'] = df[spend_cols].sum(axis=1)
    df['HasSpend'] = (df['TotalSpend'] > 0).astype(int)
    df['LogTotalSpend'] = np.log1p(df['TotalSpend'])
    return df


for df in [train, test]:
    df = split_cabin(df)
    df = extract_group(df)

train = add_spend_features(train)
test = add_spend_features(test)

group_counts = train['Group'].value_counts()
train['GroupSize'] = train['Group'].map(group_counts)
test['GroupSize'] = test['Group'].map(group_counts).fillna(1)

age_median = train['Age'].median()
train['Age'] = train['Age'].fillna(age_median)
test['Age'] = test['Age'].fillna(age_median)

spend_cols = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
train.loc[train['CryoSleep'] == True, spend_cols] = 0
test.loc[test['CryoSleep'] == True, spend_cols] = 0

train.loc[(train['TotalSpend']==0) & (train['CryoSleep'].isna()), 'CryoSleep'] = True
test.loc[(test['TotalSpend']==0) & (test['CryoSleep'].isna()), 'CryoSleep'] = True

train['Transported'] = train['Transported'].astype(bool)

for df in [train, test]:
    df['IsAlone'] = (df['GroupSize']==1).astype(int)
    df['Deck_Side'] = df['Deck'].fillna('X') + '_' + df['Side'].fillna('X')


features = [
    'HomePlanet', 'Destination', 'Deck', 'Side', 'Deck_Side',
    'Age', 'CabinNum', 'GroupSize', 'IsAlone',
    'TotalSpend', 'LogTotalSpend', 'HasSpend'
] + spend_cols

features = [f for f in features if f in train.columns]

X = train[features]
y = train['Transported']
X_test = test[features]

cat_cols = X.select_dtypes(include=['object','category','bool']).columns.tolist()
for c in cat_cols:
    if X[c].dtype == 'bool':
        X[c] = X[c].astype(int)
        X_test[c] = X_test[c].astype(int)

cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()

for col in cat_cols:
    X[col] = X[col].fillna('Unknown')
    X_test[col] = X_test[col].fillna('Unknown')

print('Features used:', features)
print('Categorical columns:', cat_cols)




Features used: ['HomePlanet', 'Destination', 'Deck', 'Side', 'Deck_Side', 'Age', 'CabinNum', 'GroupSize', 'IsAlone', 'TotalSpend', 'LogTotalSpend', 'HasSpend', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
Categorical columns: ['HomePlanet', 'Destination', 'Deck', 'Side', 'Deck_Side']


In [None]:
def cv_catboost(X, y, params=None, n_splits=5, random_state=42):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = []
    models = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = CatBoostClassifier(
            random_seed=random_state,
            **(params or {})
        )
        model.fit(X_tr, y_tr, cat_features=cat_cols)
        preds = model.predict(X_val)
        score = accuracy_score(y_val, preds)
        print(f'Fold {fold} accuracy: {score:.5f}')
        scores.append(score)
        models.append(model)

    print('CV mean accuracy:', np.mean(scores))
    return models, scores

base_models, base_scores = cv_catboost(X, y)





[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
10:	learn: 0.5972674	total: 324ms	remaining: 29.1s
11:	learn: 0.5912309	total: 365ms	remaining: 30.1s
12:	learn: 0.5855666	total: 399ms	remaining: 30.3s
13:	learn: 0.5793990	total: 426ms	remaining: 30s
14:	learn: 0.5741686	total: 457ms	remaining: 30s
15:	learn: 0.5694063	total: 485ms	remaining: 29.8s
16:	learn: 0.5642920	total: 510ms	remaining: 29.5s
17:	learn: 0.5594325	total: 538ms	remaining: 29.3s
18:	learn: 0.5557411	total: 565ms	remaining: 29.1s
19:	learn: 0.5515508	total: 581ms	remaining: 28.5s
20:	learn: 0.5488638	total: 595ms	remaining: 27.7s
21:	learn: 0.5447131	total: 624ms	remaining: 27.7s
22:	learn: 0.5403262	total: 655ms	remaining: 27.8s
23:	learn: 0.5364925	total: 687ms	remaining: 27.9s
24:	learn: 0.5330815	total: 714ms	remaining: 27.9s
25:	learn: 0.5297414	total: 748ms	remaining: 28s
26:	learn: 0.5270811	total: 782ms	remaining: 28.2s
27:	learn: 0.5246044	total: 820ms	remaining: 28.5s
28:	le

In [None]:
def optuna_catboost(X, y, n_trials=30):
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

    def objective(trial):
        params = {
            'iterations': trial.suggest_int('iterations', 800, 1800),
            'depth': trial.suggest_int('depth', 4, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.08),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 8),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.5),
            'random_strength': trial.suggest_float('random_strength', 1e-6, 20.0),
            'border_count': trial.suggest_int('border_count', 32, 255)
        }

        model = CatBoostClassifier(**params, loss_function='Logloss', verbose=0)
        model.fit(X_tr, y_tr, cat_features=cat_cols)
        preds = model.predict(X_val)
        return accuracy_score(y_val, preds)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)
    print('Optuna best params:', study.best_params)
    print('Optuna best value:', study.best_value)
    return study.best_params

best_params = optuna_catboost(X, y, n_trials=30)



[I 2025-11-29 11:03:17,446] A new study created in memory with name: no-name-fca10f9b-b920-4c7c-890e-3318c8dd4898
[I 2025-11-29 11:03:45,340] Trial 0 finished with value: 0.8246118458884416 and parameters: {'iterations': 1159, 'depth': 7, 'learning_rate': 0.07539421337326394, 'l2_leaf_reg': 5.204780755401487, 'bagging_temperature': 0.6732679701110976, 'random_strength': 19.717350651027033, 'border_count': 205}. Best is trial 0 with value: 0.8246118458884416.
[I 2025-11-29 11:04:01,601] Trial 1 finished with value: 0.816561242093157 and parameters: {'iterations': 1429, 'depth': 4, 'learning_rate': 0.07409471899736822, 'l2_leaf_reg': 1.192423842707205, 'bagging_temperature': 0.8887384109643286, 'random_strength': 15.845909962298862, 'border_count': 133}. Best is trial 0 with value: 0.8246118458884416.
[I 2025-11-29 11:04:51,457] Trial 2 finished with value: 0.8136860264519838 and parameters: {'iterations': 1175, 'depth': 10, 'learning_rate': 0.017701108740367566, 'l2_leaf_reg': 1.8790757

Optuna best params: {'iterations': 920, 'depth': 7, 'learning_rate': 0.07973507664167895, 'l2_leaf_reg': 6.1119260068505294, 'bagging_temperature': 0.12467669778474386, 'random_strength': 19.944733046979657, 'border_count': 50}
Optuna best value: 0.8297872340425532


In [None]:

final_cat_params = dict(best_params)
final_cat_params.update({'iterations': final_cat_params.get('iterations',1500), 'verbose':0})


models_cat, scores_cat = cv_catboost(X, y, params=final_cat_params)
print('CatBoost CV mean:', np.mean(scores_cat))


final_cat = models_cat[0]
joblib.dump(final_cat, 'catboost_model.pkl')
final_cat.save_model('catboost_model.cbm')



Fold 0 accuracy: 0.81254
Fold 1 accuracy: 0.80621
Fold 2 accuracy: 0.80449
Fold 3 accuracy: 0.82854
Fold 4 accuracy: 0.80667
CV mean accuracy: 0.8116889261516247
CatBoost CV mean: 0.8116889261516247


In [None]:
from sklearn.preprocessing import LabelEncoder


X_xgb = X.copy()
X_test_xgb = X_test.copy()
encoders = {}
for col in cat_cols:
  le = LabelEncoder()
  X_xgb[col] = X_xgb[col].fillna('NaN')
  X_test_xgb[col] = X_test_xgb[col].fillna('NaN')
  le.fit(pd.concat([X_xgb[col], X_test_xgb[col]]))
  X_xgb[col] = le.transform(X_xgb[col])
  X_test_xgb[col] = le.transform(X_test_xgb[col])
  encoders[col] = le


xgb = XGBClassifier(
n_estimators=1200,
max_depth=6,
learning_rate=0.03,
subsample=0.9,
colsample_bytree=0.9,
use_label_encoder=False,
eval_metric='logloss'
)


xgb.fit(X_xgb, y)
joblib.dump(xgb, 'xgb_model.pkl')

['xgb_model.pkl']

In [None]:
voting = VotingClassifier(estimators=[('cat', final_cat), ('xgb', xgb)], voting='soft')

proba_cat = final_cat.predict_proba(X_test)
proba_xgb = xgb.predict_proba(X_test_xgb)
proba_ens = (proba_cat + proba_xgb) / 2
preds_ens = (proba_ens[:,1] > 0.5)


submission = pd.DataFrame({
'PassengerId': test['PassengerId'],
'Transported': preds_ens.astype(bool)
})
submission.to_csv('submission_ensemble_avg.csv', index=False)



joblib.dump({'features': features, 'cat_cols': cat_cols, 'encoders': encoders}, 'pipeline_meta.pkl')


print('\nDone. Saved: catboost_model.pkl, catboost_model.cbm, xgb_model.pkl, submission_ensemble_avg.csv, pipeline_meta.pkl')


Done. Saved: catboost_model.pkl, catboost_model.cbm, xgb_model.pkl, submission_ensemble_avg.csv, pipeline_meta.pkl
