In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import roc_auc_score, mean_squared_error, accuracy_score

from tqdm import tqdm

import warnings; warnings.filterwarnings('ignore')

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score


import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
import random
import os

def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

In [3]:
from pathlib import Path
DATA_PATH = Path("./data/")
OUTPUT_PATH = Path("./output/")

# General settings
SEED = 42
N_FOLDS = 10

# Metric choice placeholder
#Adjust depending on the competition
METRIC = "auc"

In [4]:
def evaluate(y_true, y_pred, metric="auc"):
    """
    Compute score based on chosen metric.
    """
    metric = metric.lower()
    
    if metric == "auc":
        return roc_auc_score(y_true, y_pred)
    elif metric == "rmse":
        return mean_squared_error(y_true, y_pred, squared=False)
    elif metric == "accuracy":
        return accuracy_score(y_true, y_pred)
    else:
        raise ValueError(f"Unknown metric: {metric}")


In [5]:
#Load Data
df = pd.read_csv(DATA_PATH / "Spaceship Titanic" / "train.csv")

In [6]:
# === Basic EDA ===

# Shape
print("Shape:", df.shape)

# Preview
display(df.head())

# Info
print("\nInfo:")
df.info()

# Missing values
print("\nMissing values:")
print(df.isna().sum())

# Target distribution 
if "Transported" in df.columns:
    print("\nTarget distribution:")
    print(df["Transported"].value_counts(normalize=True))

# Numerical stats
print("\nNumerical summary:")
display(df.describe())

# Categorical stats (top categories)
print("\nCategorical summary:")
for col in df.select_dtypes(include=["object", "category"]).columns:
    print(f"{col}: {df[col].nunique()} unique")


Shape: (8693, 14)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True



Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB

Missing values:
PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             17

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0



Categorical summary:
PassengerId: 8693 unique
HomePlanet: 3 unique
CryoSleep: 2 unique
Cabin: 6560 unique
Destination: 3 unique
VIP: 2 unique
Name: 8473 unique


In [7]:
target_col = "Transported"
use_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall']
numeric_features = ['Age','RoomService','FoodCourt','ShoppingMall']
categorical_features = ['HomePlanet','CryoSleep','Cabin','Destination','VIP']
X=df[use_cols].copy() 
y = df[target_col].astype(int).copy()

In [8]:
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy = "median"))])

categorical_transformer = Pipeline([("imputer", SimpleImputer(strategy = "most_frequent")),
                                   ("onehot", OneHotEncoder(handle_unknown="ignore"))
                                   ])
preprocessor = ColumnTransformer([("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = Pipeline([
        ("prep", preprocessor),
        ("lgbm", LGBMClassifier(
            n_estimators=100,
            num_leaves=16,
            max_depth=4,
            min_data_in_leaf=20,
            feature_fraction=0.7,
            bagging_fraction=0.7,
            learning_rate=0.05,
            random_state=42,
            n_jobs=-1,
            verbosity=-1
        ))
    ])

    model.fit(X_train, y_train)
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]

    models.append(model) 

In [9]:
LGBM_oof = oof_preds
print(LGBM_oof)

[0.31346759 0.18394385 0.57951732 ... 0.65372249 0.2379046  0.66656945]


In [10]:

cat_idx = list(range(len(numeric_features), len(numeric_features) + len(categorical_features)))

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", SimpleImputer(strategy="most_frequent"), categorical_features)
])

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = Pipeline([
        ("prep", preprocessor),
        ("catboost", CatBoostClassifier(
            cat_features=cat_idx,
            learning_rate=0.05,
            depth=6,
            random_state=42,
            verbose=False
        ))
    ])

    model.fit(X_train, y_train)
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]

    models.append(model)


In [11]:
Cat_oof = oof_preds
print(Cat_oof)

[0.46137761 0.12550541 0.61520358 ... 0.78914112 0.15735932 0.80075566]


In [12]:
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy = "median"))])

categorical_transformer = Pipeline([("imputer", SimpleImputer(strategy = "most_frequent")),
                                   ("onehot", OneHotEncoder(handle_unknown="ignore"))
                                   ])
preprocessor = ColumnTransformer([("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = Pipeline([
        ("prep", preprocessor),
        ("xgb", XGBClassifier(
            n_estimators=100,
            max_depth=4,
            subsample = 0.7,
            colsample_bytree = 0.7,
            min_child_weight = 3,
            learning_rate=0.05,
            eval_metric = "logloss",
            random_state=42,
            n_jobs = 1,
            verbosity = 0
        ))
    ])

    model.fit(X_train, y_train)
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]

    models.append(model)

In [13]:
XGB_oof = oof_preds
print(XGB_oof)

[0.40304902 0.14556043 0.58859462 ... 0.74021018 0.28323397 0.69207489]


In [14]:
meta_X = np.column_stack([LGBM_oof, Cat_oof, XGB_oof])
meta_y = y.values

In [15]:
from sklearn.linear_model import LogisticRegression

meta_model = LogisticRegression()
meta_model.fit(meta_X, meta_y)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [16]:
target_col = "Transported"
use_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall']
numeric_features = ['Age','RoomService','FoodCourt','ShoppingMall']
categorical_features = ['HomePlanet','CryoSleep','Cabin','Destination','VIP']
X=df[use_cols].copy() 
y = df[target_col].astype(int).copy()
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy = "median"))])

categorical_transformer = Pipeline([("imputer", SimpleImputer(strategy = "most_frequent")),
                                   ("onehot", OneHotEncoder(handle_unknown="ignore"))
                                   ])
preprocessor = ColumnTransformer([("num", numeric_transformer, numeric_features)])

xgb_full = Pipeline([
        ("prep", preprocessor),
        ("xgb", XGBClassifier(
            n_estimators=100,
            max_depth=4,
            subsample = 0.7,
            colsample_bytree = 0.7,
            min_child_weight = 3,
            learning_rate=0.05,
            eval_metric = "logloss",
            random_state=42,
            n_jobs = 1,
            verbosity = 0
        ))
    ])
xgb_full.fit(X, y)

0,1,2
,steps,"[('prep', ...), ('xgb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.7
,device,
,early_stopping_rounds,
,enable_categorical,False


In [17]:
cat_idx = list(range(len(numeric_features), len(numeric_features) + len(categorical_features)))

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", SimpleImputer(strategy="most_frequent"), categorical_features)
])

cat_full = Pipeline([
        ("prep", preprocessor),
        ("catboost", CatBoostClassifier(
            cat_features=cat_idx,
            learning_rate=0.05,
            depth=6,
            random_state=42,
            verbose=False
        ))
    ])
cat_full.fit(X, y)

0,1,2
,steps,"[('prep', ...), ('catboost', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False


In [18]:
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy = "median"))])

categorical_transformer = Pipeline([("imputer", SimpleImputer(strategy = "most_frequent")),
                                   ("onehot", OneHotEncoder(handle_unknown="ignore"))
                                   ])
preprocessor = ColumnTransformer([("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])
lgbm_full = Pipeline([
        ("prep", preprocessor),
        ("lgbm", LGBMClassifier(
            n_estimators=100,
            num_leaves=16,
            max_depth=4,
            min_data_in_leaf=20,
            feature_fraction=0.7,
            bagging_fraction=0.7,
            learning_rate=0.05,
            random_state=42,
            n_jobs=-1,
            verbosity=-1
        ))
    ])
lgbm_full.fit(X,y)

0,1,2
,steps,"[('prep', ...), ('lgbm', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,boosting_type,'gbdt'
,num_leaves,16
,max_depth,4
,learning_rate,0.05
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [19]:
df_test = pd.read_csv(DATA_PATH / "Spaceship Titanic" / "test.csv")
df_test.head()
df_test.shape

(4277, 13)

In [20]:
use_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall']
X_test=df_test[use_cols].copy()
X_test.shape

(4277, 9)

In [21]:
lgbm_test = lgbm_full.predict_proba(X_test)[:, 1]
cat_test  = cat_full.predict_proba(X_test)[:, 1]
xgb_test  = xgb_full.predict_proba(X_test)[:, 1]

In [22]:
meta_test = np.column_stack([lgbm_test, cat_test, xgb_test])

In [23]:
meta_test

array([[0.64246514, 0.59669166, 0.69917673],
       [0.21781888, 0.13087901, 0.30032739],
       [0.97482674, 0.99143646, 0.69967383],
       ...,
       [0.90784476, 0.96166183, 0.69917673],
       [0.54837053, 0.5318325 , 0.6523419 ],
       [0.63975317, 0.63539852, 0.73896229]], shape=(4277, 3))

In [24]:
final_preds = meta_model.predict_proba(meta_test)[:,1]
final_preds

array([0.70906439, 0.18854197, 0.88000703, ..., 0.86532437, 0.63313151,
       0.74260151], shape=(4277,))

In [25]:
sub = pd.read_csv(DATA_PATH / "Spaceship Titanic" / "sample_submission.csv")
sub.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False


In [26]:
sub["Transported"] = final_preds
sub["Transported"] = sub["Transported"].astype(bool)
sub.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,True
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [27]:
sub.to_csv("submission_2.csv", index=False)