In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import roc_auc_score, mean_squared_error, accuracy_score

from tqdm import tqdm

import warnings; warnings.filterwarnings('ignore')

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

In [2]:
import random
import os

def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

In [3]:
from pathlib import Path
DATA_PATH = Path("./data/")
OUTPUT_PATH = Path("./output/")

# General settings
SEED = 42
N_FOLDS = 10

# Metric choice placeholder
#Adjust depending on the competition
METRIC = "auc"

In [4]:
def get_folds(
    X,
    y,
    cv_type="kfold",
    n_splits=5,
    shuffle=True,
    random_state=42,
    groups=None,
):
    """
    Unified CV builder for Kaggle workflows.
    Returns a list of (train_idx, valid_idx) pairs.
    """

    cv_type = cv_type.lower()

    if cv_type not in ["kfold", "stratified", "group"]:
        raise ValueError(f"Unknown cv_type: {cv_type}")

    # -------------------------
    # Stratified KFold
    # -------------------------
    if cv_type == "stratified":
        # Safety: each class must appear at least n_splits times
        unique, counts = np.unique(y, return_counts=True)
        if np.any(counts < n_splits):
            raise ValueError(
                "Some classes have fewer samples than n_splits. "
                "StratifiedKFold would be invalid."
            )

        splitter = StratifiedKFold(
            n_splits=n_splits,
            shuffle=shuffle,
            random_state=random_state,
        )
        folds = list(splitter.split(X, y))

    # -------------------------
    # Group KFold
    # -------------------------
    elif cv_type == "group":
        if groups is None:
            raise ValueError("groups must be provided for GroupKFold.")

        splitter = GroupKFold(n_splits=n_splits)
        folds = list(splitter.split(X, y, groups))

    # -------------------------
    # Standard KFold
    # -------------------------
    else:  # "kfold"
        splitter = KFold(
            n_splits=n_splits,
            shuffle=shuffle,
            random_state=random_state,
        )
        folds = list(splitter.split(X, y))

    return folds


In [5]:
def evaluate(y_true, y_pred, metric="auc"):
    """
    Compute score based on chosen metric.
    """
    metric = metric.lower()
    
    if metric == "auc":
        return roc_auc_score(y_true, y_pred)
    elif metric == "rmse":
        return mean_squared_error(y_true, y_pred, squared=False)
    elif metric == "accuracy":
        return accuracy_score(y_true, y_pred)
    else:
        raise ValueError(f"Unknown metric: {metric}")


In [6]:
def run_training(X, y, model, folds, metric="auc"):
    """
    Train model across folds and return OOF predictions and scores.
    """
    oof_preds = np.zeros(len(y))
    scores = []

    for fold, (train_idx, val_idx) in enumerate(folds):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else model.predict(X_val)
        oof_preds[val_idx] = preds

        score = evaluate(y_val, preds, metric)
        scores.append(score)
        print(f"Fold {fold+1} | {metric.upper()}: {score:.4f}")

    print(f"CV mean {metric.upper()}: {np.mean(scores):.4f} | std: {np.std(scores):.4f}")
    return oof_preds, scores

In [7]:
#Load Data
df = pd.read_csv(DATA_PATH / "Spaceship Titanic" / "train.csv")

In [8]:
# === Basic EDA ===

# Shape
print("Shape:", df.shape)

# Preview
display(df.head())

# Info
print("\nInfo:")
df.info()

# Missing values
print("\nMissing values:")
print(df.isna().sum())

# Target distribution 
if "Transported" in df.columns:
    print("\nTarget distribution:")
    print(df["Transported"].value_counts(normalize=True))

# Numerical stats
print("\nNumerical summary:")
display(df.describe())

# Categorical stats (top categories)
print("\nCategorical summary:")
for col in df.select_dtypes(include=["object", "category"]).columns:
    print(f"{col}: {df[col].nunique()} unique")


Shape: (8693, 14)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True



Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB

Missing values:
PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             17

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0



Categorical summary:
PassengerId: 8693 unique
HomePlanet: 3 unique
CryoSleep: 2 unique
Cabin: 6560 unique
Destination: 3 unique
VIP: 2 unique
Name: 8473 unique


In [9]:
target_col = "Transported"
use_cols = ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
X=df[use_cols].copy()
y = df[target_col].astype(int).copy()
X['HomePlanet'] = X['HomePlanet'].fillna("Missng")
X["CryoSleep"] = X["CryoSleep"].fillna("Unknown")
X["Cabin"] = X["Cabin"].fillna("Unknown")
X["Destination"] = X["Destination"].fillna("Unknown")
X["Age"] = X["Age"].fillna(X["Age"].median())
X["VIP"] = X["VIP"].fillna("False")
X["RoomService"] = X["RoomService"].fillna(X["RoomService"].median())
X["FoodCourt"] = X["FoodCourt"].fillna(X["FoodCourt"].median())
X["ShoppingMall"] = X["ShoppingMall"].fillna(X["ShoppingMall"].median())
X["Spa"] = X["Spa"].fillna(X["Spa"].median())
X["VRDeck"] = X["VRDeck"].fillna(X["VRDeck"].median())

cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

In [10]:
folds = get_folds(X, y, cv_type="stratified", n_splits=N_FOLDS, shuffle=True, random_state=SEED)

In [11]:
model = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.05,
    random_state=SEED,
    n_jobs=-1,
    verbosity = -1
)

oof_preds,scores = run_training(X, y, model, folds, metric=METRIC)

Fold 1 | AUC: 0.8750
Fold 2 | AUC: 0.8864
Fold 3 | AUC: 0.8980
Fold 4 | AUC: 0.8552
Fold 5 | AUC: 0.8852
Fold 6 | AUC: 0.8706
Fold 7 | AUC: 0.8801
Fold 8 | AUC: 0.8780
Fold 9 | AUC: 0.8766
Fold 10 | AUC: 0.8666
CV mean AUC: 0.8772 | std: 0.0111


In [12]:
df["Group"] = df["PassengerId"].str.split("_").str[0]
groups = df["Group"]

splitter = GroupKFold(n_splits=10)

folds_grouped = list(splitter.split(X, y, groups=groups))

In [13]:
oof_preds,scores = run_training(X, y, model, folds_grouped, metric=METRIC)

Fold 1 | AUC: 0.8758
Fold 2 | AUC: 0.8659
Fold 3 | AUC: 0.8901
Fold 4 | AUC: 0.8920
Fold 5 | AUC: 0.8782
Fold 6 | AUC: 0.8705
Fold 7 | AUC: 0.8635
Fold 8 | AUC: 0.8849
Fold 9 | AUC: 0.8735
Fold 10 | AUC: 0.8834
CV mean AUC: 0.8778 | std: 0.0093


In [14]:
splitter = KFold(
            n_splits=10,
            shuffle=True,
            random_state=42,
        )
folds_ord = list(splitter.split(X, y))

In [15]:
oof_preds,scores = run_training(X, y, model, folds_ord, metric=METRIC)

Fold 1 | AUC: 0.8643
Fold 2 | AUC: 0.8844
Fold 3 | AUC: 0.8813
Fold 4 | AUC: 0.8772
Fold 5 | AUC: 0.8706
Fold 6 | AUC: 0.8878
Fold 7 | AUC: 0.8712
Fold 8 | AUC: 0.8795
Fold 9 | AUC: 0.8753
Fold 10 | AUC: 0.8953
CV mean AUC: 0.8787 | std: 0.0086


In [16]:
test = pd.read_csv(DATA_PATH / "Spaceship Titanic" / "test.csv")
test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [17]:
final_model = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.05,
    random_state=SEED,
    n_jobs=-1,
    verbosity=-1
)

final_model.fit(X, y)

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [18]:
# Shape
print("Shape:", test.shape)

# Preview
display(test.head())

# Info
print("\nInfo:")
test.info()

# Missing values
print("\nMissing values:")
print(test.isna().sum())


Shape: (4277, 13)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez



Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB

Missing values:
PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt 

In [19]:
use_cols = ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
test_df=test[use_cols].copy()
test_df['HomePlanet'] = test_df['HomePlanet'].fillna("Missng")
test_df["CryoSleep"] = test_df["CryoSleep"].fillna("Unknown")
test_df["Cabin"] = test_df["Cabin"].fillna("Unknown")
test_df["Destination"] = test_df["Destination"].fillna("Unknown")
test_df["Age"] = test_df["Age"].fillna(X["Age"].median())
test_df["VIP"] = test_df["VIP"].fillna("False")
test_df["RoomService"] = test_df["RoomService"].fillna(X["RoomService"].median())
test_df["FoodCourt"] = test_df["FoodCourt"].fillna(X["FoodCourt"].median())
test_df["ShoppingMall"] = test_df["ShoppingMall"].fillna(X["ShoppingMall"].median())
test_df["Spa"] = test_df["Spa"].fillna(X["Spa"].median())
test_df["VRDeck"] = test_df["VRDeck"].fillna(X["VRDeck"].median())

cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
test_df = pd.get_dummies(test_df, columns=cat_cols, drop_first=True)
test_df = test_df.reindex(columns=X.columns, fill_value=0)

In [20]:
test_pred = final_model.predict(test_df)
test_pred

array([1, 0, 1, ..., 1, 0, 1], shape=(4277,))

In [21]:
sub = pd.read_csv(DATA_PATH / "Spaceship Titanic" / "sample_submission.csv")
sub.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False


In [22]:
sub["Transported"] = test_pred
sub["Transported"] = sub["Transported"].astype(bool)
sub.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [23]:
sub.to_csv("submission.csv", index=False)

In [24]:
import os
os.listdir()

['.git',
 '.gitignore',
 '.ipynb_checkpoints',
 'Baseline test on Titianic dataset.ipynb',
 'baseline_template.ipynb',
 'competitions',
 'data',
 'EDA',
 'models',
 'notebooks',
 'output',
 'README.md',
 'requirements.txt',
 'Spaceship Titanic Competition first ansemble',
 'Spaceship Titanic Competition.ipynb',
 'submission.csv',
 'utils',
 'venv']