In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import roc_auc_score, mean_squared_error, accuracy_score

from tqdm import tqdm

import warnings; warnings.filterwarnings('ignore')

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score


import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import random
import os

def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

In [3]:
from pathlib import Path
DATA_PATH = Path("C:\ML_Projects\kaggle-workflow\data/data/")
OUTPUT_PATH = Path("./output/")

# General settings
SEED = 42
N_FOLDS = 10

# Metric choice placeholder
#Adjust depending on the competition
METRIC = "auc"

In [1]:
#Load Data
df = pd.read_csv( / "Spaceship Titanic" / "train.csv")


SyntaxError: invalid syntax (1232162115.py, line 2)

In [5]:
# === Basic EDA for training data ===

# Shape
print("Shape:", df.shape)

# Preview
display(df.head())

# Info
print("\nInfo:")
df.info()

# Missing values
print("\nMissing values:")
print(df.isna().sum())

# Target distribution 
if "Transported" in df.columns:
    print("\nTarget distribution:")
    print(df["Transported"].value_counts(normalize=True))

# Numerical stats
print("\nNumerical summary:")
display(df.describe())

# Categorical stats (top categories)
print("\nCategorical summary:")
for col in df.select_dtypes(include=["object", "category"]).columns:
    print(f"{col}: {df[col].nunique()} unique")

Shape: (8693, 14)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True



Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB

Missing values:
PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             17

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0



Categorical summary:
PassengerId: 8693 unique
HomePlanet: 3 unique
CryoSleep: 2 unique
Cabin: 6560 unique
Destination: 3 unique
VIP: 2 unique
Name: 8473 unique


In [6]:
df_test = pd.read_csv(DATA_PATH / "Spaceship Titanic" / "test.csv")


In [7]:
# === Basic EDA for test data ===

# Shape
print("Shape:", df_test.shape)

# Preview
display(df_test.head())

# Info
print("\nInfo:")
df_test.info()

# Missing values
print("\nMissing values:")
print(df_test.isna().sum())


# Numerical stats
print("\nNumerical summary:")
display(df_test.describe())

# Categorical stats (top categories)
print("\nCategorical summary:")
for col in df_test.select_dtypes(include=["object", "category"]).columns:
    print(f"{col}: {df[col].nunique()} unique")


Shape: (4277, 13)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez



Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB

Missing values:
PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt 

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,4186.0,4195.0,4171.0,4179.0,4176.0,4197.0
mean,28.658146,219.266269,439.484296,177.295525,303.052443,310.710031
std,14.179072,607.011289,1527.663045,560.821123,1117.186015,1246.994742
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,26.0,0.0,0.0,0.0,0.0,0.0
75%,37.0,53.0,78.0,33.0,50.0,36.0
max,79.0,11567.0,25273.0,8292.0,19844.0,22272.0



Categorical summary:
PassengerId: 8693 unique
HomePlanet: 3 unique
CryoSleep: 2 unique
Cabin: 6560 unique
Destination: 3 unique
VIP: 2 unique
Name: 8473 unique


In [8]:
#Preprocessing
target_col = "Transported"
pred_col = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
features = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall']
num_col = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall']
cat_col = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination','VIP']

X = df[features].copy()
y=df[target_col].copy().astype(int)

In [9]:
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy = "median"))])

categorical_transformer = Pipeline([("imputer", SimpleImputer(strategy = "most_frequent")),
                                   ("onehot", OneHotEncoder(handle_unknown="ignore"))
                                   ])
preprocessor = ColumnTransformer([("num", numeric_transformer, num_col),
    ("cat", categorical_transformer, cat_col)
])
c

In [10]:
LGBM_oof = oof_preds
LGBM_oof

array([0.31346759, 0.18394385, 0.57951732, ..., 0.65372249, 0.2379046 ,
       0.66656945], shape=(8693,))

In [11]:
cat_idx = list(range(len(num_col), len(num_col) + len(cat_col)))

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_col),
    ("cat", SimpleImputer(strategy="most_frequent"), cat_col)
])

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = Pipeline([
        ("prep", preprocessor),
        ("catboost", CatBoostClassifier(
            cat_features=cat_idx,
            learning_rate=0.05,
            depth=6,
            random_state=42,
            verbose=False
        ))
    ])

    model.fit(X_train, y_train)
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]

    models.append(model)

In [12]:
Cat_oof = oof_preds
Cat_oof

array([0.46137761, 0.12550541, 0.61520358, ..., 0.78914112, 0.15735932,
       0.80075566], shape=(8693,))

In [13]:
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy = "median"))])

categorical_transformer = Pipeline([("imputer", SimpleImputer(strategy = "most_frequent")),
                                   ("onehot", OneHotEncoder(handle_unknown="ignore"))
                                   ])
preprocessor = ColumnTransformer([("num", numeric_transformer, num_col),
    ("cat", categorical_transformer, cat_col)
])
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = Pipeline([
        ("prep", preprocessor),
        ("xgb", XGBClassifier(
            n_estimators=100,
            max_depth=4,
            subsample = 0.7,
            colsample_bytree = 0.7,
            min_child_weight = 3,
            learning_rate=0.05,
            eval_metric = "logloss",
            random_state=42,
            n_jobs = 1,
            verbosity = 0
        ))
    ])

    model.fit(X_train, y_train)
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]

    models.append(model)

In [14]:
xgb_oof = oof_preds
xgb_oof

array([0.40304902, 0.14556043, 0.58859462, ..., 0.74021018, 0.28323397,
       0.69207489], shape=(8693,))

In [15]:
meta_X = np.column_stack([LGBM_oof,Cat_oof,xgb_oof])
meta_y = y.values

In [16]:
from sklearn.linear_model import LogisticRegression

meta_model = LogisticRegression()
meta_model.fit(meta_X, meta_y)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [17]:
#test preprocessing
X_test = df_test[features].copy()

In [18]:
#Test pred from 

numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy = "median"))])

categorical_transformer = Pipeline([("imputer", SimpleImputer(strategy = "most_frequent")),
                                   ("onehot", OneHotEncoder(handle_unknown="ignore"))
                                   ])
preprocessor = ColumnTransformer([("num", numeric_transformer, num_col),
    ("cat", categorical_transformer, cat_col)
])
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

lgbm_test_preds = []

base_lgbm = Pipeline([
    ("prep", preprocessor),
    ("lgbm", LGBMClassifier(
        n_estimators=100,
        num_leaves=16,
        max_depth=4,
        min_data_in_leaf=20,
        feature_fraction=0.7,
        bagging_fraction=0.7,
        learning_rate=0.05,
        random_state=42,
        n_jobs=-1,
        verbosity=-1))])

# K-fold loop (same folds as for OOF)
for train_idx, _ in kf.split(X, y):

    # Clone pipeline = new clean model each fold
    model = base_lgbm

    # Fit on the fold's training data
    model.fit(X.iloc[train_idx], y.iloc[train_idx])

    # Predict test set for this fold
    preds = model.predict_proba(X_test)[:, 1]

    # Store fold predictions
    lgbm_test_preds.append(preds)

# Average test predictions over folds
lgbm_test_final = np.mean(lgbm_test_preds, axis=0)

In [19]:
lgbm_test_final

array([0.63630342, 0.21767737, 0.97746357, ..., 0.91150692, 0.55290573,
       0.62950206], shape=(4277,))

In [20]:
cat_idx = list(range(len(num_col), len(num_col) + len(cat_col)))

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_col),
    ("cat", SimpleImputer(strategy="most_frequent"), cat_col)
])

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cat_test_preds=[]

base_cat = Pipeline([
        ("prep", preprocessor),
        ("catboost", CatBoostClassifier(
            cat_features=cat_idx,
            learning_rate=0.05,
            depth=6,
            random_state=42,
            verbose=False
        ))
    ])
# K-fold loop (same folds as for OOF)
for train_idx, _ in kf.split(X, y):

    # Clone pipeline = new clean model each fold
    model = base_cat

    # Fit on the fold's training data
    model.fit(X.iloc[train_idx], y.iloc[train_idx])

    # Predict test set for this fold
    preds = model.predict_proba(X_test)[:, 1]

    # Store fold predictions
    cat_test_preds.append(preds)



In [21]:
# Average test predictions over folds
cat_test_final = np.mean(cat_test_preds, axis=0)
cat_test_final

array([0.60651198, 0.1517073 , 0.98826348, ..., 0.95673651, 0.56907122,
       0.62219706], shape=(4277,))

In [22]:
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy = "median"))])

categorical_transformer = Pipeline([("imputer", SimpleImputer(strategy = "most_frequent")),
                                   ("onehot", OneHotEncoder(handle_unknown="ignore"))
                                   ])
preprocessor = ColumnTransformer([("num", numeric_transformer, num_col),
    ("cat", categorical_transformer, cat_col)
])
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

xgb_test_preds = []

base_xgb = Pipeline([
        ("prep", preprocessor),
        ("xgb", XGBClassifier(
            n_estimators=100,
            max_depth=4,
            subsample = 0.7,
            colsample_bytree = 0.7,
            min_child_weight = 3,
            learning_rate=0.05,
            eval_metric = "logloss",
            random_state=42,
            n_jobs = 1,
            verbosity = 0
        ))
    ])

# K-fold loop (same folds as for OOF)
for train_idx, _ in kf.split(X, y):

    # Clone pipeline = new clean model each fold
    model = base_xgb

    # Fit on the fold's training data
    model.fit(X.iloc[train_idx], y.iloc[train_idx])

    # Predict test set for this fold
    preds = model.predict_proba(X_test)[:, 1]

    # Store fold predictions
    xgb_test_preds.append(preds)

# Average test predictions over folds
xgb_test_final = np.mean(xgb_test_preds, axis=0)

In [23]:
xgb_test_final

array([0.6269252 , 0.19040468, 0.9699403 , ..., 0.91367406, 0.5513859 ,
       0.6450687 ], shape=(4277,), dtype=float32)

In [24]:
meta_test = np.column_stack([lgbm_test_final, cat_test_final, xgb_test_final])
meta_test

array([[0.63630342, 0.60651198, 0.62692517],
       [0.21767737, 0.1517073 , 0.19040468],
       [0.97746357, 0.98826348, 0.9699403 ],
       ...,
       [0.91150692, 0.95673651, 0.91367406],
       [0.55290573, 0.56907122, 0.55138588],
       [0.62950206, 0.62219706, 0.64506871]], shape=(4277, 3))

In [25]:
final_probs = meta_model.predict_proba(meta_test)[:, 1]
final_probs

array([0.67214028, 0.15428939, 0.93567432, ..., 0.91691459, 0.5895075 ,
       0.68672471], shape=(4277,))

In [26]:
final_probs.shape[0] == len(X_test)

True

In [27]:
np.isnan(final_probs).sum() == 0

np.True_

In [28]:
sub = pd.read_csv(DATA_PATH / "Spaceship Titanic" / "sample_submission.csv")

In [29]:
sub["Transported"] = final_probs>0.5
sub["Transported"] = sub["Transported"].astype(bool)
sub.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [30]:
sub.to_csv("submission_4.csv", index=False)