In [6]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import warnings

# NOTE(review): this silences every warning raised in the kernel, not just
# noisy library ones; kept as-is so the cell's output stays unchanged.
warnings.filterwarnings('ignore')

# --- Load raw data -----------------------------------------------------------
train_df = pd.read_csv('../data/raw/train.csv')
test_df = pd.read_csv('../data/raw/test.csv')
sample_submission = pd.read_csv('../data/raw/sample_submission.csv')

test_passenger_ids = test_df['PassengerId']
train_ids = train_df['PassengerId']

# Pull the target out so train and test can go through one shared pipeline.
y = train_df['Transported'].copy()
train_df = train_df.drop(columns='Transported')
all_data = pd.concat([train_df, test_df], ignore_index=True)

# --- Feature engineering -----------------------------------------------------
# PassengerId is split on '_'; the first piece is used as a group key and the
# number of rows sharing that key becomes GroupSize.
all_data['GroupId'] = all_data['PassengerId'].str.split('_').str[0]
all_data['GroupSize'] = all_data.groupby('GroupId')['PassengerId'].transform('count')

# Cabin is split on '/' into three parts; the middle part is made numeric.
all_data[['Deck', 'CabinNum', 'Side']] = all_data['Cabin'].str.split('/', expand=True)
all_data['CabinNum'] = pd.to_numeric(all_data['CabinNum'])

spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# --- Imputation --------------------------------------------------------------
# Missing spend values of CryoSleep passengers are set to 0 before the generic
# median fill (assumption: a passenger in cryosleep makes no purchases).
cryo_mask = all_data['CryoSleep'] == True
for col in spend_cols:
    all_data.loc[cryo_mask & all_data[col].isnull(), col] = 0

# NOTE: medians/modes are computed over train and test rows combined.
numeric_cols = all_data.select_dtypes(include=np.number).columns.tolist()
categorical_cols = all_data.select_dtypes(exclude=np.number).columns.tolist()

for col in numeric_cols:
    all_data[col] = all_data[col].fillna(all_data[col].median())

for col in categorical_cols:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

# Derive both spend aggregates from the *imputed* columns so that TotalSpend
# and NoSpend always agree. Previously NoSpend was computed before imputation
# and never refreshed, while TotalSpend was recomputed afterwards.
all_data['TotalSpend'] = all_data[spend_cols].sum(axis=1)
all_data['NoSpend'] = (all_data['TotalSpend'] == 0).astype(int)

# --- Final encoding ----------------------------------------------------------
all_data = all_data.drop(columns=['PassengerId', 'Name', 'Cabin', 'GroupId'])

bool_cols = ['VIP', 'CryoSleep']
for col in bool_cols:
    if col in all_data.columns:
        all_data[col] = all_data[col].astype(int)

categorical_cols_to_encode = all_data.select_dtypes(include=['object']).columns
all_data = pd.get_dummies(all_data, columns=categorical_cols_to_encode, dummy_na=False)

# Train rows come first in all_data because of the concat order above.
X = all_data.iloc[:len(train_ids)]
X_test = all_data.iloc[len(train_ids):]
y = y.astype(int)

assert len(X) == len(y), "feature/target row mismatch"
assert len(X_test) == len(test_passenger_ids), "test feature/id row mismatch"

# Baseline model: one LightGBM classifier with fixed hyperparameters, fit on
# the full training matrix.
lgbm = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=20,
    random_state=42,
)
lgbm.fit(X, y)
predictions = lgbm.predict(X_test)

# Turn the 0/1 class labels back into booleans for the 'Transported' column.
submission_preds = (predictions == 1)

submission_columns = {
    'PassengerId': test_passenger_ids,
    'Transported': submission_preds,
}
submission = pd.DataFrame(submission_columns)

# Write without the index so the file has exactly the two columns above.
submission.to_csv('../submissions/baseline_submission.csv', index=False)

print("Submission file 'baseline_submission.csv' created successfully!")
print(submission.head())


[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000985 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1909
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503624 -> initscore=0.014495
[LightGBM] [Info] Start training from score 0.014495
Submission file 'baseline_submission.csv' created successfully!
  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01         True
Training Logistic Regression...

--- Blending Predictions ---

Submission file 'ensemble_submission.csv' created successfully!
  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01         T

In [7]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import warnings

# NOTE(review): this silences every warning raised in the kernel, not just
# noisy library ones; kept as-is so the cell's output stays unchanged.
warnings.filterwarnings('ignore')

# =============================================================================
# 1. Data Processing (Our proven pipeline)
# =============================================================================

train_df = pd.read_csv('../data/raw/train.csv')
test_df = pd.read_csv('../data/raw/test.csv')

test_passenger_ids = test_df['PassengerId']
train_ids = train_df['PassengerId']
y = train_df['Transported'].copy()

# Drop target for concatenation so train and test share one feature pipeline.
train_df = train_df.drop(columns='Transported')
all_data = pd.concat([train_df, test_df], ignore_index=True)

# --- Feature Engineering ---
# First '_'-separated piece of PassengerId is the group key; GroupSize counts
# the rows sharing it.
all_data['GroupId'] = all_data['PassengerId'].str.split('_').str[0]
all_data['GroupSize'] = all_data.groupby('GroupId')['PassengerId'].transform('count')
# Cabin is split on '/' into three parts; the middle part is made numeric.
all_data[['Deck', 'CabinNum', 'Side']] = all_data['Cabin'].str.split('/', expand=True)
all_data['CabinNum'] = pd.to_numeric(all_data['CabinNum'])
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# --- Imputing Missing Values ---
# Missing spend values of CryoSleep passengers become 0 before the generic
# median fill (assumption: a passenger in cryosleep makes no purchases).
cryo_mask = all_data['CryoSleep'] == True
for col in spend_cols:
    all_data.loc[cryo_mask & all_data[col].isnull(), col] = 0

# NOTE: medians/modes are computed over train and test rows combined.
numeric_cols = all_data.select_dtypes(include=np.number).columns.tolist()
categorical_cols = all_data.select_dtypes(exclude=np.number).columns.tolist()
for col in numeric_cols:
    all_data[col] = all_data[col].fillna(all_data[col].median())
for col in categorical_cols:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

# Both spend aggregates are derived from the *imputed* columns so they cannot
# disagree. Previously NoSpend was computed before imputation and never
# refreshed, while TotalSpend was recomputed afterwards.
all_data['TotalSpend'] = all_data[spend_cols].sum(axis=1)
all_data['NoSpend'] = (all_data['TotalSpend'] == 0).astype(int)

# --- Final Prep ---
all_data = all_data.drop(columns=['PassengerId', 'Name', 'Cabin', 'GroupId'])
bool_cols = ['VIP', 'CryoSleep']
for col in bool_cols:
    if col in all_data.columns:
        all_data[col] = all_data[col].astype(int)
categorical_cols_to_encode = all_data.select_dtypes(include=['object']).columns
all_data = pd.get_dummies(all_data, columns=categorical_cols_to_encode, dummy_na=False)

# Separate train and test (train rows come first because of the concat order).
X = all_data.iloc[:len(train_ids)]
X_test = all_data.iloc[len(train_ids):]
y = y.astype(int)

assert len(X) == len(y), "feature/target row mismatch"
assert len(X_test) == len(test_passenger_ids), "test feature/id row mismatch"

# Scale data for Logistic Regression; the scaler is fit on training rows only
# and then applied to the test rows.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)


# =============================================================================
# 2. Model Training
# =============================================================================
print("--- Training Ensemble Models ---")

# Three base learners with fixed hyperparameters and a shared seed.
lgbm = lgb.LGBMClassifier(
    random_state=42,
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=20,
)
xgboost = xgb.XGBClassifier(
    random_state=42,
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    use_label_encoder=False,
    eval_metric='logloss',
)
logreg = LogisticRegression(random_state=42, max_iter=1000)

# Fit each model on the full training set. The tree models take the raw
# feature matrix; logistic regression takes the standardized one.
training_plan = (
    ("LightGBM", lgbm, X),
    ("XGBoost", xgboost, X),
    ("Logistic Regression", logreg, X_scaled),
)
for model_label, estimator, train_matrix in training_plan:
    print(f"Training {model_label}...")
    estimator.fit(train_matrix, y)


# =============================================================================
# 3. Blending Predictions and Creating Submission
# =============================================================================
print("\n--- Blending Predictions ---")

# Column 1 of predict_proba is the probability of the positive class
# (Transported=True). Logistic regression is scored on the scaled test matrix.
lgbm_probs = lgbm.predict_proba(X_test)[:, 1]
xgboost_probs = xgboost.predict_proba(X_test)[:, 1]
logreg_probs = logreg.predict_proba(X_test_scaled)[:, 1]

# Fixed blend weights: the two tree models share 80%, logistic regression 20%.
weight_lgbm, weight_xgb, weight_logreg = 0.4, 0.4, 0.2
final_probs = (
    weight_lgbm * lgbm_probs
    + weight_xgb * xgboost_probs
    + weight_logreg * logreg_probs
)

# A blended probability strictly above 0.5 is predicted as True.
final_preds = (final_probs > 0.5)

# Assemble and write the submission file (no index column).
submission_columns = {
    'PassengerId': test_passenger_ids,
    'Transported': final_preds,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('../submissions/ensemble_submission.csv', index=False)

print("\nSubmission file 'ensemble_submission.csv' created successfully!")
print(submission.head())

--- Training Ensemble Models ---
Training LightGBM...
[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000521 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1909
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503624 -> initscore=0.014495
[LightGBM] [Info] Start training from score 0.014495
Training XGBoost...


In [8]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
import warnings

# NOTE(review): this silences every warning raised in the kernel, not just
# noisy library ones; kept as-is so the cell's output stays unchanged.
warnings.filterwarnings('ignore')

# =============================================================================
# 1. Data Processing (Our proven pipeline)
# =============================================================================

train_df = pd.read_csv('../data/raw/train.csv')
test_df = pd.read_csv('../data/raw/test.csv')

test_passenger_ids = test_df['PassengerId']
train_ids = train_df['PassengerId']
y = train_df['Transported'].copy().astype(int)

# Drop the target so train and test can share one feature pipeline.
train_df = train_df.drop(columns='Transported')
all_data = pd.concat([train_df, test_df], ignore_index=True)

# --- Feature engineering ---
# First '_'-separated piece of PassengerId is the group key; GroupSize counts
# the rows sharing it.
all_data['GroupId'] = all_data['PassengerId'].str.split('_').str[0]
all_data['GroupSize'] = all_data.groupby('GroupId')['PassengerId'].transform('count')
# Cabin is split on '/' into three parts; the middle part is made numeric.
all_data[['Deck', 'CabinNum', 'Side']] = all_data['Cabin'].str.split('/', expand=True)
all_data['CabinNum'] = pd.to_numeric(all_data['CabinNum'])
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# --- Imputation ---
# Missing spend values of CryoSleep passengers become 0 before the generic
# median fill (assumption: a passenger in cryosleep makes no purchases).
cryo_mask = all_data['CryoSleep'] == True
for col in spend_cols:
    all_data.loc[cryo_mask & all_data[col].isnull(), col] = 0

# NOTE: medians/modes are computed over train and test rows combined.
numeric_cols = all_data.select_dtypes(include=np.number).columns.tolist()
categorical_cols = all_data.select_dtypes(exclude=np.number).columns.tolist()
for col in numeric_cols:
    all_data[col] = all_data[col].fillna(all_data[col].median())
for col in categorical_cols:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

# Both spend aggregates are derived from the *imputed* columns so they cannot
# disagree. Previously NoSpend was computed before imputation and never
# refreshed, while TotalSpend was recomputed afterwards.
all_data['TotalSpend'] = all_data[spend_cols].sum(axis=1)
all_data['NoSpend'] = (all_data['TotalSpend'] == 0).astype(int)

# --- Final encoding ---
all_data = all_data.drop(columns=['PassengerId', 'Name', 'Cabin', 'GroupId'])
bool_cols = ['VIP', 'CryoSleep']
for col in bool_cols:
    if col in all_data.columns:
        all_data[col] = all_data[col].astype(int)
categorical_cols_to_encode = all_data.select_dtypes(include=['object']).columns
all_data = pd.get_dummies(all_data, columns=categorical_cols_to_encode, dummy_na=False)

# Train rows come first in all_data because of the concat order above.
X = all_data.iloc[:len(train_ids)]
X_test = all_data.iloc[len(train_ids):]

assert len(X) == len(y), "feature/target row mismatch"
assert len(X_test) == len(test_passenger_ids), "test feature/id row mismatch"

# Standardized copies for the linear model; the scaler is fit on training rows
# only and then applied to the test rows.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# =============================================================================
# 2. Stacking Implementation
# =============================================================================
print("--- Starting Stacking ---")

N_SPLITS = 10
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Out-of-fold (OOF) predictions: each training row is filled in exactly once,
# by the model of the fold in which that row was held out.
oof_preds_lgbm = np.zeros((len(X),))
oof_preds_xgb = np.zeros((len(X),))
oof_preds_logreg = np.zeros((len(X),))

# Test predictions: every fold's model scores the full test set and the
# per-fold probabilities are averaged (each contributes 1/N_SPLITS).
test_preds_lgbm = np.zeros((len(X_test),))
test_preds_xgb = np.zeros((len(X_test),))
test_preds_logreg = np.zeros((len(X_test),))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")

    # Split data for this fold
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val = X.iloc[val_idx]

    # Fit the scaler on this fold's training rows only. Reusing a scaler that
    # was fit on the whole training set would let the held-out rows' mean and
    # variance leak into the logistic-regression OOF predictions.
    fold_scaler = StandardScaler().fit(X_train)
    X_train_scaled = fold_scaler.transform(X_train)
    X_val_scaled = fold_scaler.transform(X_val)
    X_test_fold_scaled = fold_scaler.transform(X_test)

    # --- Train and predict with LGBM ---
    lgbm = lgb.LGBMClassifier(random_state=42, n_estimators=200, learning_rate=0.05, num_leaves=20)
    lgbm.fit(X_train, y_train)
    oof_preds_lgbm[val_idx] = lgbm.predict_proba(X_val)[:, 1]
    test_preds_lgbm += lgbm.predict_proba(X_test)[:, 1] / N_SPLITS

    # --- Train and predict with XGBoost ---
    xgboost = xgb.XGBClassifier(random_state=42, n_estimators=200, learning_rate=0.05, max_depth=5, use_label_encoder=False, eval_metric='logloss')
    xgboost.fit(X_train, y_train)
    oof_preds_xgb[val_idx] = xgboost.predict_proba(X_val)[:, 1]
    test_preds_xgb += xgboost.predict_proba(X_test)[:, 1] / N_SPLITS

    # --- Train and predict with Logistic Regression (fold-scaled inputs) ---
    logreg = LogisticRegression(random_state=42, max_iter=1000)
    logreg.fit(X_train_scaled, y_train)
    oof_preds_logreg[val_idx] = logreg.predict_proba(X_val_scaled)[:, 1]
    test_preds_logreg += logreg.predict_proba(X_test_fold_scaled)[:, 1] / N_SPLITS

# =============================================================================
# 3. Train Meta-Model and Create Submission
# =============================================================================
print("\n--- Training Meta-Model ---")

# Meta-features: one column per base model (LightGBM, XGBoost, logistic
# regression). Training rows use the out-of-fold probabilities; test rows use
# the fold-averaged test probabilities, in the same column order.
meta_X = np.column_stack((oof_preds_lgbm, oof_preds_xgb, oof_preds_logreg))
meta_X_test = np.column_stack((test_preds_lgbm, test_preds_xgb, test_preds_logreg))

# Second-level learner: a logistic regression fit on the base-model outputs.
meta_model = LogisticRegression(random_state=42)
meta_model.fit(meta_X, y)

# Positive-class probability, thresholded strictly above 0.5.
final_probs = meta_model.predict_proba(meta_X_test)[:, 1]
final_preds = (final_probs > 0.5)

# Assemble and write the submission file (no index column).
submission_columns = {
    'PassengerId': test_passenger_ids,
    'Transported': final_preds,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('../submissions/stacking_submission.csv', index=False)

print("\nSubmission file 'stacking_submission.csv' created successfully!")
print(submission.head())

--- Starting Stacking ---
--- Fold 1/10 ---
[LightGBM] [Info] Number of positive: 3940, number of negative: 3883
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000530 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1909
[LightGBM] [Info] Number of data points in the train set: 7823, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503643 -> initscore=0.014573
[LightGBM] [Info] Start training from score 0.014573


In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import optuna
import warnings

# NOTE(review): this silences every warning raised in the kernel, not just
# noisy library ones; kept as-is so the cell's output stays unchanged.
warnings.filterwarnings('ignore')

# =============================================================================
# 1. Data Processing (Our proven pipeline)
# =============================================================================

train_df = pd.read_csv('../data/raw/train.csv')
test_df = pd.read_csv('../data/raw/test.csv')

test_passenger_ids = test_df['PassengerId']
train_ids = train_df['PassengerId']
y = train_df['Transported'].copy().astype(int)

# Drop the target so train and test can share one feature pipeline.
train_df = train_df.drop(columns='Transported')
all_data = pd.concat([train_df, test_df], ignore_index=True)

# --- Feature engineering ---
# First '_'-separated piece of PassengerId is the group key; GroupSize counts
# the rows sharing it.
all_data['GroupId'] = all_data['PassengerId'].str.split('_').str[0]
all_data['GroupSize'] = all_data.groupby('GroupId')['PassengerId'].transform('count')
# Cabin is split on '/' into three parts; the middle part is made numeric.
all_data[['Deck', 'CabinNum', 'Side']] = all_data['Cabin'].str.split('/', expand=True)
all_data['CabinNum'] = pd.to_numeric(all_data['CabinNum'])
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# --- Imputation ---
# Missing spend values of CryoSleep passengers become 0 before the generic
# median fill (assumption: a passenger in cryosleep makes no purchases).
cryo_mask = all_data['CryoSleep'] == True
for col in spend_cols:
    all_data.loc[cryo_mask & all_data[col].isnull(), col] = 0

# NOTE: medians/modes are computed over train and test rows combined.
numeric_cols = all_data.select_dtypes(include=np.number).columns.tolist()
categorical_cols = all_data.select_dtypes(exclude=np.number).columns.tolist()
for col in numeric_cols:
    all_data[col] = all_data[col].fillna(all_data[col].median())
for col in categorical_cols:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

# Both spend aggregates are derived from the *imputed* columns so they cannot
# disagree. Previously NoSpend was computed before imputation and never
# refreshed, while TotalSpend was recomputed afterwards.
all_data['TotalSpend'] = all_data[spend_cols].sum(axis=1)
all_data['NoSpend'] = (all_data['TotalSpend'] == 0).astype(int)

# --- Final encoding ---
all_data = all_data.drop(columns=['PassengerId', 'Name', 'Cabin', 'GroupId'])
bool_cols = ['VIP', 'CryoSleep']
for col in bool_cols:
    if col in all_data.columns:
        all_data[col] = all_data[col].astype(int)
categorical_cols_to_encode = all_data.select_dtypes(include=['object']).columns
all_data = pd.get_dummies(all_data, columns=categorical_cols_to_encode, dummy_na=False)

# Train rows come first in all_data because of the concat order above.
X = all_data.iloc[:len(train_ids)]
X_test = all_data.iloc[len(train_ids):]

assert len(X) == len(y), "feature/target row mismatch"
assert len(X_test) == len(test_passenger_ids), "test feature/id row mismatch"

# Standardized copies for the linear model; the scaler is fit on training rows
# only and then applied to the test rows.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# =============================================================================
# 2. Hyperparameter Tuning with Optuna for LightGBM
# =============================================================================
print("--- Starting Hyperparameter Tuning for LightGBM ---")

# Settings that are identical for every trial AND for the final model, so the
# model trained after tuning is configured exactly like the one that was scored.
# `subsample_freq` must be > 0 for LightGBM to perform bagging at all; with the
# default of 0 the tuned `subsample` value is silently ignored.
LGBM_FIXED_PARAMS = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'subsample_freq': 1,
    'random_state': 42,
}

def objective(trial):
    """Score one LightGBM hyperparameter configuration.

    Samples a configuration from `trial`, evaluates it with 5-fold stratified
    cross-validation on the module-level `X` / `y`, and returns the mean
    validation accuracy (the study maximizes this value).
    """
    param = {
        **LGBM_FIXED_PARAMS,
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
    }

    model = lgb.LGBMClassifier(**param)

    # Fixed seed: every trial is evaluated on the same fold assignment.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    for train_idx, val_idx in skf.split(X, y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        # fit() retrains from scratch, so reusing `model` across folds is safe.
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        scores.append(accuracy_score(y_val, preds))

    return np.mean(scores)

# Seed the sampler so the search is reproducible on Restart & Run All.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50) # 50 trials is a good balance

print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Merge into a new dict (does not mutate trial.params) and carry the fixed
# settings over so the final model matches the tuned configuration.
best_lgbm_params = {**LGBM_FIXED_PARAMS, **trial.params}

# =============================================================================
# 3. Final Model Training and Submission
# =============================================================================
print("\n--- Training Final Ensemble with Tuned LGBM ---")

# LightGBM uses the parameters selected by the tuning step; the other two
# models keep fixed hyperparameters.
lgbm_tuned = lgb.LGBMClassifier(**best_lgbm_params)
xgboost = xgb.XGBClassifier(
    random_state=42,
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    use_label_encoder=False,
    eval_metric='logloss',
)
logreg = LogisticRegression(random_state=42, max_iter=1000)

# Fit each model on the full training set. The tree models take the raw
# feature matrix; logistic regression takes the standardized one.
training_plan = (
    ("Tuned LightGBM", lgbm_tuned, X),
    ("XGBoost", xgboost, X),
    ("Logistic Regression", logreg, X_scaled),
)
for model_label, estimator, train_matrix in training_plan:
    print(f"Training {model_label}...")
    estimator.fit(train_matrix, y)

# Column 1 of predict_proba is the positive-class probability.
lgbm_probs = lgbm_tuned.predict_proba(X_test)[:, 1]
xgboost_probs = xgboost.predict_proba(X_test)[:, 1]
logreg_probs = logreg.predict_proba(X_test_scaled)[:, 1]

# Fixed blend weights: the two tree models share 80%, logistic regression 20%.
weight_lgbm, weight_xgb, weight_logreg = 0.4, 0.4, 0.2
final_probs = (
    weight_lgbm * lgbm_probs
    + weight_xgb * xgboost_probs
    + weight_logreg * logreg_probs
)
# A blended probability strictly above 0.5 is predicted as True.
final_preds = (final_probs > 0.5)

# Assemble and write the submission file (no index column).
submission_columns = {
    'PassengerId': test_passenger_ids,
    'Transported': final_preds,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('../submissions/tuned_ensemble_submission.csv', index=False)

print("\nSubmission file 'tuned_ensemble_submission.csv' created successfully!")
print(submission.head())