In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.pipeline import clone
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import mode

In [None]:
train_df = pd.read_csv('train.csv')

# set up for further filtering
train_df['NameLength'] = train_df['Name'].str.len()
train_df['Cabin'] = train_df['Cabin'].astype(str)
train_df['Cabin'] = train_df['Cabin'].str[0] + train_df['Cabin'].str[-1]
train_df['total spending'] = train_df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
train_df['VIP'] = train_df['VIP'].fillna(False).astype(int)
train_df['CryoSleep'] = train_df['CryoSleep'].fillna(False).astype(int)

# needed to calculate the distribution of positives to sort catagorical features
transported_true = train_df[train_df['Transported'] == True]

# sort homeplanet catagories by True values percentages
counts = train_df['HomePlanet'].value_counts()
transported_counts = transported_true.groupby('HomePlanet').size()
percentages = transported_counts/counts
homeplanet_map = percentages.sort_values(ascending=False)
planet_map = {planet: rank for rank, planet in enumerate(homeplanet_map.index)}
train_df['HomePlanet'] = train_df['HomePlanet'].map(planet_map)


# sort destination catagories by True values percentages
Destcounts = train_df['Destination'].value_counts()
transDest_counts = transported_true.groupby('Destination').size()
percent_vals = transDest_counts/Destcounts
dest_map = percent_vals.sort_values(ascending=False)
d_map = {planet: rank for rank, planet in enumerate(dest_map.index)}
train_df['Destination'] = train_df['Destination'].map(d_map)

# sort Cabin catagories by True values percentages
# This does not need to be divided by counts as in my previous notebook it is 
# shown that there is less than a 1% differance in size of groups
cabin_counts = train_df[train_df['Transported'] == True].groupby('Cabin').size()
sorted_cabins = cabin_counts.sort_values(ascending=False).index
cabin_mapping = {cabin: rank for rank, cabin in enumerate(sorted_cabins)}
train_df['Cabin_ordinal'] = train_df['Cabin'].map(cabin_mapping).fillna(-1).astype(int)


# Targets
targets = train_df['Transported']


#drop unused Catagories
filtered_X = train_df[['HomePlanet', 'CryoSleep', 'Age','VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','NameLength', 'total spending', 'Cabin_ordinal']]

cabin_block = train_df[['Cabin_ordinal']]
spend_block = train_df[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','total spending']]
age_block = train_df[['Age','CryoSleep','VIP']]
travel_block = train_df[['HomePlanet','Destination']]


train_df


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,NameLength,total spending,Cabin_ordinal
0,0001_01,0.0,0,BP,2.0,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,15.0,0.0,6
1,0002_01,2.0,0,FS,2.0,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,12.0,736.0,1
2,0003_01,0.0,0,AS,2.0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,13.0,10383.0,13
3,0003_02,0.0,0,AS,2.0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,12.0,5176.0,13
4,0004_01,2.0,0,FS,2.0,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,17.0,1091.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,0.0,0,AP,0.0,41.0,1,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,17.0,8536.0,14
8689,9278_01,2.0,1,GS,1.0,18.0,0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,15.0,0.0,0
8690,9279_01,2.0,0,GS,2.0,26.0,0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,12.0,1873.0,0
8691,9280_01,0.0,0,ES,0.0,32.0,0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,16.0,4637.0,8


now preform bagging the idea is to have one trained on cabins, one trained on spending, one on age+cryo+VIP, and one based on home planet and destination.

In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score

# ----- Feature builders -----
def split_cabin(df):
    # Cabin like "B/45/P" -> Deck=B, Num=45, Side=P
    c = df['Cabin'].fillna('Unknown/0/Unknown').str.split('/', expand=True)
    c.columns = ['Deck', 'CabNum', 'Side']
    c['CabNum'] = pd.to_numeric(c['CabNum'], errors='coerce')
    return c

def add_spend_features(df):
    spend_cols = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
    X = df[spend_cols].copy()
    X = X.fillna(0)
    X['TotalSpend'] = X.sum(axis=1)
    for col in spend_cols:
        X[f'{col}_ratio'] = X[col] / (X['TotalSpend'] + 1)
    # log1p numeric
    for col in spend_cols + ['TotalSpend']:
        X[col] = np.log1p(X[col])
    return X

def add_age_block(df):
    X = pd.DataFrame(index=df.index)
    X['Age'] = df['Age']
    X['CryoSleep'] = df['CryoSleep'].astype('float')  # True/False -> 1/0
    X['VIP'] = df['VIP'].astype('float')
    # Buckets
    bins = [-1, 12, 18, 40, 60, 200]
    labels = ['child','teen','adult','mid','senior']
    X['AgeBucket'] = pd.cut(df['Age'], bins=bins, labels=labels)
    # Interactions
    X['Cryo_x_VIP'] = X['CryoSleep'] * X['VIP']
    return X

def add_travel_block(df):
    return df[['HomePlanet','Destination']].copy()

# ----- Build per-bag matrices -----
def build_bag_matrices(df):
    # Cabin bag
    cabin = df[['Cabin_ordinal']].copy()

    # Spending bag
    spend_cols = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','total_spending']
    spend = df[spend_cols].copy()

    # Age + Cryo + VIP
    ageblock = df[['Age','CryoSleep','VIP']].copy()

    # HomePlanet + Destination
    travel = df[['HomePlanet','Destination']].copy()

    return cabin, spend, ageblock, travel


# ----- Generic pipeline helpers -----
def make_pipeline_num_cat(num_cols, cat_cols, base_est):
    pre = ColumnTransformer(
        transformers=[
            ('num', Pipeline([
                ('imp', SimpleImputer(strategy='median')),
                ('sc', StandardScaler())
            ]), num_cols),
            ('cat', Pipeline([
                ('imp', SimpleImputer(strategy='most_frequent')),
                ('oh', OneHotEncoder(handle_unknown='ignore'))
            ]), cat_cols),
        ],
        remainder='drop'
    )
    pipe = Pipeline([
        ('pre', pre),
        ('clf', CalibratedClassifierCV(base_est, method='isotonic', cv=3))
    ])
    return pipe

# Example base models (swap in CatBoost/LightGBM if you like)
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

def bag_pipes(cabin, spend, ageblock, travel):
    pipes = []

    # Cabin bag
    num_cab = ['CabNum']
    cat_cab = ['Deck','Side']
    pipes.append( ('cabin_rf', make_pipeline_num_cat(num_cab, cat_cab,
                    RandomForestClassifier(n_estimators=400, max_depth=8, random_state=1))) )

    # Spending bag (mostly numeric)
    num_spend = [c for c in spend.columns if 'ratio' in c or 'Spend' in c or c in
                ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']]
    cat_spend = []
    pipes.append( ('spend_gb', make_pipeline_num_cat(num_spend, cat_spend,
                    GradientBoostingClassifier(random_state=2))) )

    # Age + Cryo + VIP
    num_age = ['Age','CryoSleep','VIP','Cryo_x_VIP']
    cat_age = ['AgeBucket']
    pipes.append( ('age_lr', make_pipeline_num_cat(num_age, cat_age,
                    LogisticRegression(max_iter=2000, class_weight='balanced', random_state=3))) )

    # HomePlanet + Destination
    num_travel = []
    cat_travel = ['HomePlanet','Destination']
    pipes.append( ('travel_rf', make_pipeline_num_cat(num_travel, cat_travel,
                    RandomForestClassifier(n_estimators=400, max_depth=10, random_state=4))) )

    return pipes

# ----- Training with OOF for stacking -----
def train_oof_blend(df_raw, y):
    cabin, spend, ageblock, travel = build_bag_matrices(df_raw)
    pipes = bag_pipes(cabin, spend, ageblock, travel)

    Xbags = {'cabin': cabin, 'spend': spend, 'age': ageblock, 'travel': travel}

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof = pd.DataFrame(index=df_raw.index)
    test_probas = {name: np.zeros(len(df_raw)) for name,_ in pipes}

    for name, pipe in pipes:
        X = Xbags[name.split('_')[0]]
        oof_col = f'proba_{name}'
        oof[oof_col] = np.nan

        for tr, va in kf.split(df_raw, y):
            pipe.fit(X.iloc[tr], y.iloc[tr])
            oof.loc[X.index[va], oof_col] = pipe.predict_proba(X.iloc[va])[:,1]

        # simple refit on all for final preds if needed
        pipe.fit(X, y)
        test_probas[name] = pipe.predict_proba(X)[:,1]

    # Simple soft-vote
    oof['blend_mean'] = oof.filter(like='proba_').mean(axis=1)

    # Meta-model stacker
    meta = LogisticRegression(max_iter=2000)
    meta.fit(oof.filter(like='proba_'), y)
    oof['stack'] = meta.predict_proba(oof.filter(like='proba_'))[:,1]

    return oof, pipes, meta


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

df = pd.read_csv('train.csv')

df['VIP'] = df['VIP'].fillna(False).astype(int)
df['CryoSleep'] = df['CryoSleep'].fillna(False).astype(int)

# Split data into train/test
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Transported']),
    df['Transported'],
    test_size=0.2,
    stratify=df['Transported'],
    random_state=42
)

# Train on train set
oof_train, pipes, meta_model = train_oof_blend(X_train, y_train)

# Prepare the bag matrices for the *test* set
cabin_test, spend_test, ageblock_test, travel_test = build_bag_matrices(X_test)
Xbags_test = {
    'cabin': cabin_test,
    'spend': spend_test,
    'age': ageblock_test,
    'travel': travel_test
}

# Get probability predictions from each bag model on the test set
test_pred_probas = []
for name, pipe in pipes:
    bag_name = name.split('_')[0]
    preds = pipe.predict_proba(Xbags_test[bag_name])[:, 1]
    test_pred_probas.append(preds)

# Stack into DataFrame for meta-model
import pandas as pd
test_probas_df = pd.DataFrame(
    {f'proba_{pipes[i][0]}': test_pred_probas[i] for i in range(len(pipes))}
)

for name, pipe in pipes:
    bag_name = name.split('_')[0]
    preds = pipe.predict(Xbags_test[bag_name])
    acc_bag = accuracy_score(y_test, preds)
    print(f"{name} test accuracy: {acc_bag:.4f}")


# Meta-model predicts final probabilities
final_probas = meta_model.predict_proba(test_probas_df)[:, 1]
final_preds = (final_probas >= 0.5).astype(int)

# Evaluate
acc = accuracy_score(y_test, final_preds)
print(f"Stacked Bagging Accuracy on Test: {acc:.4f}")


cabin_rf test accuracy: 0.6118
spend_gb test accuracy: 0.7924
age_lr test accuracy: 0.7303
travel_rf test accuracy: 0.5900
Stacked Bagging Accuracy on Test: 0.7884


In [77]:
train_df = pd.read_csv('train.csv')

# set up for further filtering
train_df['NameLength'] = train_df['Name'].str.len()
train_df['Cabin'] = train_df['Cabin'].astype(str)
train_df['Cabin'] = train_df['Cabin'].str[0] + train_df['Cabin'].str[-1]
train_df['total_spending'] = train_df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
train_df['VIP'] = train_df['VIP'].fillna(False).astype(int)
train_df['CryoSleep'] = train_df['CryoSleep'].fillna(False).astype(int)

# needed to calculate the distribution of positives to sort catagorical features
transported_true = train_df[train_df['Transported'] == True]

# sort homeplanet catagories by True values percentages
counts = train_df['HomePlanet'].value_counts()
transported_counts = transported_true.groupby('HomePlanet').size()
percentages = transported_counts/counts
homeplanet_map = percentages.sort_values(ascending=False)
planet_map = {planet: rank for rank, planet in enumerate(homeplanet_map.index)}
train_df['HomePlanet'] = train_df['HomePlanet'].map(planet_map)


# sort destination catagories by True values percentages
Destcounts = train_df['Destination'].value_counts()
transDest_counts = transported_true.groupby('Destination').size()
percent_vals = transDest_counts/Destcounts
dest_map = percent_vals.sort_values(ascending=False)
d_map = {planet: rank for rank, planet in enumerate(dest_map.index)}
train_df['Destination'] = train_df['Destination'].map(d_map)

# sort Cabin catagories by True values percentages
# This does not need to be divided by counts as in my previous notebook it is 
# shown that there is less than a 1% differance in size of groups
cabin_counts = train_df[train_df['Transported'] == True].groupby('Cabin').size()
sorted_cabins = cabin_counts.sort_values(ascending=False).index
cabin_mapping = {cabin: rank for rank, cabin in enumerate(sorted_cabins)}
train_df['Cabin_ordinal'] = train_df['Cabin'].map(cabin_mapping).fillna(-1).astype(int)


# Targets
targets = train_df['Transported']


#drop unused Catagories
filtered_X = train_df[['HomePlanet', 'CryoSleep', 'Age','VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','NameLength', 'total_spending', 'Cabin_ordinal']]

filtered_X = filtered_X.fillna(filtered_X.median())

cabin_block = train_df[['Cabin_ordinal']]
spend_block = train_df[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','total_spending']]
age_block = train_df[['Age','CryoSleep','VIP']]
travel_block = train_df[['HomePlanet','Destination']]


train_df


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,NameLength,total_spending,Cabin_ordinal
0,0001_01,0.0,0,BP,2.0,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,15.0,0.0,6
1,0002_01,2.0,0,FS,2.0,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,12.0,736.0,1
2,0003_01,0.0,0,AS,2.0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,13.0,10383.0,13
3,0003_02,0.0,0,AS,2.0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,12.0,5176.0,13
4,0004_01,2.0,0,FS,2.0,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,17.0,1091.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,0.0,0,AP,0.0,41.0,1,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,17.0,8536.0,14
8689,9278_01,2.0,1,GS,1.0,18.0,0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,15.0,0.0,0
8690,9279_01,2.0,0,GS,2.0,26.0,0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,12.0,1873.0,0
8691,9280_01,0.0,0,ES,0.0,32.0,0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,16.0,4637.0,8


In [78]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score
import numpy as np
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


scaler = StandardScaler()
scaled_x = scaler.fit_transform(filtered_X)
scaled_x = pd.DataFrame(scaled_x, columns=filtered_X.columns)

# --- FEATURES AND TARGET ---
X = scaled_x
y = targets

# --- TRAIN / TEST SPLIT ---
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- BAGGING SETUP ---
# For example, use BaggingClassifier with Decision Trees
bagging_model = BaggingClassifier(
    n_estimators=100,       # number of models in the ensemble
    max_samples=0.8,        # fraction of samples for each model
    max_features=1.0,       # fraction of features for each model
    random_state=42
)

# Train the model
bagging_model.fit(X_train, y_train)

# Predict on validation set
y_pred = bagging_model.predict(X_valid)

# Evaluate accuracy
acc = accuracy_score(y_valid, y_pred)
print(f"Validation Accuracy: {acc:.4f}")


# --- OPTIONAL: OOF BLENDING EXAMPLE ---
# Let's say you want to blend RandomForest and GradientBoosting
folds = 5
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

models = [
    RandomForestClassifier(n_estimators=200, max_depth=8, random_state=42),
    #ExtraTreesClassifier(n_estimators=200, random_state=42),
    XGBClassifier(random_state = 20,max_depth = 5,reg_lambda = 10,subsample=1.0,min_child_weight = 5,learning_rate = 0.2,n_estimators =100,reg_alpha=0.1, colsample_bytree = 0.8 ),
    CatBoostClassifier(iterations=200, learning_rate=0.05, random_state=42, verbose=0),
    LogisticRegression(max_iter=2000),
    #KNeighborsClassifier(n_neighbors=5),
    QuadraticDiscriminantAnalysis(reg_param=0.05),
    LinearSVC(max_iter=5000, random_state=42),
    SVC(kernel='rbf', probability=True, random_state=42)
]

n_models = len(models)

oof_preds = np.zeros((X_train.shape[0], n_models))  # assuming 2 models
val_preds = np.zeros((X_valid.shape[0], n_models))



for i, model in enumerate(models):
    oof = np.zeros(X_train.shape[0])
    val_fold_preds = np.zeros((X_valid.shape[0], folds))
    
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_train, y_train)):
        X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
        y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]
        
        model.fit(X_tr, y_tr)
        oof[va_idx] = model.predict(X_va)
        val_fold_preds[:, fold] = model.predict(X_valid)
    
    oof_preds[:, i] = oof
    val_preds[:, i] = val_fold_preds.mean(axis=1)
    model_acc = accuracy_score(y_train, oof)
    print(f"Model {i+1} ({model.__class__.__name__}) OOF Accuracy: {model_acc:.4f}")


# Simple blend: majority vote
from scipy.stats import mode
final_val_pred = mode(val_preds.round(), axis=1).mode.flatten()
print(f"OOF Blend Validation Accuracy: {accuracy_score(y_valid, final_val_pred):.4f}")


Validation Accuracy: 0.7970
Model 1 (RandomForestClassifier) OOF Accuracy: 0.7991
Model 2 (XGBClassifier) OOF Accuracy: 0.7975
Model 3 (CatBoostClassifier) OOF Accuracy: 0.8036
Model 4 (LogisticRegression) OOF Accuracy: 0.7863
Model 5 (QuadraticDiscriminantAnalysis) OOF Accuracy: 0.6865
Model 6 (LinearSVC) OOF Accuracy: 0.7839
Model 7 (SVC) OOF Accuracy: 0.7890
OOF Blend Validation Accuracy: 0.8045


In [79]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier


models = [
    # Random Forest variants
    #RandomForestClassifier(n_estimators=200, max_depth=6, random_state=42),
    #RandomForestClassifier(n_estimators=300, max_depth=8, random_state=52),
    
    # XGBoost variants
    XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42),
    XGBClassifier(n_estimators=300, learning_rate=0.03, max_depth=5, random_state=52),
    
    # LightGBM variants
    LGBMClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42, verbose=0),
    LGBMClassifier(n_estimators=300, learning_rate=0.02, max_depth=6, random_state=52, verbose=0),
    
    # CatBoost variants
    CatBoostClassifier(iterations=200, learning_rate=0.05, depth=4, random_state=42, verbose=0),
    CatBoostClassifier(iterations=300, learning_rate=0.03, depth=5, random_state=52, verbose=0),

]

# Meta-model
meta_model = LogisticRegression(max_iter=2000)


In [80]:

folds = 5
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

n_models = len(models)
oof_preds = np.zeros((X_train.shape[0], n_models))
val_preds = np.zeros((X_valid.shape[0], n_models))

# Train base models and collect OOF predictions
for i, model in enumerate(models):
    oof = np.zeros(X_train.shape[0])
    val_fold_preds = np.zeros((X_valid.shape[0], folds))
    
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_train, y_train)):
        X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
        y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]
        
        model.fit(X_tr, y_tr)
        oof[va_idx] = model.predict(X_va)
        val_fold_preds[:, fold] = model.predict(X_valid)
    
    oof_preds[:, i] = oof
    val_preds[:, i] = val_fold_preds.mean(axis=1)
    
    model_acc = accuracy_score(y_train, oof)
    print(f"Model {i+1} ({model.__class__.__name__}) OOF Accuracy: {model_acc:.4f}")

# Meta-model: train on OOF predictions
meta_model.fit(oof_preds, y_train)
final_val_pred = meta_model.predict(val_preds)
print(f"OOF Stack with Meta-Model Validation Accuracy: {accuracy_score(y_valid, final_val_pred):.4f}")

Model 1 (XGBClassifier) OOF Accuracy: 0.7988
Model 2 (XGBClassifier) OOF Accuracy: 0.8016
Model 3 (LGBMClassifier) OOF Accuracy: 0.7993
Model 4 (LGBMClassifier) OOF Accuracy: 0.8013
Model 5 (CatBoostClassifier) OOF Accuracy: 0.7987
Model 6 (CatBoostClassifier) OOF Accuracy: 0.8014
OOF Stack with Meta-Model Validation Accuracy: 0.8114


In [84]:
test_df = pd.read_csv('test.csv')

# set up for further filtering
test_df['NameLength'] = test_df['Name'].str.len()
test_df['Cabin'] = test_df['Cabin'].astype(str)
test_df['Cabin'] = test_df['Cabin'].str[0] + test_df['Cabin'].str[-1]
test_df['total_spending'] = test_df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
test_df['VIP'] = test_df['VIP'].fillna(False).astype(int)
test_df['CryoSleep'] = test_df['CryoSleep'].fillna(False).astype(int)


# sort homeplanet catagories by True values percentages
test_df['HomePlanet'] = test_df['HomePlanet'].map(planet_map)

test_df['Destination'] = test_df['Destination'].map(d_map)

test_df['Cabin_ordinal'] = test_df['Cabin'].map(cabin_mapping).fillna(-1).astype(int)

#drop unused Catagories
filtered_test_X = test_df[['HomePlanet', 'CryoSleep', 'Age','VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','NameLength', 'total_spending', 'Cabin_ordinal']]

filtered_test_X = filtered_test_X.fillna(filtered_test_X.median())

cabin_block = test_df[['Cabin_ordinal']]
spend_block = test_df[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','total_spending']]
age_block = test_df[['Age','CryoSleep','VIP']]
travel_block = test_df[['HomePlanet','Destination']]

cabin_block


Unnamed: 0,Cabin_ordinal
0,0
1,1
2,5
3,5
4,1
...,...
4272,0
4273,12
4274,11
4275,11


In [86]:
scaled_test_X = scaler.transform(filtered_test_X)
scaled_x = pd.DataFrame(scaled_test_X, columns=filtered_X.columns)

val_preds_new = np.zeros((filtered_test_X.shape[0], n_models))

for i, model in enumerate(models):
    if hasattr(model, "predict_proba"):  # tree-based models
        val_preds_new[:, i] = model.predict_proba(scaled_test_X)[:,1]
    else:  # models like RidgeClassifier or KNN
        val_preds_new[:, i] = model.predict(scaled_test_X)

final_pred_proba = meta_model.predict_proba(val_preds_new)[:,1]
final_pred = final_pred_proba >= 0.5
passenger_ids = test_df['PassengerId']
submission = pd.DataFrame({
        'PassengerId': passenger_ids,
        'Transported': final_pred
    })

submission.to_csv('bagging_model.csv', index=False)

