# Import Modules

In [200]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Load the dataset

In [134]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Data Understanding

In [136]:
train.shape

(8693, 14)

In [137]:
test.shape

(4277, 13)

In [138]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [139]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [140]:
train.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [141]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [142]:
test.isnull().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

In [143]:
(8494/8693)*100


97.71080179454734

In [144]:
(204/891)*100

22.895622895622896

In [145]:
train.head(1)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False


# Data Cleaning And Feature Engineering

In [147]:
def prepare_features(df):
    # Extract group from PassengerId
    df[["Group", "Passenger"]] = df["PassengerId"].str.split("_", expand = True)
    df["Passenger"] = pd.to_numeric(df["Passenger"], errors = "coerce")
    df["Group"] = pd.to_numeric(df["Group"], errors = "coerce")
    
    # Extract deck and side from Cabin
    df[['Deck', 'CabinNum', 'Side']] = df['Cabin'].str.split('/', expand=True)
    df["CabinNum"] = pd.to_numeric(df["CabinNum"], errors = "coerce")
    
    # Create total spend column
    spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    df['TotalSpend'] = df[spend_cols].sum(axis=1)
    
    # Drop original spending columns
    #df.drop(columns=spend_cols + ['Cabin', 'CabinNum', 'Name'], inplace=True)
    df.drop(columns=['Cabin', 'PassengerId', 'Name'], inplace=True)

    group_counts = df["Group"].value_counts()
    df["GroupSize"] = df["Group"].map(group_counts)
    df["IsAlone"] = (df["GroupSize"] == 1)

    df['CryoSleep'] = df['CryoSleep'].map({True: 1, False: 0})
    df['VIP'] = df['VIP'].map({True: 1, False: 0})
    df['Side'] = df['Side'].map({'P': 0, 'S': 1})
    df['IsAlone'] = df['IsAlone'].map({True: 1, False: 0})
    
    return df


In [148]:
train = prepare_features(train)
test = prepare_features(test)

In [149]:
train.dtypes

HomePlanet       object
CryoSleep       float64
Destination      object
Age             float64
VIP             float64
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Transported        bool
Group             int64
Passenger         int64
Deck             object
CabinNum        float64
Side            float64
TotalSpend      float64
GroupSize         int64
IsAlone           int64
dtype: object

# Data Preprocessing

In [151]:
# Define your column groups
num_cols = ['CabinNum', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
cat_cols_mode = ['CryoSleep', 'VIP', 'Side', 'HomePlanet', 'Destination', 'Deck']

# Numerical transformer: median imputation + scaling
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Numerical transformer (for spending cols where missing = 0)
num_zero_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())
])

# Categorical transformer: mode imputation + one-hot encoding
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

# Combine everything in ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, ['CabinNum']),                 # median fill
        ('num_zero', num_zero_transformer, ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']),  # fill 0
        ('cat', cat_transformer, cat_cols_mode)                 # mode fill + OHE
    ])

# Model Building

In [153]:
X = train.drop(columns=['Transported'])
y = train['Transported'] # target

In [154]:
X.columns

Index(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Group', 'Passenger',
       'Deck', 'CabinNum', 'Side', 'TotalSpend', 'GroupSize', 'IsAlone'],
      dtype='object')

In [155]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size= 0.2, random_state=42)


In [156]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state=42))
])


In [157]:
model.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('num_zero', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,0
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [158]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_valid)
print("Validation Accuracy:", accuracy_score(y_valid, y_pred))


Validation Accuracy: 0.7849338700402531


In [159]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the whole training set
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

print("Cross-validation scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


Cross-validation scores: [0.74985624 0.7239793  0.79355952 0.81875719 0.77848101]
Mean CV Accuracy: 0.7729266518924478


# Hyper Parameter Tunning

# =====================
# XGBoost (optional)

In [250]:
xgb_pipe = Pipeline([('preprocessor', preprocessor),
                         ('model', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))])
xgb_params = {
    'model__n_estimators': [200, 500, 800, 2000],
    'model__learning_rate': [0.01, 0.03, 0.05, 0.1],
    'model__max_depth': [3, 4, 5, 6, 8],
    'model__subsample': [0.7, 0.8, 0.9, 1.0],
    'model__colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'model__gamma': [0, 1, 3, 5]
}

grid_xgb = GridSearchCV(xgb_pipe, xgb_params, cv=5, scoring='accuracy', n_jobs=-1)
grid_xgb.fit(X_train, y_train)

0,1,2
,estimator,"Pipeline(step...=None, ...))])"
,param_grid,"{'model__learning_rate': [0.01, 0.03, ...], 'model__max_depth': [3, 4, ...], 'model__n_estimators': [200, 500, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('num_zero', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,0
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


# =====================
# Random Forest

In [222]:
# =====================
# Random Forest
rf_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42))
])
rf_params = {
    'model__n_estimators': [200, 500, 800, 1200],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['sqrt', 'log2', None]
}

grid_rf = GridSearchCV(rf_pipe, rf_params, cv=5, scoring='accuracy', n_jobs=-1)
grid_rf.fit(X_train, y_train)


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__max_depth': [None, 5, ...], 'model__min_samples_leaf': [1, 2], 'model__min_samples_split': [2, 5], 'model__n_estimators': [100, 200, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('num_zero', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,0
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,10
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


# =====================
# Decision Tree

In [227]:
dt_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeClassifier(random_state=42))
])
dt_params = {
    'model__max_depth': [None, 5, 10, 20],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}
grid_dt = GridSearchCV(dt_pipe, dt_params, cv=5, scoring='accuracy', n_jobs=-1)
grid_dt.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__max_depth': [None, 5, ...], 'model__min_samples_leaf': [1, 2, ...], 'model__min_samples_split': [2, 5, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('num_zero', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,0
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


# =====================
# Logistic Regression

In [229]:
lr_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(random_state=42, max_iter=500))
])
lr_params = {
    'model__penalty': ['l2'],
    'model__C': [0.01, 0.1, 1, 10],
    'model__solver': ['lbfgs', 'liblinear']
}
grid_lr = GridSearchCV(lr_pipe, lr_params, cv=5, scoring='accuracy', n_jobs=-1)
grid_lr.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__C': [0.01, 0.1, ...], 'model__penalty': ['l2'], 'model__solver': ['lbfgs', 'liblinear']}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('num_zero', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,0
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,500


# =====================
# Gradient Boosting

In [233]:
gb_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', GradientBoostingClassifier(random_state=42))
])
gb_params = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.05, 0.1, 0.2],
    'model__max_depth': [3, 4, 5],
    'model__subsample': [0.8, 1.0]
}
grid_gb = GridSearchCV(gb_pipe, gb_params, cv=5, scoring='accuracy', n_jobs=-1)
grid_gb.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__learning_rate': [0.05, 0.1, ...], 'model__max_depth': [3, 4, ...], 'model__n_estimators': [100, 200], 'model__subsample': [0.8, 1.0]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('num_zero', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,0
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,4
,min_impurity_decrease,0.0


# =====================
# Function to Evaluate Models

In [252]:
def evaluate_model(grid, model_name, X, y, X_valid, y_valid):
    best_model = grid.best_estimator_
    val_score = cross_val_score(best_model, X, y, cv=5).mean()
    test_score = accuracy_score(y_valid, best_model.predict(X_valid))
    
    gap = val_score - test_score
    if gap > 0.05 and val_score >= 0.85:
        fit_msg = "🚨 Overfitting: Validation much better than Test."
    elif val_score < 0.80 and test_score < 0.80:
        fit_msg = "⚠️ Underfitting: Model too simple, low on both."
    elif abs(gap) <= 0.05 and test_score >= 0.80:
        fit_msg = "✅ Good Fit: Validation and Test are close and high."
    else:
        fit_msg = "ℹ️ Borderline — needs more tuning."
    
    print(f"\n--- {model_name} ---")
    print("Best Params:", grid.best_params_)
    print("Validation Accuracy:", round(val_score, 3))
    print("Test Accuracy:", round(test_score, 3))
    print("Fit Assessment:", fit_msg)
    
    return {
        'Model': model_name,
        'Validation_Accuracy': round(val_score, 3),
        'Test_Accuracy': round(test_score, 3),
        'Fit_Assessment': fit_msg
    }

# Store model name and fitted GridSearchCV object

In [255]:
grids = [
    ("Random Forest", grid_rf),
    ("Decision Tree", grid_dt),
    ("Logistic Regression", grid_lr),
    ("Gradient Boosting", grid_gb),
    ("XGBoost", grid_xgb)
]

results = []

# Loop through each model and collect results
for name, grid in grids:
    res = evaluate_model(grid, name, X_train, y_train, X_valid, y_valid)
    results.append(res)

# Convert results to DataFrame
df_results = pd.DataFrame(results)

#print("\n===== Summary DataFrame =====")
#print(df_results)



--- Random Forest ---
Best Params: {'model__max_depth': 10, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 200}
Validation Accuracy: 0.805
Test Accuracy: 0.795
Fit Assessment: ℹ️ Borderline — needs more tuning.

--- Decision Tree ---
Best Params: {'model__max_depth': 5, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2}
Validation Accuracy: 0.771
Test Accuracy: 0.766
Fit Assessment: ⚠️ Underfitting: Model too simple, low on both.

--- Logistic Regression ---
Best Params: {'model__C': 10, 'model__penalty': 'l2', 'model__solver': 'lbfgs'}
Validation Accuracy: 0.794
Test Accuracy: 0.78
Fit Assessment: ⚠️ Underfitting: Model too simple, low on both.

--- Gradient Boosting ---
Best Params: {'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__n_estimators': 100, 'model__subsample': 1.0}
Validation Accuracy: 0.81
Test Accuracy: 0.793
Fit Assessment: ℹ️ Borderline — needs more tuning.

--- XGBoost ---
Best Params: {'model__learning_rate':

In [257]:
df_results

Unnamed: 0,Model,Validation_Accuracy,Test_Accuracy,Fit_Assessment
0,Random Forest,0.805,0.795,ℹ️ Borderline — needs more tuning.
1,Decision Tree,0.771,0.766,"⚠️ Underfitting: Model too simple, low on both."
2,Logistic Regression,0.794,0.78,"⚠️ Underfitting: Model too simple, low on both."
3,Gradient Boosting,0.81,0.793,ℹ️ Borderline — needs more tuning.
4,XGBoost,0.812,0.792,ℹ️ Borderline — needs more tuning.


# Prediction And Do Submission

In [282]:
X_test_final = test.drop(columns=['PassengerId'], errors='ignore')


In [284]:
test.columns

Index(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Group', 'Passenger',
       'Deck', 'CabinNum', 'Side', 'TotalSpend', 'GroupSize', 'IsAlone'],
      dtype='object')

In [287]:
# Predict using the best XGBoost pipeline
best_xgb_model = grid_xgb.best_estimator_  # includes preprocessor
test_predictions = best_xgb_model.predict(X_test_final)

In [295]:
# Example: create a submission DataFrame
submission = pd.DataFrame({
    'PassengerId': pd.read_csv('test.csv')['PassengerId'],
    'Transported': test_predictions.astype(bool)
})

In [297]:
# Save submission
submission.to_csv('submission.csv', index=False)
print("✅ Predictions saved to submission.csv")

✅ Predictions saved to submission.csv


# Another Process

In [None]:
# submission_df = pd.read_csv('sample_submission.csv')

# submission_df['Transported'] = test_predictions.astype(bool)

# submission_df

# Save submission
# submission.to_csv('submission.csv', index=False)
# print("✅ Predictions saved to submission.csv")