In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the labelled training split from a path relative to the working directory.
train_csv_path = "../Dataset/train.csv"
data = pd.read_csv(train_csv_path)
data.head()  # preview of the first rows

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Separate the target column from the predictor columns.
TARGET_COLUMN = "Survived"
Y = data[TARGET_COLUMN]
X = data.drop(columns=[TARGET_COLUMN])

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, PowerTransformer, OrdinalEncoder
from sklearn.model_selection import train_test_split

In [5]:
# First pipeline stage: fill missing values, one imputer per column.
# Columns are selected by position in the raw feature frame; going by the step
# names, 4 / 10 / 9 are presumably Age / Embarked / Cabin — TODO confirm
# against the CSV header.
age_imputer = SimpleImputer(strategy='mean', copy=False)
embarked_imputer = SimpleImputer(strategy='most_frequent', copy=False)
# add_indicator=True makes this imputer emit an extra missing-value indicator
# column next to the filled one, which shifts the positions seen downstream.
cabin_imputer = SimpleImputer(strategy='constant', fill_value=0, add_indicator=True)

DropAndImpute = ColumnTransformer(
    transformers=[
        ("AgeImpute", age_imputer, [4]),
        ("EmbarkedImpute", embarked_imputer, [10]),
        ("CabinImpute", cabin_imputer, [9]),
    ],
    remainder="passthrough",
)

In [6]:
from pipelineHelpers import extractTitle, extractAgeCategory, famCategory, ticketCategory, cabinCategory

def get_title_name(self, input_features):
    """Return the output column name used by the title-extraction transformer.

    Both arguments are unused; the two-argument signature is the one expected
    of a ``feature_names_out`` callable handed to ``FunctionTransformer``.

    Returns
    -------
    list of str
        A new single-element list, ``['NameTitle']``.
    """
    output_names = ['NameTitle']
    return output_names

# Wrap the imported title helper so it can act as a ColumnTransformer step
# with a fixed output column name.
titleTransformer = FunctionTransformer(func=extractTitle, feature_names_out=get_title_name)

def get_Age_Category_name(self, input_features):
    """Return the output column name used by the age-category transformer.

    Both arguments are unused; the two-argument signature is the one expected
    of a ``feature_names_out`` callable handed to ``FunctionTransformer``.

    Returns
    -------
    list of str
        A new single-element list, ``['AgeCategory']``.
    """
    output_names = ['AgeCategory']
    return output_names

# Wrap the imported age-bucketing helper so it can act as a ColumnTransformer
# step with a fixed output column name.
ageTransformer = FunctionTransformer(func=extractAgeCategory, feature_names_out=get_Age_Category_name)

def get_family_category(self, input_features):
    """Return the output column name used by the family-size transformer.

    Both arguments are unused; the two-argument signature is the one expected
    of a ``feature_names_out`` callable handed to ``FunctionTransformer``.

    Returns
    -------
    list of str
        A new single-element list, ``['FamSize']``.
    """
    output_names = ['FamSize']
    return output_names

# Wrap the imported family-size helper so it can act as a ColumnTransformer
# step with a fixed output column name.
famNumToCategoryTransformer = FunctionTransformer(func=famCategory, feature_names_out=get_family_category)

def get_ticket_category(self, input_features):
    """Return the output column name used by the ticket-category transformer.

    Both arguments are unused; the two-argument signature is the one expected
    of a ``feature_names_out`` callable handed to ``FunctionTransformer``.

    Returns
    -------
    list of str
        A new single-element list, ``['TicketCat']``.
    """
    output_names = ['TicketCat']
    return output_names

# Wrap the imported ticket helper so it can act as a ColumnTransformer step
# with a fixed output column name.
ticketCategoryTransformer = FunctionTransformer(func=ticketCategory, feature_names_out=get_ticket_category)

def get_cabin_category(self, input_features):
    """Return the output column name used by the cabin-category transformer.

    Both arguments are unused; the two-argument signature is the one expected
    of a ``feature_names_out`` callable handed to ``FunctionTransformer``.

    Returns
    -------
    list of str
        A new single-element list, ``['CabinCategory']``.
    """
    output_names = ['CabinCategory']
    return output_names

# Wrap the imported cabin helper so it can act as a ColumnTransformer step
# with a fixed output column name.
cabinCategoryTransformer = FunctionTransformer(func=cabinCategory, feature_names_out=get_cabin_category)

In [7]:
# Second pipeline stage: derive categorical features from the imputed columns.
# The integer selectors are positions in the output of the preceding pipeline
# stage, not in the raw CSV, so they must be re-checked whenever that stage
# changes its column order.
category_steps = [
    ("TitleExtract", titleTransformer, [6]),
    ("ExtractAgeCategory", ageTransformer, [0]),
    ("FamCategory", famNumToCategoryTransformer, [8, 9]),  # two input columns -> one feature
    ("TicketCategory", ticketCategoryTransformer, [10]),
    ("CabinCategory", cabinCategoryTransformer, [2]),
]

# Columns not consumed above are passed through unchanged.
CategoriesConstruct = ColumnTransformer(transformers=category_steps, remainder='passthrough')

In [15]:
# Final preprocessing stage: one-hot encode the categorical columns and
# power-transform one numeric column. Anything not listed is dropped.
shared_ohe_options = dict(sparse_output=False, dtype=np.int32, drop='first')

general_encoder = OneHotEncoder(handle_unknown='ignore', **shared_ohe_options)
# The ticket-category column gets its own encoder with a cap of 18 categories.
ticket_encoder = OneHotEncoder(
    max_categories=18,
    handle_unknown='infrequent_if_exist',
    **shared_ohe_options,
)
fare_scaler = PowerTransformer(standardize=True, copy=False)

Encoding = ColumnTransformer(
    transformers=[
        ("OHE", general_encoder, [0, 1, 2, 4, 5, 6, 8, 9]),
        ("OHE_Ticket", ticket_encoder, [3]),
        ("Scale_Fare", fare_scaler, [10]),
    ],
    remainder='drop',
)

In [27]:
# Starting hyper-parameters for the classifier (presumably the result of an
# earlier tuning run — the notebook does not show where they came from).
best_params = dict(
    max_depth=5,
    min_child_weight=1,
    gamma=0.01715518069488603,
    subsample=0.9439950277797844,
    colsample_bytree=0.8645354179897717,
    n_estimators=350,
    learning_rate=0.012894989466883576,
    alpha=0.8289901171806625,
)
# 'lambda' is a reserved word and cannot be written as a dict() keyword, so it
# is added separately; it stays the last key, as before.
best_params['lambda'] = 0.0028328551580498323

In [28]:
from xgboost import XGBClassifier
# Build the classifier from the stored hyper-parameters in `best_params`.
# NOTE(review): 'alpha' and 'lambda' are presumably forwarded through XGBoost's
# extra keyword arguments (aliases of reg_alpha / reg_lambda) — confirm.
Xgb = XGBClassifier(**best_params)

In [29]:
# End-to-end model: imputation -> feature construction -> encoding -> XGBoost.
#
# Pandas output is switched on at construction time (set_output returns the
# pipeline itself). That way every later use of `pipe` — cross-validation,
# fitting, prediction — sees DataFrames between stages on a fresh
# top-to-bottom run, independent of when any separate set_output cell runs.
pipe = Pipeline(
    [
        ("DropAndImpute", DropAndImpute),
        ("FeatureConstruction", CategoriesConstruct),
        ("FeatureTransformation", Encoding),
        ("Training", Xgb),
    ]
).set_output(transform="pandas")

In [30]:
import optuna
from sklearn.model_selection import cross_val_score

def xgb_objective(trial):
    """Optuna objective: mean 3-fold cross-validated accuracy for one trial.

    Draws XGBoost hyper-parameters from ``trial``, writes them (together with
    a few fixed settings) onto the ``Training`` step of the module-level
    ``pipe``, and scores that pipeline on the module-level ``X`` / ``Y``.

    ``pipe`` is modified in place, so the settings of the most recent trial
    remain on it after the call.

    Parameters
    ----------
    trial
        Object providing ``suggest_int`` and ``suggest_float`` (an Optuna
        trial in this notebook).

    Returns
    -------
    The ``.mean()`` of the scores returned by ``cross_val_score``.
    """
    # Search space; the suggestion names are the keys Optuna reports back
    # in the study's best parameters.
    tuned_settings = {
        # tree shape / split constraints
        'Training__max_depth': trial.suggest_int('max_depth', 3, 7),
        'Training__min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'Training__gamma': trial.suggest_float('gamma', 1e-3, 5.0, log=True),
        # row / column sub-sampling
        'Training__subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'Training__colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        # boosting schedule
        'Training__n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'Training__learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        # regularisation
        'Training__alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'Training__lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
    }

    # Settings held constant across all trials.
    fixed_settings = {
        'Training__objective': 'binary:logistic',
        'Training__eval_metric': 'logloss',
        'Training__random_state': 42,
        'Training__n_jobs': -1,
    }

    pipe.set_params(**tuned_settings, **fixed_settings)

    fold_scores = cross_val_score(pipe, X, Y, cv=3, scoring='accuracy', n_jobs=-1)
    return fold_scores.mean()

  from .autonotebook import tqdm as notebook_tqdm


In [31]:
# Hyper-parameter search: maximise the cross-validated accuracy returned by
# xgb_objective over 100 trials.
#
# The sampler is created explicitly with a fixed seed so that repeated runs
# draw the same sequence of candidate parameters; without it every run of this
# cell can end on a different "best" configuration.
SEARCH_SEED = 42
study_XGB = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEARCH_SEED),
)
study_XGB.optimize(xgb_objective, n_trials=100)

[32m[I 2026-02-24 20:54:02,390][0m A new study created in memory with name: no-name-7705467c-cbe9-49cf-87bc-4e40a06ba628[0m
[32m[I 2026-02-24 20:54:11,787][0m Trial 0 finished with value: 0.8237934904601572 and parameters: {'max_depth': 3, 'min_child_weight': 4, 'gamma': 1.34853650800973, 'subsample': 0.9313696090378134, 'colsample_bytree': 0.845534071626879, 'n_estimators': 315, 'learning_rate': 0.09074054105035526, 'alpha': 0.5669259767084347, 'lambda': 0.01001470369492167}. Best is trial 0 with value: 0.8237934904601572.[0m
[32m[I 2026-02-24 20:54:15,665][0m Trial 1 finished with value: 0.8350168350168351 and parameters: {'max_depth': 7, 'min_child_weight': 2, 'gamma': 0.1273982465128692, 'subsample': 0.6071878101702602, 'colsample_bytree': 0.9487535332171673, 'n_estimators': 393, 'learning_rate': 0.030703095667606706, 'alpha': 0.6113991199161283, 'lambda': 0.0011123581964859915}. Best is trial 1 with value: 0.8350168350168351.[0m
[32m[I 2026-02-24 20:54:19,386][0m Trial 

In [32]:
# Keep the parameters of the study's best trial for the final fit, and display them.
best_parameters = study_XGB.best_params
best_parameters

{'max_depth': 7,
 'min_child_weight': 6,
 'gamma': 0.06467998684361159,
 'subsample': 0.8847589615063616,
 'colsample_bytree': 0.869109159162538,
 'n_estimators': 479,
 'learning_rate': 0.03850032327710283,
 'alpha': 0.126705611004766,
 'lambda': 0.04463757387925479}

In [33]:
# Display the best trial's parameters; the recorded output is identical to
# `best_parameters` shown in the previous cell.
study_XGB.best_trial.params

{'max_depth': 7,
 'min_child_weight': 6,
 'gamma': 0.06467998684361159,
 'subsample': 0.8847589615063616,
 'colsample_bytree': 0.869109159162538,
 'n_estimators': 479,
 'learning_rate': 0.03850032327710283,
 'alpha': 0.126705611004766,
 'lambda': 0.04463757387925479}

In [35]:
# Apply the tuned values to the classifier step of the pipeline.
# Only the keys in `best_parameters` are overwritten; the fixed settings that
# xgb_objective wrote onto `pipe` (objective, eval_metric, random_state, n_jobs)
# are not part of that dict and are therefore left as they are.
pipe.named_steps["Training"].set_params(**best_parameters)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.869109159162538
,device,
,early_stopping_rounds,
,enable_categorical,False


In [19]:
# Make every transformer in the pipeline return pandas DataFrames.
# NOTE(review): this cell carries execution count 19, i.e. it was run before the
# tuning cells (In [30]-[35]) that now sit above it. On a fresh top-to-bottom
# run it would only take effect after the hyper-parameter study — consider
# moving it directly after the pipeline definition.
pipe.set_output(transform="pandas")

0,1,2
,steps,"[('DropAndImpute', ...), ('FeatureConstruction', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('AgeImpute', ...), ('EmbarkedImpute', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,False
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,False
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,0
,copy,True
,add_indicator,True
,keep_empty_features,False

0,1,2
,transformers,"[('TitleExtract', ...), ('ExtractAgeCategory', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,func,<function ext...002B1940BCE00>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function get...002B1F9CA51C0>
,kw_args,
,inv_kw_args,

0,1,2
,func,<function ext...002B1940BCF40>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function get...002B1940BD260>
,kw_args,
,inv_kw_args,

0,1,2
,func,<function fam...002B1940BD080>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function get...002B1940BD300>
,kw_args,
,inv_kw_args,

0,1,2
,func,<function tic...002B1940BD120>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function get...002B1940BD3A0>
,kw_args,
,inv_kw_args,

0,1,2
,func,<function cab...002B1940BD1C0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function get...002B1940BD440>
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('OHE', ...), ('OHE_Ticket', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.int32'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.int32'>
,handle_unknown,'infrequent_if_exist'
,min_frequency,
,max_categories,18
,feature_name_combiner,'concat'

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8645354179897717
,device,
,early_stopping_rounds,
,enable_categorical,False


In [36]:
from sklearn.metrics import accuracy_score

In [37]:
# Read the unlabelled test split from a path relative to the working directory.
test_csv_path = "../Dataset/test.csv"
test_data = pd.read_csv(test_csv_path)
test_data  # display the frame

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [38]:
# Fit the whole pipeline (preprocessing stages plus the classifier) on all of
# the training data.
pipe.fit(X,Y)

0,1,2
,steps,"[('DropAndImpute', ...), ('FeatureConstruction', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('AgeImpute', ...), ('EmbarkedImpute', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,False
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,False
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,0
,copy,True
,add_indicator,True
,keep_empty_features,False

0,1,2
,transformers,"[('TitleExtract', ...), ('ExtractAgeCategory', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,func,<function ext...002B1940BCE00>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function get...002B1F9CA51C0>
,kw_args,
,inv_kw_args,

0,1,2
,func,<function ext...002B1940BCF40>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function get...002B1940BD260>
,kw_args,
,inv_kw_args,

0,1,2
,func,<function fam...002B1940BD080>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function get...002B1940BD300>
,kw_args,
,inv_kw_args,

0,1,2
,func,<function tic...002B1940BD120>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function get...002B1940BD3A0>
,kw_args,
,inv_kw_args,

0,1,2
,func,<function cab...002B1940BD1C0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function get...002B1940BD440>
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('OHE', ...), ('OHE_Ticket', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.int32'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.int32'>
,handle_unknown,'infrequent_if_exist'
,min_frequency,
,max_categories,18
,feature_name_combiner,'concat'

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.869109159162538
,device,
,early_stopping_rounds,
,enable_categorical,False


In [39]:
# Run the fitted pipeline on the test frame and display the predicted labels.
test_prediction = pipe.predict(test_data)
test_prediction



array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [40]:
# Display the identifier column that will accompany the predictions.
test_data["PassengerId"]

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

In [41]:
# Two-column table: passenger identifier alongside the predicted label.
submission_columns = {
    'PassengerId': test_data["PassengerId"],
    'Survived': test_prediction,
}
prediction = pd.DataFrame(submission_columns)

In [42]:
# Write the prediction table to disk; index=False keeps the row index out of the file.
prediction.to_csv("../Dataset/prediction.csv", index=False)