In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import VotingClassifier, StackingClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the three Kaggle Titanic CSVs in one pass.
# NOTE: `submission` is rebound by the final cell before it is ever read.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
        '/kaggle/input/titanic/gender_submission.csv',
    )
)

In [3]:
# Sanity-check the (rows, columns) of both splits.
train_shape, test_shape = train.shape, test.shape
print("train shape:", train_shape, "test shape:", test_shape)

train shape: (891, 12) test shape: (418, 11)


In [4]:
# Peek at the first five raw training rows.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Summary statistics for every column (numeric and non-numeric alike).
train_summary = train.describe(include='all')
train_summary

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [6]:
# Mean of the 0/1 Survived column = fraction of passengers who survived.
overall_survival_rate = train['Survived'].mean()
print("Overall survival rate:", overall_survival_rate)

Overall survival rate: 0.3838383838383838


In [7]:
# The title is the text between the comma after the surname and the next period.
title_match = train['Name'].str.extract(r',\s*([^\.]+)\.')
title = title_match.iloc[:, 0].str.strip()
title.value_counts()

0
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: count, dtype: int64

In [8]:
# Frequency of each embarkation port code in the training data.
embarked_counts = train['Embarked'].value_counts()
embarked_counts

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [9]:
def _add_derived_columns(raw):
    """Return a copy of ``raw`` with Title, FamilySize, IsAlone and Deck added."""
    out = raw.copy()

    # Extract the title from the name: the text between the comma and the next period.
    title = out['Name'].str.extract(r',\s*([^\.]+)\.').iloc[:,0].str.strip()
    title = title.replace({'Mlle':'Miss','Ms':'Miss','Mme':'Mrs','Sir':'Mr','Lady':'Miss'})
    # Anything outside the four common titles is collapsed into a single 'Rare' bucket.
    common_titles = ['Mr','Miss','Mrs','Master']
    out['Title'] = title.where(title.isin(common_titles), 'Rare')

    # Family size / is alone
    out['FamilySize'] = out['SibSp'] + out['Parch'] + 1
    out['IsAlone'] = (out['FamilySize'] == 1).astype(int)

    # Deck from Cabin (first letter). Many missing -> use 'U' for unknown
    out['Cabin'] = out['Cabin'].fillna('U')
    out['Deck'] = out['Cabin'].str[0]
    return out


def _fit_fill_values(frame):
    """Collect the imputation statistics (mode / group medians) from ``frame``."""
    return {
        'embarked': frame['Embarked'].mode()[0],
        'fare_by_class': frame.groupby('Pclass')['Fare'].median(),
        'age_by_group': frame.groupby(['Title','Pclass'])['Age'].median().rename('GroupAge'),
    }


def _apply_fill_values(frame, fills):
    """Fill missing Embarked, Fare and Age in ``frame`` (in place) from ``fills``."""
    frame['Embarked'] = frame['Embarked'].fillna(fills['embarked'])
    # Fare: median of the passenger's Pclass.
    frame['Fare'] = frame['Fare'].fillna(frame['Pclass'].map(fills['fare_by_class']))
    # Age: median of the passenger's (Title, Pclass) group; join keeps frame's index.
    group_age = frame[['Title','Pclass']].join(
        fills['age_by_group'], on=['Title','Pclass']
    )['GroupAge']
    frame['Age'] = frame['Age'].fillna(group_age)
    return frame


def _quartile_bands(values, fit_values, labels):
    """Bin ``values`` with quantile edges computed on ``fit_values``.

    The outermost edges are opened to +/- infinity so values outside the range
    of ``fit_values`` still land in the first / last band instead of NaN.
    """
    _, edges = pd.qcut(fit_values, len(labels), retbins=True)
    edges = list(edges)
    edges[0], edges[-1] = float('-inf'), float('inf')
    return pd.cut(values, bins=edges, labels=labels)


def preprocess(df, isTrain=True, reference=None):
    """Engineer the model features from a raw Titanic passenger frame.

    Adds Title, FamilySize, IsAlone and Deck, fills missing Embarked / Fare /
    Age, and derives quartile codes AgeBand and FareBand (labels 0-3).

    Args:
        df: raw frame with Name, SibSp, Parch, Cabin, Embarked, Fare, Pclass,
            Age and Sex, plus Survived (isTrain=True) or PassengerId
            (isTrain=False). It is copied, never modified.
        isTrain: selects which extra column is returned next to the features.
        reference: optional raw frame whose statistics (Embarked mode, Fare and
            Age medians, quartile edges) are used instead of ``df``'s own.
            Pass the training frame when preprocessing the test frame so both
            are imputed and binned with the same values. With the default
            ``None`` the statistics come from ``df`` itself.

    Returns:
        A frame with the feature columns followed by 'Survived' or
        'PassengerId'.
    """
    df = _add_derived_columns(df)
    ref = df if reference is None else _add_derived_columns(reference)

    # Impute the reference first: its filled values feed the fallback median
    # and the band edges. When ref is df this also imputes df.
    fills = _fit_fill_values(ref)
    ref = _apply_fill_values(ref, fills)
    if ref is not df:
        df = _apply_fill_values(df, fills)

    # if still null (rare), fill with overall median
    age_fallback = ref['Age'].median()
    ref['Age'] = ref['Age'].fillna(age_fallback)
    df['Age'] = df['Age'].fillna(age_fallback)

    # AgeBand, FareBand
    band_labels = [0,1,2,3]
    df['AgeBand'] = _quartile_bands(df['Age'], ref['Age'], band_labels)
    df['FareBand'] = _quartile_bands(df['Fare'], ref['Fare'], band_labels)

    keep_features = [
        'Pclass', 'Sex', 'Age', 'Fare', 'Embarked',
        'Title', 'FamilySize', 'IsAlone', 'Deck',
        'AgeBand', 'FareBand'
    ]
    extra_column = 'Survived' if isTrain else 'PassengerId'
    return df[keep_features + [extra_column]]

In [10]:
# Build the engineered frames for both splits.
# NOTE(review): each call derives its fill values and quartile cut points from
# the frame it is given, so the test split is imputed/binned with test-set
# statistics rather than the training ones — confirm this is intended.
train_p = preprocess(train, isTrain=True)
test_p = preprocess(test, isTrain=False)

In [11]:
# First five engineered training rows (features + Survived).
train_p.head(n=5)

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize,IsAlone,Deck,AgeBand,FareBand,Survived
0,3,male,22.0,7.25,S,Mr,2,0,U,1,0,0
1,1,female,38.0,71.2833,C,Mrs,2,0,C,3,3,1
2,3,female,26.0,7.925,S,Miss,1,1,U,1,1,1
3,1,female,35.0,53.1,S,Mrs,2,0,C,2,3,1
4,3,male,35.0,8.05,S,Mr,1,1,U,2,1,0


In [12]:
# First five engineered test rows (features + PassengerId).
test_p.head(n=5)

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize,IsAlone,Deck,AgeBand,FareBand,PassengerId
0,3,male,34.5,7.8292,Q,Mr,1,1,U,2,0,892
1,3,female,47.0,7.0,S,Mrs,2,0,U,3,0,893
2,2,male,62.0,9.6875,Q,Mr,1,1,U,3,1,894
3,3,male,27.0,8.6625,S,Mr,1,1,U,2,1,895
4,3,female,22.0,12.2875,S,Mrs,3,0,U,0,1,896


In [13]:
# Columns fed to the models, and the label column.
features = ['Pclass','Sex','Age','Fare','Embarked','Title','FamilySize','IsAlone','Deck','AgeBand','FareBand']
target = 'Survived'

# Work on copies so nothing below writes back into train_p / test_p.
X, y = train_p[features].copy(), train_p[target].copy()
X_test = test_p[features].copy()
test_passenger_ids = test_p['PassengerId']

# How each feature is routed through the ColumnTransformer below.
num_features = ['Age','Fare','FamilySize','AgeBand','FareBand']
cat_features = ['Pclass','Sex','Embarked','Title','IsAlone','Deck']

# Preprocessing pipeline.
# Numeric columns: median imputation, then standardisation.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: most-frequent imputation, then one-hot encoding;
# categories unseen at fit time are ignored at transform time.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
])

In [14]:
# One shuffled, seeded 5-fold stratified splitter shared by every CV run below.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def tune_model(model, param_dist, X, y, n_iter=20):
    """Run a randomized hyper-parameter search for ``model``.

    ``model`` is wrapped as step 'clf' behind the module-level ``preprocessor``
    (step 'pre'), so keys in ``param_dist`` need the ``clf__`` prefix. The
    search scores by accuracy on the module-level ``cv`` splitter.

    Returns:
        (best_estimator_, best_score_, best_params_) of the fitted search.
    """
    search = RandomizedSearchCV(
        Pipeline(steps=[('pre', preprocessor), ('clf', model)]),
        param_distributions=param_dist,
        n_iter=n_iter,
        scoring="accuracy",
        cv=cv,
        n_jobs=-1,
        random_state=42,
    ).fit(X, y)
    return search.best_estimator_, search.best_score_, search.best_params_

In [15]:
#model 1
# Candidate values for the random-forest search (keys target the 'clf' step).
rf_search_space = {
    "clf__n_estimators": [100, 200, 500],
    "clf__max_depth": [None, 5, 10, 20],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 4],
    "clf__max_features": ["sqrt", "log2", None]
}
rf_estimator = RandomForestClassifier(random_state=42)
# rf_params holds the winning parameter set returned by the search.
best_rf, rf_mean, rf_params = tune_model(rf_estimator, rf_search_space, X, y)
print(f"RandomForest best CV accuracy: {rf_mean:.4f}, params={rf_params}")

RandomForest best CV accuracy: 0.8384, params={'clf__n_estimators': 500, 'clf__min_samples_split': 10, 'clf__min_samples_leaf': 4, 'clf__max_features': 'log2', 'clf__max_depth': 20}


In [16]:
#model 2
# Candidate values for the gradient-boosting search (keys target the 'clf' step).
gb_search_space = {
    "clf__n_estimators": [100, 200, 500],
    "clf__learning_rate": [0.01, 0.05, 0.1],
    "clf__max_depth": [3, 5, 7],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 4],
    "clf__subsample": [0.8, 1.0]
}
gb_estimator = GradientBoostingClassifier(random_state=42)
# gb_params holds the winning parameter set returned by the search.
best_gb, gb_mean, gb_params = tune_model(gb_estimator, gb_search_space, X, y)
print(f"GradientBoosting best CV accuracy: {gb_mean:.4f}, params={gb_params}")

GradientBoosting best CV accuracy: 0.8395, params={'clf__subsample': 0.8, 'clf__n_estimators': 200, 'clf__min_samples_split': 10, 'clf__min_samples_leaf': 4, 'clf__max_depth': 5, 'clf__learning_rate': 0.01}


In [20]:
#model 3
# Only solver/penalty pairs that LogisticRegression accepts are searched:
# lbfgs does not support the l1 penalty, so sampling it would only produce
# failed fits (hidden here because warnings are globally ignored).
lr_c_values = [0.01, 0.1, 1, 10, 100]
lr_search_space = [
    {
        "clf__C": lr_c_values,
        "clf__penalty": ["l1", "l2"],
        "clf__solver": ["liblinear", "saga"],
    },
    {
        "clf__C": lr_c_values,
        "clf__penalty": ["l2"],
        "clf__solver": ["lbfgs"],
    },
]
# lr_params holds the winning parameter set returned by the search.
best_lr, lr_mean, lr_params = tune_model(LogisticRegression(max_iter=1000), lr_search_space, X, y)
print(f"LogisticRegression best CV accuracy: {lr_mean:.4f}, params={lr_params}")



LogisticRegression best CV accuracy: 0.8350, params={'clf__solver': 'liblinear', 'clf__penalty': 'l1', 'clf__C': 1}




In [22]:
# LightGBM classifier with fixed tree and bagging settings; only the learning
# rate is searched below.
base_lgb = lgb.LGBMClassifier(
    objective="binary",
    # "accuracy" is not a LightGBM metric name; binary_error is its
    # classification-error metric for binary objectives.
    metric="binary_error",
    boosting_type="gbdt",
    num_leaves=31,
    max_depth=-1,
    min_data_in_leaf=20,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    random_state=42,
    n_estimators=1000
)

lgb_search_space = {
    "clf__learning_rate": [0.01, 0.05, 0.1, 0.2]
}

# lgb_params holds the winning parameter set returned by the search.
best_lgb, lgb_mean, lgb_params = tune_model(base_lgb, lgb_search_space, X, y)
print(f"LightGBM best CV accuracy: {lgb_mean:.4f}, params={lgb_params}")

LightGBM best CV accuracy: 0.8451, params={'clf__learning_rate': 0.01}


In [23]:
def evaluate_model(model, X, y):
    """Cross-validate ``model`` behind the shared preprocessor.

    Uses the module-level ``preprocessor`` and ``cv`` splitter and scores by
    accuracy.

    Returns:
        (mean of fold scores, std of fold scores, the pipeline that was scored).
    """
    candidate = Pipeline(steps=[('pre', preprocessor), ('clf', model)])
    fold_scores = cross_val_score(candidate, X, y, scoring='accuracy', cv=cv)
    return fold_scores.mean(), fold_scores.std(), candidate

In [25]:
#voting

# Soft-voting ensemble built from the classifier step of each tuned pipeline.
voting_members = [
    (label, tuned.named_steps['clf'])
    for label, tuned in (
        ("rf", best_rf),
        ("gb", best_gb),
        ("lr", best_lr),
        ("lgb", best_lgb),
    )
]
voting_clf = VotingClassifier(estimators=voting_members, voting="soft")
voting_mean, voting_std, voting_pipe = evaluate_model(voting_clf, X, y)
print(f"Voting CV accuracy: {voting_mean:.4f} ± {voting_std:.4f}")

Voting CV accuracy: 0.8474 ± 0.0121


In [26]:
# Stacking ensemble: the four tuned classifiers feed a logistic-regression
# final estimator; the shared `cv` splitter is used inside the stack as well.
stacking_members = [
    (label, tuned.named_steps['clf'])
    for label, tuned in (
        ("rf", best_rf),
        ("gb", best_gb),
        ("lr", best_lr),
        ("lgb", best_lgb),
    )
]
stacking_clf = StackingClassifier(
    estimators=stacking_members,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=cv,
    n_jobs=-1,
)
stacking_mean, stacking_std, stacking_pipe = evaluate_model(stacking_clf, X, y)
print(f"Stacking CV accuracy: {stacking_mean:.4f} ± {stacking_std:.4f}")

Stacking CV accuracy: 0.8473 ± 0.0122


In [29]:
# (model name, mean CV accuracy): the four tuned single models, then the two ensembles.
results = [
    ("RandomForest", rf_mean),
    ("GradientBoosting", gb_mean),
    ("LogisticRegression", lr_mean),
    ("LightGBM", lgb_mean),
    ("Voting", voting_mean),
    ("Stacking", stacking_mean),
]

# Rank the models from best to worst mean accuracy.
results_df = (
    pd.DataFrame(results, columns=["Model", "CV Mean Accuracy"])
    .sort_values(by="CV Mean Accuracy", ascending=False)
    .reset_index(drop=True)
)

results_df

Unnamed: 0,Model,CV Mean Accuracy
0,Voting,0.847354
1,Stacking,0.847348
2,LightGBM,0.845101
3,GradientBoosting,0.839483
4,RandomForest,0.838384
5,LogisticRegression,0.835007


In [32]:
#use voting ensemble model
# Rebuild the soft-voting pipeline and fit it on the full training data.
final_members = [
    (label, tuned.named_steps['clf'])
    for label, tuned in (
        ("rf", best_rf),
        ("gb", best_gb),
        ("lr", best_lr),
        ("lgb", best_lgb),
    )
]
final_voting_clf = VotingClassifier(estimators=final_members, voting="soft")
final_voting_pipe = Pipeline(steps=[("pre", preprocessor), ("clf", final_voting_clf)])
final_voting_pipe.fit(X, y)

# Predict on the test set.
test_pred = final_voting_pipe.predict(X_test)

# Write the predictions next to their passenger ids as the submission CSV.
submission = pd.DataFrame({"PassengerId": test_passenger_ids, "Survived": test_pred})
submission.to_csv("submission.csv", index=False)
