# Titanic Survival Prediction

## 1. Import Libraries and Load Data

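First, let's import the necessary libraries and set a few global options (random seed, plotting style, pandas display width). The training and testing datasets are loaded at the start of the next section.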

In [None]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import display

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    StackingClassifier
)
from sklearn.linear_model import LogisticRegression

# Reproducibility: one seed, also applied to NumPy's global generator.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Display preferences: show up to 200 columns of a frame, seaborn grid theme.
pd.set_option("display.max_columns", 200)
sns.set(style="whitegrid")


## 2. Exploratory Data Analysis (EDA)

Now, let's load the training and testing datasets and explore them to understand their structure, find patterns, and identify missing values.

In [None]:

# Read both CSV files from the current working directory.
train_df, test_df = pd.read_csv("train.csv"), pd.read_csv("test.csv")

print(f"Train shape : {train_df.shape}")
print(f"Test shape  : {test_df.shape}")

# First rows, then summary statistics over every column (numeric or not).
for preview in (train_df.head(), train_df.describe(include="all")):
    display(preview)


Train shape : (891, 12)
Test shape  : (418, 11)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


## 3. Data Cleaning & Feature Engineering

Based on our EDA, we'll clean the data by handling missing values and create new features to improve our model's performance.

In [None]:

def engineer_datasets(train_df: pd.DataFrame, test_df: pd.DataFrame):
    """Build model-ready feature frames from the raw Titanic train/test tables.

    Train and test rows are concatenated so that imputation medians, quantile
    bins, ranks and group counts are computed over all passengers, then split
    back into their original partitions. Smoothed target-rate encodings are
    added afterwards from the training labels only. The input frames are
    copied and never modified.

    Args:
        train_df: Raw training table. Must contain ``Survived`` plus the
            columns read below: ``PassengerId``, ``Pclass``, ``Name``, ``Sex``,
            ``Age``, ``SibSp``, ``Parch``, ``Ticket``, ``Fare``, ``Cabin`` and
            ``Embarked``.
        test_df: Raw test table with the same columns except ``Survived``.

    Returns:
        A 4-tuple ``(train_features, test_features, y, passenger_ids)``:
        the engineered training frame (without ``Survived``), the engineered
        test frame, the integer target aligned with the training frame, and
        the ``PassengerId`` values of the test rows.
    """
    train = train_df.copy()
    test = test_df.copy()

    y = train["Survived"].astype(int)
    train_features = train.drop(columns=["Survived"])
    test_features = test.copy()
    full = pd.concat([train_features, test_features], axis=0, ignore_index=True)

    # --- Name-derived features -------------------------------------------
    # expand=False returns a Series (not a one-column DataFrame), which is
    # what a single-column assignment expects.
    full["Title"] = full["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False).fillna("Unknown")
    title_mapping = {
        "Mr": "Mr", "Mrs": "Mrs", "Miss": "Miss", "Master": "Master",
        "Ms": "Miss", "Mlle": "Miss", "Mme": "Mrs",
        "Lady": "Rare", "Countess": "Rare", "Capt": "Rare", "Col": "Rare",
        "Don": "Rare", "Dr": "Rare", "Major": "Rare", "Rev": "Rare",
        "Sir": "Rare", "Jonkheer": "Rare", "Dona": "Rare"
    }
    # Any title missing from the mapping (including "Unknown") becomes "Rare".
    full["Title"] = full["Title"].map(lambda t: title_mapping.get(t, "Rare"))

    full["NameLength"] = full["Name"].str.len()
    full["NameWords"] = full["Name"].str.split().map(len)
    full["HasParenthesis"] = full["Name"].str.contains(r"\(", regex=True).astype(int)

    # --- Embarked / Fare imputation --------------------------------------
    full["Embarked"] = full["Embarked"].fillna(full["Embarked"].mode()[0])

    full["Fare"] = full["Fare"].astype(float)
    # Missing fares take the median of their class, then the global median.
    fare_by_class = full.groupby("Pclass")["Fare"].transform("median")
    full["Fare"] = full["Fare"].fillna(fare_by_class).fillna(full["Fare"].median())

    # --- Family and ticket-group features --------------------------------
    full["FamilySize"] = full["SibSp"] + full["Parch"] + 1
    full["IsAlone"] = (full["FamilySize"] == 1).astype(int)
    full["FamilySizeSquared"] = full["FamilySize"] ** 2

    # Number of passengers (train + test) sharing the same ticket string.
    full["TicketGroupSize"] = full.groupby("Ticket")["Ticket"].transform("count")
    full["SharedTicket"] = (full["TicketGroupSize"] > 1).astype(int)

    # --- Fare transformations --------------------------------------------
    fare_per_person = (full["Fare"] / np.maximum(full["FamilySize"], 1)).replace([np.inf, -np.inf], np.nan)
    full["FarePerPerson"] = fare_per_person.fillna(fare_per_person.median())
    full["FareLog"] = np.log1p(full["Fare"])
    full["FareRank"] = full["Fare"].rank(pct=True)

    full["LastName"] = full["Name"].str.split(",").str[0].str.strip()
    full["FamilyId"] = full["LastName"] + "_" + full["FamilySize"].astype(str)

    # --- Cabin features ("M" marks a missing cabin) ----------------------
    full["CabinDeck"] = full["Cabin"].str[0].fillna("M")
    full["CabinKnown"] = full["Cabin"].notna().astype(int)
    full["CabinCount"] = full["Cabin"].fillna("").str.split().map(len)
    full["MultipleCabins"] = (full["CabinCount"] > 1).astype(int)
    # First run of digits in the cabin string; 0 when there is none.
    full["CabinNumber"] = (
        full["Cabin"]
        .str.extract(r"(\d+)", expand=False)
        .fillna("0")
        .astype(int)
    )
    full["CabinOdd"] = (full["CabinNumber"] % 2 == 1).astype(int)

    # --- Age imputation and age-derived features -------------------------
    # Missing ages take the median of their (Title, Pclass) group, then the
    # global median for groups without any known age.
    age_group_med = full.groupby(["Title", "Pclass"])["Age"].transform("median")
    full["Age"] = full["Age"].fillna(age_group_med).fillna(full["Age"].median())

    age_bins = [0, 12, 18, 25, 35, 45, 55, 65, np.inf]
    age_labels = ["0-12", "12-18", "18-25", "25-35", "35-45", "45-55", "55-65", "65+"]
    # right=False: each bin is closed on the left, e.g. [12, 18).
    age_bin = pd.cut(full["Age"], bins=age_bins, labels=age_labels, right=False)
    full["AgeBin"] = age_bin.astype(str).replace("nan", "Missing")

    full["IsChild"] = (full["Age"] < 12).astype(int)
    full["IsTeen"] = ((full["Age"] >= 12) & (full["Age"] < 18)).astype(int)
    full["IsMother"] = (
        (full["Sex"] == "female")
        & (full["Parch"] > 0)
        & (full["Age"] >= 18)
        & (full["Title"] == "Mrs")
    ).astype(int)
    full["Age*Class"] = full["Age"] * full["Pclass"]
    full["Fare*Class"] = full["Fare"] * full["Pclass"]
    full["SibSp*Parch"] = full["SibSp"] * full["Parch"]

    # --- Ticket string features ------------------------------------------
    full["TicketLetters"] = full["Ticket"].str.replace("[^A-Za-z]", "", regex=True).str.upper().replace("", "NONE")
    # Keep only the trailing run of digits as the ticket number. Stripping
    # every non-digit instead would glue prefix digits onto the number
    # ("A/5 21171" would become "521171"). Tickets without a trailing number
    # fall back to "0".
    full["TicketDigits"] = full["Ticket"].str.extract(r"(\d+)\s*$", expand=False).fillna("0")
    full["TicketLettersLen"] = full["TicketLetters"].str.len()
    full["TicketDigitsLen"] = full["TicketDigits"].str.len()
    full["TicketPrefix"] = full["TicketLetters"]
    full["TicketNumeric"] = full["TicketDigits"].astype(int)

    # --- Categorical interaction keys ------------------------------------
    full["SexPclass"] = full["Sex"] + "_" + full["Pclass"].astype(str)
    full["DeckPclass"] = full["CabinDeck"] + "_" + full["Pclass"].astype(str)
    full["EmbarkedPclass"] = full["Embarked"] + "_" + full["Pclass"].astype(str)
    full["TitleSex"] = full["Title"] + "_" + full["Sex"]

    # Up to 8 fare quantile bins; duplicate edges are merged.
    fare_bins = pd.qcut(full["Fare"], q=8, duplicates="drop")
    full["FareBin"] = fare_bins.astype(str).replace("nan", "Missing")

    family_size_group = pd.cut(
        full["FamilySize"],
        bins=[0, 1, 2, 4, 7, np.inf],
        labels=["Solo", "Couple", "Small", "Medium", "Large"],
        right=True
    )
    full["FamilySizeGroup"] = family_size_group.astype(str).replace("nan", "Unknown")

    # --- Split back into train / test ------------------------------------
    train_engineered = full.iloc[: len(train)].reset_index(drop=True)
    test_engineered = full.iloc[len(train):].reset_index(drop=True)
    train_engineered["Survived"] = y.values

    def add_target_rate(train_df, test_df, column, smoothing=18, n_splits=5):
        """Add a smoothed survival-rate encoding of ``column`` to both frames in place.

        Training rows are encoded out-of-fold: each row's value comes from
        category statistics computed on the other folds. Test rows use
        statistics from the whole training frame. Categories unseen in the
        fitting rows receive the prior.

        Args:
            train_df: Training frame containing ``column`` and ``Survived``;
                gains the column ``f"{column}_survival_rate"``.
            test_df: Test frame containing ``column``; gains the same column.
            column: Name of the categorical column to encode.
            smoothing: Weight of the prior, expressed as a pseudo-count.
            n_splits: Number of stratified folds for the out-of-fold pass.
        """
        # NOTE(review): the prior is the mean over all training rows, so it
        # includes the labels of each fold's held-out rows.
        prior = train_df["Survived"].mean()
        feature_name = f"{column}_survival_rate"
        encoded = pd.Series(prior, index=train_df.index, dtype=float)
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

        for tr_idx, val_idx in skf.split(train_df, train_df["Survived"]):
            fold = train_df.iloc[tr_idx]
            stats = fold.groupby(column)["Survived"].agg(["mean", "count"])
            # Shrink each category mean towards the prior; small categories
            # are pulled harder.
            smooth = (stats["mean"] * stats["count"] + prior * smoothing) / (stats["count"] + smoothing)
            encoded.iloc[val_idx] = train_df.iloc[val_idx][column].map(smooth).fillna(prior)

        train_df[feature_name] = encoded
        stats_full = train_df.groupby(column)["Survived"].agg(["mean", "count"])
        smooth_full = (stats_full["mean"] * stats_full["count"] + prior * smoothing) / (stats_full["count"] + smoothing)
        test_df[feature_name] = test_df[column].map(smooth_full).fillna(prior)

    for col in [
        "Title", "CabinDeck", "TicketPrefix", "FamilyId",
        "SexPclass", "DeckPclass", "EmbarkedPclass", "FareBin", "FamilySizeGroup"
    ]:
        add_target_rate(train_engineered, test_engineered, col, smoothing=20)

    binary_cols = ["CabinKnown", "IsAlone", "SharedTicket", "IsChild", "IsTeen", "IsMother", "MultipleCabins", "HasParenthesis", "CabinOdd"]
    for col in binary_cols:
        train_engineered[col] = train_engineered[col].astype(int)
        test_engineered[col] = test_engineered[col].astype(int)

    # Keep the test ids before the identifier column is dropped.
    passenger_ids = test_engineered["PassengerId"].astype(int).copy()

    # NOTE(review): FamilyId, TicketDigits and TicketLetters/TicketPrefix
    # (identical content) stay in the output as raw string columns; confirm
    # that encoding them as categoricals downstream is intended.
    drop_cols = ["PassengerId", "Name", "Ticket", "Cabin", "LastName"]
    train_engineered = train_engineered.drop(columns=drop_cols)
    test_engineered = test_engineered.drop(columns=drop_cols)

    y_final = train_engineered["Survived"].astype(int)
    train_engineered = train_engineered.drop(columns=["Survived"])

    return train_engineered, test_engineered, y_final, passenger_ids

# Run the feature engineering once; the results are module-level names:
# engineered train/test frames, the target, and the test PassengerId values.
X_train, X_test, y, passenger_ids = engineer_datasets(train_df, test_df)
print(f"Train features shape : {X_train.shape}")
print(f"Test features shape  : {X_test.shape}")
# Preview the first engineered training rows.
display(X_train.head())


Train features shape : (891, 54)
Test features shape  : (418, 54)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,NameLength,NameWords,HasParenthesis,FamilySize,IsAlone,FamilySizeSquared,TicketGroupSize,SharedTicket,FarePerPerson,FareLog,FareRank,FamilyId,CabinDeck,CabinKnown,CabinCount,MultipleCabins,CabinNumber,CabinOdd,AgeBin,IsChild,IsTeen,IsMother,Age*Class,Fare*Class,SibSp*Parch,TicketLetters,TicketDigits,TicketLettersLen,TicketDigitsLen,TicketPrefix,TicketNumeric,SexPclass,DeckPclass,EmbarkedPclass,TitleSex,FareBin,FamilySizeGroup,Title_survival_rate,CabinDeck_survival_rate,TicketPrefix_survival_rate,FamilyId_survival_rate,SexPclass_survival_rate,DeckPclass_survival_rate,EmbarkedPclass_survival_rate,FareBin_survival_rate,FamilySizeGroup_survival_rate
0,3,male,22.0,1,0,7.25,S,Mr,23,4,0,2,0,4,1,0,3.625,2.110213,0.082888,Braund_2,M,0,0,0,0,0,18-25,0,0,0,66.0,21.75,0,A,521171,1,6,A,521171,male_3,M_3,S_3,Mr_male,"(-0.001, 7.75]",Couple,0.16669,0.305914,0.225041,0.36556,0.154837,0.242298,0.202219,0.23185,0.524843
1,1,female,38.0,1,0,71.2833,C,Mrs,51,7,1,2,0,4,2,1,35.64165,4.280593,0.883499,Cumings_2,C,1,1,0,85,1,35-45,0,0,0,38.0,71.2833,0,PC,17599,2,5,PC,17599,female_1,C_1,C_1,Mrs_female,"(69.55, 512.329]",Couple,0.707691,0.525406,0.564258,0.383838,0.849229,0.525406,0.629742,0.660166,0.523569
2,3,female,26.0,0,0,7.925,S,Miss,22,3,0,1,1,1,1,0,7.925,2.188856,0.266616,Heikkinen_1,M,0,0,0,0,0,25-35,0,0,0,78.0,23.775,0,STONO,23101282,5,8,STONO,23101282,female_3,M_3,S_3,Miss_female,"(7.896, 9.844]",Solo,0.677212,0.301717,0.37991,0.383838,0.523532,0.247952,0.204228,0.228489,0.30345
3,1,female,35.0,1,0,53.1,S,Mrs,44,7,1,2,0,4,2,1,26.55,3.990834,0.834607,Futrelle_2,C,1,1,0,123,1,35-45,0,0,0,35.0,53.1,0,NONE,113803,4,6,NONE,113803,female_1,C_1,S_1,Mrs_female,"(31.275, 69.55]",Couple,0.731264,0.537177,0.39037,0.36556,0.833443,0.537177,0.566478,0.450711,0.543194
4,3,male,35.0,0,0,8.05,S,Mr,24,4,0,1,1,1,1,0,8.05,2.202765,0.299465,Allen_1,M,0,0,0,0,0,35-45,0,0,0,105.0,24.15,0,NONE,373450,4,6,NONE,373450,male_3,M_3,S_3,Mr_male,"(7.896, 9.844]",Solo,0.16972,0.305914,0.375765,0.413179,0.155589,0.246757,0.207539,0.2328,0.305948


## 4. Model Training and Evaluation

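It's time to build our models: several tree-based pipelines are trained with cross-validation on our processed data, and a logistic-regression meta-model is then fitted on their out-of-fold predictions so we can see how well the ensemble performs.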

In [None]:

# Route every column of X_train to exactly one branch. ColumnTransformer
# drops any column that is not listed (remainder="drop" is its default), so a
# fixed dtype whitelist would silently discard columns of other dtypes
# (e.g. int8, uint8, bool, or a dedicated string dtype). Numeric columns go
# to the numeric branch; everything else is treated as categorical.
numeric_features = X_train.select_dtypes(include="number").columns.tolist()
_numeric_feature_set = set(numeric_features)
categorical_features = [col for col in X_train.columns if col not in _numeric_feature_set]

# Numeric branch: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical branch: most-frequent imputation followed by dense one-hot
# encoding; categories unseen during fit are encoded as all zeros.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocess = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

# Settings common to the two bagged-tree base learners of the stack.
shared_forest_params = dict(
    max_depth=None,
    max_features="sqrt",
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

# Level-0 learners: random forest, extra trees and gradient boosting.
stacking_base_estimators = [
    ("rf", RandomForestClassifier(
        n_estimators=600,
        min_samples_leaf=2,
        bootstrap=True,
        **shared_forest_params
    )),
    ("et", ExtraTreesClassifier(
        n_estimators=850,
        min_samples_leaf=1,
        bootstrap=False,
        **shared_forest_params
    )),
    ("gb", GradientBoostingClassifier(
        n_estimators=420,
        learning_rate=0.045,
        max_depth=3,
        min_samples_leaf=18,
        subsample=0.9,
        random_state=RANDOM_STATE
    )),
]

# Level-1 learner: L2-regularised logistic regression.
stacking_final_estimator = LogisticRegression(
    C=0.7,
    penalty="l2",
    solver="lbfgs",
    max_iter=1500,
    random_state=RANDOM_STATE
)

# Inner 5-fold split used by the stack to build its own level-0 predictions.
stacking_inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# passthrough=True feeds the preprocessed features to the final estimator
# alongside the base learners' class probabilities.
stacking_clf = StackingClassifier(
    estimators=stacking_base_estimators,
    final_estimator=stacking_final_estimator,
    stack_method="predict_proba",
    passthrough=True,
    n_jobs=-1,
    cv=stacking_inner_cv
)

stacking_pipeline = Pipeline(steps=[("preprocess", preprocess), ("model", stacking_clf)])

def _with_preprocess(model):
    """Return a pipeline that runs the shared ``preprocess`` step before ``model``."""
    return Pipeline(steps=[("preprocess", preprocess), ("model", model)])


# Histogram-based gradient boosting.
hist_gb_pipeline = _with_preprocess(HistGradientBoostingClassifier(
    learning_rate=0.07,
    max_depth=4,
    max_iter=400,
    min_samples_leaf=16,
    l2_regularization=0.015,
    max_bins=255,
    random_state=RANDOM_STATE
))

# Classic gradient boosting with mild row subsampling.
gradient_boost_pipeline = _with_preprocess(GradientBoostingClassifier(
    n_estimators=480,
    learning_rate=0.055,
    max_depth=3,
    min_samples_leaf=14,
    subsample=0.95,
    random_state=RANDOM_STATE
))

# Random forest of 900 unbounded-depth trees.
random_forest_pipeline = _with_preprocess(RandomForestClassifier(
    n_estimators=900,
    max_depth=None,
    min_samples_leaf=2,
    max_features="sqrt",
    bootstrap=True,
    n_jobs=-1,
    random_state=RANDOM_STATE
))

# Registry of the level-0 models; insertion order fixes the column order of
# the meta-features built from them.
base_models = dict(
    stacking=stacking_pipeline,
    hist_gb=hist_gb_pipeline,
    grad_boost=gradient_boost_pipeline,
    rand_forest=random_forest_pipeline,
)
base_model_names = list(base_models)

# Outer 10-fold split: scores each base model and produces the out-of-fold
# (OOF) probabilities that later serve as meta-features.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

oof_predictions, test_predictions_cvmean, base_fold_acc = {}, {}, {}
for name in base_model_names:
    oof_predictions[name] = np.zeros(len(X_train), dtype=float)
    test_predictions_cvmean[name] = np.zeros(len(X_test), dtype=float)
    base_fold_acc[name] = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, y), start=1):
    X_tr, y_tr = X_train.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y.iloc[val_idx]

    for name, model in base_models.items():
        # Fresh, unfitted copy for every fold.
        estimator = clone(model).fit(X_tr, y_tr)

        val_proba = estimator.predict_proba(X_val)[:, 1]
        test_proba = estimator.predict_proba(X_test)[:, 1]

        oof_predictions[name][val_idx] = val_proba
        # Accumulate the mean test probability across folds.
        test_predictions_cvmean[name] += test_proba / cv.n_splits

        val_pred_binary = (val_proba >= 0.5).astype(int)
        base_fold_acc[name].append(accuracy_score(y_val, val_pred_binary))

    fold_msg = " | ".join(
        f"{name}: {base_fold_acc[name][-1]:.4f}"
        for name in base_model_names
    )
    print(f"Fold {fold:02d} accuracies -> {fold_msg}")

# Meta-features: one column of OOF probabilities per base model.
meta_feature_names = [f"{name}_proba" for name in base_model_names]
meta_X = np.column_stack([oof_predictions[name] for name in base_model_names])
meta_X_df = pd.DataFrame(meta_X, columns=meta_feature_names)

meta_model_base = LogisticRegression(
    C=0.9,
    penalty="l2",
    solver="lbfgs",
    max_iter=2000,
    random_state=RANDOM_STATE
)

# Accuracy of the meta-model at the default decision rule.
logistic_cv_scores = cross_val_score(meta_model_base, meta_X_df, y, cv=cv, scoring="accuracy", n_jobs=-1)
print(f"\nMeta-logistic CV accuracy (threshold 0.50) : {logistic_cv_scores.mean():.5f} ± {logistic_cv_scores.std():.5f}")

# OOF probabilities of the meta-model itself, used to tune the threshold.
logistic_oof = np.zeros(len(X_train), dtype=float)
for train_idx, val_idx in cv.split(meta_X_df, y):
    estimator = clone(meta_model_base).fit(meta_X_df.iloc[train_idx], y.iloc[train_idx])
    logistic_oof[val_idx] = estimator.predict_proba(meta_X_df.iloc[val_idx])[:, 1]

# Scan candidate thresholds and keep the first one reaching the best OOF
# accuracy; fall back to 0.50 if no threshold scores above zero.
threshold_grid = np.linspace(0.35, 0.65, 61)
y_true = y.values
grid_accuracy = np.array([
    ((logistic_oof >= thr).astype(int) == y_true).mean()
    for thr in threshold_grid
])
best_idx = int(grid_accuracy.argmax())
if grid_accuracy[best_idx] > 0.0:
    best_threshold, best_acc = threshold_grid[best_idx], grid_accuracy[best_idx]
else:
    best_threshold, best_acc = 0.50, 0.0

print(f"Meta-logistic OOF accuracy (optimal threshold {best_threshold:.3f}) : {best_acc:.5f}")

print("\nAccuracy moyenne par modèle de base (seuil 0.50) :")
for name, fold_scores in base_fold_acc.items():
    print(f"  {name:<12} -> {np.mean(fold_scores):.5f}")


Fold 01 accuracies -> stacking: 0.8889 | hist_gb: 0.9111 | grad_boost: 0.8778 | rand_forest: 0.8889
Fold 02 accuracies -> stacking: 0.8090 | hist_gb: 0.7978 | grad_boost: 0.8090 | rand_forest: 0.8090
Fold 03 accuracies -> stacking: 0.8202 | hist_gb: 0.8652 | grad_boost: 0.8652 | rand_forest: 0.8315
Fold 04 accuracies -> stacking: 0.8315 | hist_gb: 0.8315 | grad_boost: 0.8652 | rand_forest: 0.8427
Fold 05 accuracies -> stacking: 0.7978 | hist_gb: 0.7865 | grad_boost: 0.7865 | rand_forest: 0.8090
Fold 06 accuracies -> stacking: 0.7978 | hist_gb: 0.8427 | grad_boost: 0.8539 | rand_forest: 0.8427
Fold 07 accuracies -> stacking: 0.8315 | hist_gb: 0.8427 | grad_boost: 0.8539 | rand_forest: 0.8652
Fold 08 accuracies -> stacking: 0.8202 | hist_gb: 0.8202 | grad_boost: 0.8427 | rand_forest: 0.8315
Fold 09 accuracies -> stacking: 0.8427 | hist_gb: 0.8202 | grad_boost: 0.8315 | rand_forest: 0.8539
Fold 10 accuracies -> stacking: 0.8315 | hist_gb: 0.8427 | grad_boost: 0.8539 | rand_forest: 0.8315


## 5. Create Submission File

Finally, we'll use our trained model to make predictions on the test set and generate the submission file in the required format.

In [None]:
    
# Final meta-model: fitted on the out-of-fold meta-features.
meta_model_final = clone(meta_model_base).fit(meta_X_df, y)

# Refit every base model on the full training set.
fitted_base_models = {
    name: clone(model).fit(X_train, y)
    for name, model in base_models.items()
}

# Positive-class probabilities of each refitted model, in registry order.
train_meta_full = [est.predict_proba(X_train)[:, 1] for est in fitted_base_models.values()]
test_meta_full = [est.predict_proba(X_test)[:, 1] for est in fitted_base_models.values()]

train_meta_matrix = np.column_stack(train_meta_full)
test_meta_matrix = np.column_stack(test_meta_full)

train_meta_df = pd.DataFrame(train_meta_matrix, columns=meta_feature_names)
test_meta_df = pd.DataFrame(test_meta_matrix, columns=meta_feature_names)

# In-sample check: the base models have already seen these rows.
train_meta_proba = meta_model_final.predict_proba(train_meta_df)[:, 1]
train_meta_pred = (train_meta_proba >= best_threshold).astype(int)
train_meta_accuracy = (train_meta_pred == y.values).mean()
print(f"Accuracy sur l'ensemble d'entraînement (ré-entraînement complet, seuil {best_threshold:.3f}) : {train_meta_accuracy:.5f}")

# Test predictions at the threshold tuned on OOF probabilities.
test_meta_proba = meta_model_final.predict_proba(test_meta_df)[:, 1]
test_predictions = (test_meta_proba >= best_threshold).astype(int)

# Two-column submission table written without the index.
submission_columns = {"PassengerId": passenger_ids, "Survived": test_predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)
print("✅ Fichier 'submission.csv' généré avec succès.")
display(submission.head())

# Meta-model coefficient per base-model probability, largest first.
raw_weights = meta_model_final.coef_.ravel()
meta_coefficients = pd.Series(raw_weights, index=meta_feature_names).sort_values(ascending=False)
print("\nPoids du méta-modèle (influence de chaque base) :")
display(meta_coefficients)

# First rows only: fold-averaged base probabilities next to the final
# meta-model probability.
preview_columns = {
    "stacking_cv_mean": test_predictions_cvmean["stacking"],
    "hist_gb_cv_mean": test_predictions_cvmean["hist_gb"],
    "grad_boost_cv_mean": test_predictions_cvmean["grad_boost"],
    "rand_forest_cv_mean": test_predictions_cvmean["rand_forest"],
    "meta_proba_final": test_meta_proba
}
ensemble_preview = pd.DataFrame(preview_columns).head()
print("\nAperçu des probabilités (moyennes CV vs finale) :")
display(ensemble_preview)

print("Prêt pour une nouvelle soumission Kaggle 🚀")


Accuracy sur l'ensemble d'entraînement (ré-entraînement complet, seuil 0.565) : 0.94725
✅ Fichier 'submission.csv' généré avec succès.


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1



Poids du méta-modèle (influence de chaque base) :


rand_forest_proba    2.450650
grad_boost_proba     2.111053
stacking_proba       1.279295
hist_gb_proba       -0.036341
dtype: float64


Aperçu des probabilités (moyennes CV vs finale) :


Unnamed: 0,stacking_cv_mean,hist_gb_cv_mean,grad_boost_cv_mean,rand_forest_cv_mean,meta_proba_final
0,0.403472,0.065109,0.136784,0.137821,0.163146
1,0.254312,0.044014,0.15513,0.463287,0.248203
2,0.033346,0.057286,0.051664,0.148653,0.096046
3,0.075851,0.273202,0.184718,0.120745,0.12107
4,0.578056,0.917643,0.869778,0.553879,0.775073


Prêt pour une nouvelle soumission Kaggle 🚀
