In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


In [2]:
# Carregar Titanic
df = pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")

df = df[['Survived','Pclass','Sex','Age','Fare','Embarked']]  # features reais
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

X = df.drop('Survived', axis=1)
y = df['Survived']


In [3]:
numeric_cols = ['Age','Fare','Pclass']
categorical_cols = ['Sex','Embarked']

preprocess = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

X_processed = preprocess.fit_transform(X)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.3, random_state=42, stratify=y
)


In [5]:
def eval_classification(y_true, y_pred, name="Modelo"):
    print(f"\n=== {name} ===")
    print("Accuracy:",  accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:",    recall_score(y_true, y_pred))
    print("F1:",        f1_score(y_true, y_pred))
    print("MSE:",       mean_squared_error(y_true, y_pred))


In [6]:
def get_oof_preds(clf, X_train, y_train, X_test, n_splits=5, name="model"):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    oof_pred = np.zeros((X_train.shape[0], 2))
    test_pred = np.zeros((X_test.shape[0], 2, n_splits))
    
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_train, y_train)):
        X_tr, X_val = X_train[train_idx], X_train[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]
        
        model = clf
        model.fit(X_tr, y_tr)
        
        oof_pred[valid_idx] = model.predict_proba(X_val)
        test_pred[:,:,fold] = model.predict_proba(X_test)
    
    print(f"OOF gerado para {name}")
    return oof_pred, test_pred.mean(axis=2)


In [7]:
base_models_level1 = {
    "xgb": XGBClassifier(
        n_estimators=200, learning_rate=0.1, max_depth=3,
        subsample=0.8, colsample_bytree=0.8,
        objective="binary:logistic", eval_metric="logloss",
        random_state=42
    ),
    "lgbm": LGBMClassifier(
        n_estimators=200, learning_rate=0.1,
        subsample=0.8, colsample_bytree=0.8,
        random_state=42
    ),
    "cat": CatBoostClassifier(
        iterations=200, learning_rate=0.1, depth=4,
        loss_function='Logloss', verbose=False,
        random_state=42
    ),
    "rf": RandomForestClassifier(
        n_estimators=200, random_state=42
    ),
    "svm": SVC(
        probability=True, kernel='rbf', C=1.0, gamma='scale',
        random_state=42
    ),
    "logreg": LogisticRegression(max_iter=1000),
    "tree": DecisionTreeClassifier(max_depth=4, random_state=42)
}


In [8]:
oof_level1 = []
test_level1 = []

for name, model in base_models_level1.items():
    oof_pred, test_pred = get_oof_preds(
        model, X_train, y_train, X_test, name=name
    )
    oof_level1.append(oof_pred)
    test_level1.append(test_pred)

# Concatenar previsÃµes da camada 1
X_train_level2 = np.hstack(oof_level1)
X_test_level2  = np.hstack(test_level1)

print("Shape camada 1 -> camada 2:")
print(X_train_level2.shape, X_test_level2.shape)


OOF gerado para xgb
[LightGBM] [Info] Number of positive: 191, number of negative: 307
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000115 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 164
[LightGBM] [Info] Number of data points in the train set: 498, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383534 -> initscore=-0.474574
[LightGBM] [Info] Start training from score -0.474574
[LightGBM] [Info] Number of positive: 191, number of negative: 307
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000059 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 498, number of used features: 8
[LightGBM] [In



[LightGBM] [Info] Number of positive: 191, number of negative: 308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000050 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 167
[LightGBM] [Info] Number of data points in the train set: 499, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.382766 -> initscore=-0.477826
[LightGBM] [Info] Start training from score -0.477826
OOF gerado para lgbm




OOF gerado para cat
OOF gerado para rf
OOF gerado para svm
OOF gerado para logreg
OOF gerado para tree
Shape camada 1 -> camada 2:
(623, 14) (268, 14)


In [9]:
models_level2 = {
    "xgb_l2": XGBClassifier(
        n_estimators=150, learning_rate=0.1, max_depth=3,
        subsample=0.8, colsample_bytree=0.8,
        objective="binary:logistic", eval_metric="logloss",
        random_state=42
    ),
    "lgbm_l2": LGBMClassifier(
        n_estimators=150, learning_rate=0.1,
        subsample=0.8, colsample_bytree=0.8,
        random_state=42
    ),
    "cat_l2": CatBoostClassifier(
        iterations=150, learning_rate=0.1, depth=4,
        loss_function='Logloss', verbose=False,
        random_state=42
    ),
    "logreg_l2": LogisticRegression(max_iter=1000)
}


In [10]:
oof_level2 = []
test_level2 = []

for name, model in models_level2.items():
    oof_pred, test_pred = get_oof_preds(
        model, X_train_level2, y_train, X_test_level2, name=name
    )
    oof_level2.append(oof_pred)
    test_level2.append(test_pred)

# Concatenar previsÃµes da camada 2
X_train_level3 = np.hstack(oof_level2)
X_test_level3  = np.hstack(test_level2)

print("Shape camada 2 -> camada 3:")
print(X_train_level3.shape, X_test_level3.shape)


OOF gerado para xgb_l2
[LightGBM] [Info] Number of positive: 191, number of negative: 307
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000101 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1946
[LightGBM] [Info] Number of data points in the train set: 498, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383534 -> initscore=-0.474574
[LightGBM] [Info] Start training from score -0.474574
[LightGBM] [Info] Number of positive: 191, number of negative: 307
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000101 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1954
[LightGBM] [Info] Number of data points in the train set: 498, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383534 -> initscore=-0.474574
[LightGBM] [Info] Start training from score -0.474574
[



[LightGBM] [Info] Number of positive: 191, number of negative: 308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000108 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1948
[LightGBM] [Info] Number of data points in the train set: 499, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.382766 -> initscore=-0.477826
[LightGBM] [Info] Start training from score -0.477826
OOF gerado para lgbm_l2




OOF gerado para cat_l2
OOF gerado para logreg_l2
Shape camada 2 -> camada 3:
(623, 8) (268, 8)


In [11]:
meta_model = LogisticRegression(max_iter=2000)

meta_model.fit(X_train_level3, y_train)
y_pred_final = meta_model.predict(X_test_level3)

eval_classification(y_test, y_pred_final, "Stacking Manual 3 Camadas (Final)")



=== Stacking Manual 3 Camadas (Final) ===
Accuracy: 0.8395522388059702
Precision: 0.8947368421052632
Recall: 0.6601941747572816
F1: 0.7597765363128491
MSE: 0.16044776119402984


In [12]:
baseline = XGBClassifier(
    n_estimators=200, learning_rate=0.1, max_depth=3,
    subsample=0.8, colsample_bytree=0.8,
    objective="binary:logistic", eval_metric="logloss",
    random_state=42
)

baseline.fit(X_train, y_train)
y_pred_base = baseline.predict(X_test)

eval_classification(y_test, y_pred_base, "Baseline (XGB sozinho)")



=== Baseline (XGB sozinho) ===
Accuracy: 0.8059701492537313
Precision: 0.8072289156626506
Recall: 0.6504854368932039
F1: 0.7204301075268817
MSE: 0.19402985074626866
