In [3]:
# ==== FE nâng cao + Stacking để tăng điểm Kaggle ====
import re
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
import pandas as pd
# 1) Feature engineering nâng cao

def extract_title(name: str) -> str:
    if pd.isna(name):
        return "Unknown"
    m = re.search(r" ([A-Za-z]+)\.", str(name))
    if not m:
        return "Unknown"
    title = m.group(1)
    mapping = {
        'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
        'Lady': 'Rare', 'the Countess': 'Rare', 'Capt': 'Rare', 'Col': 'Rare',
        'Don': 'Rare', 'Dr': 'Rare', 'Major': 'Rare', 'Rev': 'Rare',
        'Sir': 'Rare', 'Jonkheer': 'Rare', 'Dona': 'Rare'
    }
    return mapping.get(title, title)

def engineer_features(df: pd.DataFrame, is_train: bool = True) -> pd.DataFrame:
    out = df.copy()
    # Cabin/Deck
    out['Deck'] = out['Cabin'].fillna('U').astype(str).str[0]
    # Gia đình
    out['FamilySize'] = out.get('SibSp', 0) + out.get('Parch', 0) + 1
    out['IsAlone'] = (out['FamilySize'] == 1).astype(int)
    out['FamilyCat'] = pd.cut(out['FamilySize'], bins=[0,1,3,4,20], labels=['Singleton','Small','Medium','Large'], include_lowest=True)
    # Title
    out['Title'] = out['Name'].apply(extract_title)
    # Vé
    out['TicketLen'] = out['Ticket'].astype(str).str.len()
    out['TicketFreq'] = out.groupby('Ticket')['Ticket'].transform('count')
    out['Companions'] = (out['TicketFreq'] - 1).clip(lower=0)
    # Fare per person + bins
    out['Fare'] = out['Fare'].replace(0, np.nan)
    denom = out['FamilySize'].replace(0, 1)
    out['FarePerPerson'] = out['Fare'] / denom
    out['FareCat'] = pd.qcut(out['Fare'].fillna(out['Fare'].median()), q=8, labels=False, duplicates='drop')
    # Age bins
    out['AgeBin'] = pd.cut(out['Age'], bins=[-1,5,12,18,30,45,60,80,120], labels=False)
    # Vai trò
    out['SexNum'] = (out['Sex'] == 'male').astype(int)
    out['IsChild'] = ((out['Age'] < 16).astype(float)).fillna(0).astype(int)
    out['IsMother'] = ((out['Sex'] == 'female') & (out.get('Parch', 0) > 0) & (out['Title'] != 'Miss')).astype(int)
    # Tương tác đơn giản
    if 'Pclass' in out.columns:
        out['AgeTimesClass'] = out['Age'].fillna(out['Age'].median()) * out['Pclass']
        out['FareTimesClass'] = out['Fare'].fillna(out['Fare'].median()) * out['Pclass']
    return out

# Tạo bản FE cho train/test gốc (đã có biến train, test trước đó)
train_raw = pd.read_csv('input/train.csv')
test_raw  = pd.read_csv('input/test.csv')
train_fe = engineer_features(train_raw, is_train=True)
test_fe = engineer_features(test_raw, is_train=False)

# Mục tiêu
y = train_fe['Survived'].astype(int)
X = train_fe.drop(columns=['Survived'])
X_submit = test_fe.copy()

# 2) Preprocess
numeric_cols = [c for c in X.columns if X[c].dtype != 'object']
categorical_cols = [c for c in X.columns if X[c].dtype == 'object']
preprocess = ColumnTransformer([
    ('num', Pipeline([
        ('imp', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numeric_cols),
    ('cat', Pipeline([
        ('imp', SimpleImputer(strategy='most_frequent')),
        ('oh', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ]), categorical_cols)
])

# 3) Base models
rf = RandomForestClassifier(n_estimators=800, max_depth=10, random_state=42, n_jobs=-1)
gx = xgb.XGBClassifier(n_estimators=900, max_depth=5, learning_rate=0.03, subsample=0.8, colsample_bytree=0.8,
                        tree_method='hist', eval_metric='logloss', n_jobs=-1, random_state=42)
gb = GradientBoostingClassifier(n_estimators=400, learning_rate=0.05, max_depth=3, random_state=42)

pipes = [
    Pipeline([('pp', preprocess), ('mdl', rf)]),
    Pipeline([('pp', preprocess), ('mdl', gx)]),
    Pipeline([('pp', preprocess), ('mdl', gb)])
]

# 4) OOF stacking
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_list = [np.zeros(len(X)) for _ in pipes]
test_pred_list = [np.zeros(len(X_submit)) for _ in pipes]

for tr_idx, va_idx in cv.split(X, y):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
    for i, base in enumerate(pipes):
        m = clone(base)
        m.fit(X_tr, y_tr)
        oof_list[i][va_idx] = m.predict_proba(X_va)[:, 1]
        test_pred_list[i] += m.predict_proba(X_submit)[:, 1] / cv.get_n_splits()

OOF = np.vstack(oof_list).T
TEST = np.vstack(test_pred_list).T

# 5) Meta-learner + threshold tuning
meta = xgb.XGBClassifier(n_estimators=250, max_depth=3, learning_rate=0.08, subsample=0.9, colsample_bytree=0.9,
                         tree_method='hist', eval_metric='logloss', random_state=42, n_jobs=-1)
meta.fit(OOF, y)
oof_proba = meta.predict_proba(OOF)[:, 1]

def tune_threshold(y_true, proba):
    best_thr, best_acc = 0.5, 0.0
    for thr in np.linspace(0.3, 0.7, 81):
        pred = (proba >= thr).astype(int)
        acc = (pred == y_true.values).mean()
        if acc > best_acc:
            best_acc, best_thr = acc, thr
    return best_thr, best_acc

thr, acc = tune_threshold(y, oof_proba)
print(f"OOF acc(meta): {acc:.4f} | best thr: {thr:.3f}")

# 6) Dự đoán test và xuất submission
TEST_PROBA = meta.predict_proba(TEST)[:, 1]
preds = (TEST_PROBA >= thr).astype(int)
sub = pd.DataFrame({'PassengerId': test_raw['PassengerId'], 'Survived': preds.astype(int)})
sub.to_csv('submission_stacked.csv', index=False)
print('Saved submission_stacked.csv')


NameError: name 'np' is not defined