In [None]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.svm import SVC

# 1. Load data
# Both Kaggle splits are read from the working directory in a single pass.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))


def _extract_title(names):
    """Parse the honorific out of each passenger name and normalise it.

    Uncommon honorifics are collapsed into "Rare"; Mlle/Ms are folded into
    Miss and Mme into Mrs.
    """
    title = names.str.extract(r" ([A-Za-z]+)\.", expand=False)
    title = title.replace(
        [
            "Lady",
            "Countess",
            "Capt",
            "Col",
            "Don",
            "Dr",
            "Major",
            "Rev",
            "Sir",
            "Jonkheer",
            "Dona",
        ],
        "Rare",
    )
    return title.replace({"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"})


def preprocess(df, train_ref=None):
    """Engineer features, impute missing values and cap outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing the columns read below: Name, Cabin, Ticket, SibSp,
        Parch, Embarked, Fare, Age and Pclass. It is copied, never modified.
    train_ref : pandas.DataFrame, optional
        Frame that supplies every imputation and capping statistic. When it
        is omitted, or is ``df`` itself, the statistics come from ``df``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with Title, Deck, TicketPrefix, FamilySize, IsAlone,
        FarePerPerson and AgeClass added, Embarked/Fare/Age imputed, and
        Fare/Age clipped to the reference frame's 1st-99th percentile range.
    """
    # Must be decided before copying: afterwards `train_ref is df` is never true.
    self_ref = train_ref is None or train_ref is df
    df = df.copy()

    # Title
    df["Title"] = _extract_title(df["Name"])
    # Deck
    df["Deck"] = df["Cabin"].str[0].fillna("N")
    # Ticket prefix
    df["TicketPrefix"] = (
        df["Ticket"].str.extract(r"([A-Za-z\.\/]+)", expand=False).fillna("None")
    )
    # Family size
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

    # Reference frame for all statistics; derive its titles if it has not
    # been preprocessed yet.
    ref = df if self_ref else train_ref
    if "Title" in ref.columns:
        ref_title = ref["Title"]
    else:
        ref_title = _extract_title(ref["Name"])

    # Fill missing
    df["Embarked"] = df["Embarked"].fillna(ref["Embarked"].mode()[0])
    df["Fare"] = df["Fare"].fillna(ref["Fare"].median())
    # Age by Title, falling back to the overall reference mean for titles the
    # reference frame has no age for (otherwise NaN would survive).
    age_by_title = ref["Age"].groupby(ref_title).mean()
    age_fallback = ref["Age"].mean()
    df["Age"] = df["Age"].fillna(df["Title"].map(age_by_title)).fillna(age_fallback)

    # Outlier capping: bounds come from the reference frame so every split
    # is clipped to the same range. In the self-reference case `ref` is the
    # working copy, so the bounds are taken after imputation.
    fare_lo, fare_hi = ref["Fare"].quantile([0.01, 0.99])
    df["Fare"] = df["Fare"].clip(fare_lo, fare_hi)
    age_lo, age_hi = ref["Age"].quantile([0.01, 0.99])
    df["Age"] = df["Age"].clip(age_lo, age_hi)

    # New features
    df["FarePerPerson"] = df["Fare"] / df["FamilySize"]
    df["AgeClass"] = df["Age"] * df["Pclass"]
    return df


# Apply preprocessing
# Train goes first so the test call receives the processed train frame as its reference.
df_train = preprocess(df_train, train_ref=df_train)
df_test = preprocess(df_test, train_ref=df_train)

# 2. Encoding categorical features with unified LabelEncoder
# Each encoder is fitted on train and test values together, so every category
# present in either split has an integer code.
categorical_cols = ["Sex", "Embarked", "Title", "Deck", "TicketPrefix"]
for col in categorical_cols:
    combined = pd.concat([df_train[col], df_test[col]], axis=0)
    le = LabelEncoder().fit(combined)
    for frame in (df_train, df_test):
        frame[col] = le.transform(frame[col])

# Standardize numerical columns
# The scaler is fitted on the training rows only and then applied to both splits.
numeric_cols = ["Age", "Fare", "FamilySize", "FarePerPerson", "AgeClass"]
scaler = StandardScaler().fit(df_train[numeric_cols])
df_train[numeric_cols] = scaler.transform(df_train[numeric_cols])
df_test[numeric_cols] = scaler.transform(df_test[numeric_cols])

# 3. Prepare datasets
# Column order here is the column order the models are trained and scored on.
features = [
    "Pclass", "Sex", "Age", "Fare",
    "FamilySize", "IsAlone", "FarePerPerson", "AgeClass",
    "Embarked", "Title", "Deck", "TicketPrefix",
]
X_train = df_train[features]
y_train = df_train["Survived"]
X_test = df_test[features]

# 4. Model & Stacking
# Four base learners, all seeded with the same random_state.
lgbm = lgb.LGBMClassifier(random_state=42, verbosity=-1)
# NOTE(review): use_label_encoder appears to be deprecated in recent XGBoost
# releases — confirm against the installed version before removing it.
xgb = XGBClassifier(random_state=42, eval_metric="logloss", use_label_encoder=False)
svc = SVC(random_state=42, probability=True)
etc = ExtraTreesClassifier(random_state=42, n_estimators=100)

# A logistic regression combines the base learners' cross-validated predictions.
base_learners = [("lgbm", lgbm), ("xgb", xgb), ("svc", svc), ("etc", etc)]
stack = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
)

# 5. Hyperparameter search
# Keys use the "<estimator name>__<parameter>" form to reach into the stack.
param_dist = {
    "lgbm__n_estimators": [100, 300, 500],
    "lgbm__learning_rate": [0.01, 0.05, 0.1],
    "xgb__max_depth": [3, 5, 7],
    "svc__C": [0.1, 1, 10],
    "etc__n_estimators": [50, 100, 200],
}
search_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
search = RandomizedSearchCV(
    estimator=stack,
    param_distributions=param_dist,
    n_iter=30,
    cv=search_cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
search.fit(X_train, y_train)

# Keep the refitted winner and report its cross-validated score.
best = search.best_estimator_
print("Best Params:", search.best_params_)
print("CV Accuracy:", round(search.best_score_, 4))

# 6. Predict & Save
preds = best.predict(X_test)
# Two-column submission: passenger id alongside the predicted label.
submission = pd.DataFrame({"PassengerId": df_test["PassengerId"], "Survived": preds})
submission.to_csv("submission_full_pipeline.csv", index=False)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Deck,TicketPrefix,FamilySize,IsAlone,FarePerPerson,AgeClass
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,2,2,7,3,2,0,3.625,66.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0,3,2,18,2,0,35.64165,38.0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,2,1,7,38,1,1,7.925,78.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2,3,2,16,2,0,26.55,35.0
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,2,2,7,16,1,1,8.05,105.0


In [4]:
# Print the test feature matrix's column names, non-null counts and dtypes.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Pclass         418 non-null    int64  
 1   Sex            418 non-null    int64  
 2   Age            418 non-null    float64
 3   Fare           418 non-null    float64
 4   FamilySize     418 non-null    float64
 5   IsAlone        418 non-null    int64  
 6   FarePerPerson  418 non-null    float64
 7   AgeClass       418 non-null    float64
 8   Embarked       418 non-null    int64  
 9   Title          418 non-null    int64  
 10  Deck           418 non-null    int64  
 11  TicketPrefix   418 non-null    int64  
dtypes: float64(5), int64(7)
memory usage: 39.3 KB
