In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier 
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.impute import KNNImputer, SimpleImputer
from imblearn.pipeline import Pipeline as imbPipeline, Pipeline 
from imblearn.over_sampling import ADASYN
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler
def enhanced_preprocess(df):
    df = df.copy()
    
    # Advanced feature engineering
    df['Title'] = df['Name'].str.extract(r',\s*([^\.]+)\.', expand=False).str.strip()
    title_map = {'Mr':0, 'Miss':1, 'Mrs':2, 'Master':3, 'Rare':4}
    df['Title'] = df['Title'].replace(['Dr','Rev','Col','Major','Countess','Lady',
                                      'Jonkheer','Dona','Don','Mme','Ms','Capt',
                                      'Mlle','Sir'], 'Rare').map(title_map)
    
    # Family features
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']
    
    # Cabin features
    df['Deck'] = df['Cabin'].str[0].fillna('Unknown')
    df['Deck'] = df['Deck'].map({'A':0, 'B':1, 'C':2, 'D':3, 'E':4, 
                                'F':5, 'G':6, 'T':7, 'Unknown':-1})
    
    # Binning strategies
    df['AgeBin'] = pd.cut(df['Age'], [0,12,18,30,50,100], labels=[0,1,2,3,4])
    df['FareBin'] = pd.qcut(df['Fare'], 5, labels=[0,1,2,3,4])
    
    # Smart imputation
    imputer = KNNImputer(n_neighbors=5)
    num_features = ['Age','Fare','SibSp','Parch']
    df[num_features] = imputer.fit_transform(df[num_features])
    
    # Categorical encoding
    df['Sex'] = df['Sex'].map({'male':0, 'female':1})
    df['Embarked'] = df['Embarked'].map({'C':0, 'Q':1, 'S':2}).fillna(1)
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in categorical_cols:
        # Use mode to fill missing values
        df[col] = df[col].fillna(df[col].mode()[0])
    # Final columns
    return df.drop(columns=['PassengerId','Name','Ticket','Cabin'])

# Load and preprocess data
train = pd.read_csv("/kaggle/input/c/titanic/train.csv")
test = pd.read_csv("/kaggle/input/c/titanic/test.csv")

X_train = enhanced_preprocess(train).drop(columns=['Survived'])
y_train = train['Survived']
X_test = enhanced_preprocess(test)

estimators = [
    ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')),
    ('lgbm', LGBMClassifier(random_state=42)),
    ('gbc', GradientBoostingClassifier(random_state=42))
]
final_estimator = LogisticRegression(max_iter=1000)

stacker = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
    passthrough=True,  # include original features for final estimator
    cv=5
   ) 
param_grid = {
    'model__xgb__max_depth': [3, 5, 7],
    'model__xgb__learning_rate': [0.01, 0.05, 0.1],
    'model__lgbm__num_leaves': [31, 63],
    'model__gbc__n_estimators': [200, 300],
    'model__final_estimator__C': [0.1, 1, 10]
}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
pipeline = imbPipeline([
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values
    ('adasyn', ADASYN(sampling_strategy=0.8, random_state=42)),
    ('scaler', RobustScaler()),
    ('model', stacker)  # Ensure stacker is defined correctly
]) 
# Optimized model pipeline
model = Pipeline([
    ('classifier', GradientBoostingClassifier(random_state=42))
])

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train, y_train) 

cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=42
)

best_model = grid_search.best_estimator_
print(f"Best CV Accuracy: {grid_search.best_score_:.2f}")

# Generate predictions
test_pred = best_model.predict(X_test)
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': test_pred
})
submission.to_csv("submission.csv", index=False)




Fitting 10 folds for each of 108 candidates, totalling 1080 fits
[LightGBM] [Info] Number of positive: 410, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000233 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 715
[LightGBM] [Info] Number of data points in the train set: 959, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.427529 -> initscore=-0.291941
[LightGBM] [Info] Start training from score -0.291941
[LightGBM] [Info] Number of positive: 328, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000181 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 581
[LightGBM] [Info] Number of data points in the train set: 