In [1]:
# Install required packages

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline

# Preprocessing function
def preprocess_data(data, train_params):
    df = data.copy()
    
    # Basic feature engineering
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    
    # Title extraction with normalization
    df['Title'] = df['Name'].str.extract(r',\s*([^\.]+)\.', expand=False).str.strip()
    df['Title'] = df['Title'].replace(train_params['rare_titles'], 'Rare')
    df['Title'] = df['Title'].map(train_params['title_mapping']).fillna(-1)
    
    # Cabin-based features
    df['Deck'] = df['Cabin'].str[0].fillna('Unknown')
    df['Deck'] = df['Deck'].map(train_params['deck_mapping'])
    
    # Fare transformations
    df['Fare'] = df['Fare'].fillna(train_params['fare_median'])
    df['LogFare'] = np.log1p(df['Fare'])
    
    # Age processing
    df['Age'] = df['Age'].fillna(train_params['age_median'])
    df['Age'] = df['Age'].clip(upper=train_params['age_clip'])
    
    # Categorical encoding
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2}).fillna(1)

    # Drop unnecessary columns
    return df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Fare'])

# Load data
train = pd.read_csv("/kaggle/input/c/titanic/train.csv")
test = pd.read_csv("/kaggle/input/c/titanic/test.csv")

# Calculate training parameters for preprocessing
train_params = {
    'title_mapping': {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Rare': 4},
    'rare_titles': ['Dr', 'Rev', 'Col', 'Major', 'Countess', 'Lady', 'Jonkheer', 'Dona', 'Don', 'Mme', 'Ms', 'Capt', 'Mlle', 'Sir'],
    'deck_mapping': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7, 'Unknown': -1},
    'fare_median': train['Fare'].median(),
    'age_median': train['Age'].median(),
    'age_clip': train['Age'].quantile(0.99)
}

# Preprocess data
X_train = preprocess_data(train, train_params). drop(columns=['Survived'])
y_train = train['Survived']
X_test = preprocess_data(test, train_params)
X_train = X_train[sorted(X_train.columns)]
X_test = X_test[sorted(X_train.columns)]
# Create model pipeline
pipeline = imbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('selector', SelectKBest(mutual_info_classif, k='all')),
    ('ensemble', VotingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(random_state=42)),
            ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')),
            ('svc', SVC(probability=True, random_state=42))
        ],
        voting='soft'
    ))
])

# Hyperparameter grid
param_grid = {
    'ensemble__rf__n_estimators': [200, 300],
    'ensemble__rf__max_depth': [None, 15],
    'ensemble__xgb__max_depth': [3, 5],
    'ensemble__xgb__learning_rate': [0.01, 0.1],
    'ensemble__svc__C': [1, 10]
}

# Configure and run grid search
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train, y_train)
# Final evaluation
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best CV Accuracy: {grid_search.best_score_:.2f}")

# Generate predictions
test_pred = best_model.predict(X_test)
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': test_pred
})
submission. to_csv("submission.csv", index=False)

Fitting 10 folds for each of 32 candidates, totalling 320 fits
Best Parameters: {'ensemble__rf__max_depth': 15, 'ensemble__rf__n_estimators': 200, 'ensemble__svc__C': 1, 'ensemble__xgb__learning_rate': 0.1, 'ensemble__xgb__max_depth': 3}
Best CV Accuracy: 0.83
