# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, make_scorer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from scipy.stats import randint, uniform

class SpaceshipPreprocessor:
    """Feature engineering for Spaceship Titanic passenger tables.

    The class keeps no fitted state: every derived column is computed from
    the single frame handed to :meth:`create_features`. Frame-level
    statistics (``GroupSize`` counts, ``AgeBin`` quantile edges) therefore
    reflect only that frame.
    """

    def __init__(self):
        # Spending columns that feed all of the spend-based features.
        self.group_expenses = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    @staticmethod
    def _split_cabin(cabin):
        """Split '/'-separated cabin strings into exactly three positional columns."""
        if cabin.notna().any():
            parts = cabin.str.split('/', expand=True)
        else:
            # An all-missing column may not be string-typed, so avoid ``.str``.
            parts = pd.DataFrame(index=cabin.index)
        # Pad (or trim) to three columns so short or empty splits still
        # map cleanly onto Deck / Cabin_num / Side.
        return parts.reindex(columns=range(3))

    def create_features(self, df):
        """Return a copy of ``df`` with engineered feature columns added.

        The input frame is not modified. The spending columns listed in
        ``self.group_expenses`` must be present; ``VIP``, ``CryoSleep``,
        ``Cabin``, ``Name`` and ``Age`` are each processed only when present.

        Added columns:
            * ``Deck``, ``Cabin_num``, ``Side`` - parts of ``Cabin`` (which is
              then dropped); ``Cabin_num`` is coerced to numeric.
            * ``TotalSpend``, ``HasSpent``, ``NumServicesUsed``,
              ``AvgSpendPerService``, ``MaxSpend``, ``SpendingVariety``.
            * ``GroupSize`` - number of rows in this frame sharing the last
              whitespace-separated token of ``Name`` (``Name`` is dropped).
            * ``IsChild``, ``IsAdult``, ``IsSenior``, ``AgeBin``.

        Parameters
        ----------
        df : pandas.DataFrame
            Raw passenger table.

        Returns
        -------
        pandas.DataFrame
            New frame containing the original columns (minus ``Cabin`` and
            ``Name``) plus the engineered ones.
        """
        df = df.copy()

        # Missing spend is treated as zero spend.
        df[self.group_expenses] = df[self.group_expenses].fillna(0)
        for col in ['VIP', 'CryoSleep']:
            if col in df.columns:
                # Missing flags are treated as False, then encoded as 0/1.
                df[col] = df[col].fillna(False).astype(int)

        if 'Cabin' in df.columns:
            parts = self._split_cabin(df['Cabin'])
            df['Deck'] = parts[0]
            df['Cabin_num'] = pd.to_numeric(parts[1], errors='coerce')
            df['Side'] = parts[2]
            df = df.drop('Cabin', axis=1)

        # Spend aggregates. The count of services with positive spend is
        # computed once, vectorised, and reused for both count columns.
        spend = df[self.group_expenses]
        services_used = (spend > 0).sum(axis=1)
        df['TotalSpend'] = spend.sum(axis=1)
        df['HasSpent'] = (df['TotalSpend'] > 0).astype(int)
        df['NumServicesUsed'] = services_used
        # The small epsilon avoids division by zero for non-spenders.
        df['AvgSpendPerService'] = df['TotalSpend'] / (services_used + 1e-6)
        df['MaxSpend'] = spend.max(axis=1)
        df['SpendingVariety'] = services_used

        if 'Name' in df.columns:
            # Missing names produce a missing last name and hence a missing
            # GroupSize, because value_counts() ignores missing values.
            last_name = df['Name'].fillna('').str.split().str[-1]
            df['GroupSize'] = last_name.map(last_name.value_counts())
            df = df.drop(columns=['Name', 'LastName'], errors='ignore')

        if 'Age' in df.columns:
            age = df['Age']
            # Comparisons with a missing age are False, so such rows get 0
            # in all three indicator columns.
            df['IsChild'] = (age < 13).astype(int)
            df['IsAdult'] = ((age >= 13) & (age < 65)).astype(int)
            df['IsSenior'] = (age >= 65).astype(int)

            filled_age = age.fillna(age.median())
            if filled_age.notna().any():
                # duplicates='drop' merges identical quantile edges instead
                # of raising when many passengers share the same age.
                df['AgeBin'] = pd.qcut(filled_age, q=5, labels=False, duplicates='drop')
            else:
                # No usable ages at all: there is nothing to bin.
                df['AgeBin'] = np.nan

        return df

def create_model_pipeline(use_voting=True):
    """Build the preprocessing + classifier pipeline.

    Parameters
    ----------
    use_voting : bool, default True
        When True the final step is a soft-voting ensemble of XGBoost,
        random forest and LightGBM classifiers (named ``xgb``, ``rf`` and
        ``lgbm``). When False it is a single XGBoost classifier with the
        same settings as the ensemble's ``xgb`` member.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``preprocessor`` and ``classifier``.
    """
    # Column routing. NOTE(review): columns outside these three lists
    # (e.g. 'AgeBin') are not handed to any transformer - confirm intended.
    numerical_cols = [
        'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
        'Cabin_num', 'TotalSpend', 'NumServicesUsed', 'AvgSpendPerService',
        'MaxSpend', 'SpendingVariety', 'GroupSize',
    ]
    categorical_cols = ['HomePlanet', 'Destination', 'Deck', 'Side']
    binary_cols = ['VIP', 'CryoSleep', 'HasSpent', 'IsChild', 'IsAdult', 'IsSenior']

    # Numeric features: median imputation followed by standardisation.
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    # Categorical features: constant-fill missing values, then one-hot
    # encode; categories unseen at fit time are ignored at transform time.
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', Pipeline(numeric_steps), numerical_cols),
            ('cat', Pipeline(categorical_steps), categorical_cols),
            ('pass', 'passthrough', binary_cols),
        ]
    )

    def _build_xgb():
        # One XGBoost configuration shared by both branches below.
        # NOTE(review): use_label_encoder may be deprecated in the installed
        # XGBoost release - confirm before upgrading.
        return XGBClassifier(
            learning_rate=0.1,
            n_estimators=100,
            max_depth=7,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            use_label_encoder=False,
            eval_metric='logloss',
        )

    if not use_voting:
        model = _build_xgb()
    else:
        members = [
            ('xgb', _build_xgb()),
            ('rf', RandomForestClassifier(
                n_estimators=100,
                max_depth=15,
                min_samples_split=5,
                min_samples_leaf=2,
                random_state=42,
                n_jobs=-1,
            )),
            ('lgbm', LGBMClassifier(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=7,
                random_state=42,
                n_jobs=-1,
            )),
        ]
        model = VotingClassifier(estimators=members, voting='soft')

    return Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model),
    ])

def optimize_hyperparameters(pipeline, X_train, y_train, n_iter=50, cv=5, random_state=42):
    """Tune the pipeline's classifier with a randomized search.

    The candidate search space covers both shapes produced by
    ``create_model_pipeline``: the voting ensemble (``classifier__xgb__*``,
    ``classifier__rf__*``, ``classifier__lgbm__*``) and a single estimator
    (flat ``classifier__*`` keys). Only entries that the given pipeline
    actually exposes through ``get_params()`` are searched, so neither
    shape triggers an invalid-parameter error.

    Parameters
    ----------
    pipeline : estimator
        Pipeline whose final step is named ``classifier``.
    X_train, y_train
        Training features and labels passed to the search's ``fit``.
    n_iter : int, default 50
        Number of parameter settings sampled.
    cv : int, default 5
        Cross-validation setting forwarded to ``RandomizedSearchCV``.
    random_state : int, default 42
        Seed forwarded to ``RandomizedSearchCV``.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search.

    Raises
    ------
    ValueError
        If none of the candidate parameters exists on ``pipeline``.
    """
    # scipy's uniform(loc, scale) samples from [loc, loc + scale].
    candidate_space = {
        # Members of the voting ensemble.
        'classifier__xgb__learning_rate': uniform(0.01, 0.3),
        'classifier__xgb__n_estimators': randint(50, 300),
        'classifier__xgb__max_depth': randint(3, 10),
        'classifier__xgb__subsample': uniform(0.6, 0.4),
        'classifier__xgb__colsample_bytree': uniform(0.6, 0.4),

        'classifier__rf__n_estimators': randint(50, 300),
        'classifier__rf__max_depth': randint(5, 20),
        'classifier__rf__min_samples_split': randint(2, 10),

        'classifier__lgbm__learning_rate': uniform(0.01, 0.3),
        'classifier__lgbm__n_estimators': randint(50, 300),
        'classifier__lgbm__max_depth': randint(3, 10),

        # Single-estimator pipeline (e.g. use_voting=False); entries the
        # estimator does not expose are filtered out below.
        'classifier__learning_rate': uniform(0.01, 0.3),
        'classifier__n_estimators': randint(50, 300),
        'classifier__max_depth': randint(3, 10),
        'classifier__subsample': uniform(0.6, 0.4),
        'classifier__colsample_bytree': uniform(0.6, 0.4),
    }

    available = pipeline.get_params(deep=True)
    param_distributions = {
        name: dist for name, dist in candidate_space.items() if name in available
    }
    if not param_distributions:
        raise ValueError(
            "None of the known hyperparameters exist on the supplied pipeline; "
            "expected a final step named 'classifier'."
        )

    random_search = RandomizedSearchCV(
        pipeline,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=cv,
        scoring='accuracy',
        n_jobs=-1,
        random_state=random_state
    )

    random_search.fit(X_train, y_train)
    print(f"Best parameters: {random_search.best_params_}")
    print(f"Best cross-validation accuracy: {random_search.best_score_:.4f}")

    return random_search.best_estimator_

def create_submission(train_df, test_df, use_optimization=True):
    """Train the voting pipeline and predict ``Transported`` for the test set.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Raw training table; must contain ``Transported``.
    test_df : pandas.DataFrame
        Raw test table; must contain ``PassengerId``.
    use_optimization : bool, default True
        When True, run the randomized hyperparameter search on an 80%
        split, report accuracy on the held-out 20%, and print 5-fold
        cross-validation scores before the final fit.

    Returns
    -------
    tuple
        ``(submission_df, val_accuracy)``. ``submission_df`` has columns
        ``PassengerId`` and ``Transported`` (bool). ``val_accuracy`` is the
        held-out accuracy, or ``None`` when ``use_optimization`` is False.
    """
    # Feature engineering is applied to each table independently.
    feature_builder = SpaceshipPreprocessor()
    train_features = feature_builder.create_features(train_df)
    test_features = feature_builder.create_features(test_df)

    # Keep the identifiers aside; they are not model inputs.
    test_passenger_ids = test_features['PassengerId']

    y = train_features['Transported'].astype(int)
    X = train_features.drop(['PassengerId', 'Transported'], axis=1, errors='ignore')
    X_test = test_features.drop(['PassengerId'], axis=1, errors='ignore')

    pipeline = create_model_pipeline(use_voting=True)
    val_accuracy = None

    if use_optimization:
        # Hold out 20% of the rows so the tuned model is scored on data
        # the search did not see.
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        pipeline = optimize_hyperparameters(pipeline, X_train, y_train)

        val_accuracy = accuracy_score(y_val, pipeline.predict(X_val))
        print(f"\nValidation Accuracy: {val_accuracy:.4f}")

        cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
        print(f"\nCross-validation scores: {cv_scores}")
        print(f"Mean CV score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    # Final fit uses every labelled row before predicting the test set.
    pipeline.fit(X, y)
    test_predictions = pipeline.predict(X_test)

    submission_df = pd.DataFrame({
        'PassengerId': test_passenger_ids,
        'Transported': test_predictions.astype(bool),
    })

    return submission_df, val_accuracy

# Script entry point: load the CSVs, train, and write submission.csv.
if __name__ == "__main__":
    print("Loading data...")
    train_df, test_df = pd.read_csv('train.csv'), pd.read_csv('test.csv')

    print("\nCreating submission with optimized model...")
    submission_df, accuracy = create_submission(
        train_df,
        test_df,
        use_optimization=True,
    )

    print("\nSaving submission file...")
    submission_df.to_csv('submission.csv', index=False)

    # Summarise the class balance of the predictions.
    print("\nSubmission file info:")
    print(f"Total predictions: {len(submission_df)}")
    for flag in (True, False):
        share = submission_df['Transported'].eq(flag).mean()
        print(f"Percentage {flag}: {share:.2%}")