In [80]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib

def preprocess_data(file_path, is_train=True):
    df = pd.read_csv(file_path)

    # Convert binary categorical features to 0 and 1
    binary_features = ['CryoSleep', 'VIP']
    df[binary_features] = df[binary_features].astype(bool).astype(int)

    # Feature Engineering
    df['TotalSpending'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)

    # Conditionally set spending-related features to 0 for passengers in cryosleep
    spending_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    df.loc[df['CryoSleep'] == 1, spending_features] = 0

    # Create interaction features
    df['HomePlanet_TotalSpending'] = df['HomePlanet'].astype(str) + '_' + df['TotalSpending'].astype(str)
    df['Destination_TotalSpending'] = df['Destination'].astype(str) + '_' + df['TotalSpending'].astype(str)

    # Extract components from 'Cabin'
    if 'Cabin' in df.columns:
        df[['Cabin_Deck', 'Cabin_Number', 'Cabin_Side']] = df['Cabin'].str.split('/', expand=True)
        df['Cabin_Side'] = df['Cabin_Side'].map({'P': 1, 'S': 0})
        df['Cabin_Number'] = pd.to_numeric(df['Cabin_Number'], errors='coerce')
        df.drop('Cabin', axis=1, inplace=True)

    # One-hot encode multi-category features
    multi_cat_features = ['HomePlanet', 'Destination']
    if is_train:
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
        encoded_features = one_hot_encoder.fit_transform(df[multi_cat_features])
        encoded_feature_names = one_hot_encoder.get_feature_names_out(multi_cat_features)
        joblib.dump(one_hot_encoder, 'one_hot_encoder.pkl')
    else:
        one_hot_encoder = joblib.load('one_hot_encoder.pkl')
        encoded_features = one_hot_encoder.transform(df[multi_cat_features])
        encoded_feature_names = one_hot_encoder.get_feature_names_out(multi_cat_features)

    encoded_features_df = pd.DataFrame(encoded_features.toarray(), columns=encoded_feature_names)
    df = pd.concat([df, encoded_features_df], axis=1)
    df.drop(multi_cat_features, axis=1, inplace=True)

    # Imputation and Scaling
    numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_Number', 'Cabin_Side', 'TotalSpending']
    if is_train:
        imputer = KNNImputer(n_neighbors=5)
        scaler = StandardScaler()
        df[numeric_features] = imputer.fit_transform(df[numeric_features])
        df[numeric_features] = scaler.fit_transform(df[numeric_features])
        joblib.dump(imputer, 'imputer.pkl')
        joblib.dump(scaler, 'scaler.pkl')
    else:
        imputer = joblib.load('imputer.pkl')
        scaler = joblib.load('scaler.pkl')
        df[numeric_features] = imputer.transform(df[numeric_features])
        df[numeric_features] = scaler.transform(df[numeric_features])

    if is_train:
        # Convert 'Transported' to integer (True=1, False=0) for modeling
        df['Transported'] = df['Transported'].astype(int)

    return df

In [81]:
train_df = preprocess_data('csv_files/train.csv', is_train=True)
test_df = preprocess_data('csv_files/test.csv', is_train=False)

In [82]:
features = [col for col in train_df.columns if col not in ['PassengerId', 'Name', 'Transported', 'Cabin_Deck', 'HomePlanet_TotalSpending', 'Destination_TotalSpending']]
X = train_df[features]
y = train_df['Transported']

In [83]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [84]:
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier

rfe = RFE(estimator=XGBClassifier(n_estimators=100, random_state=42), n_features_to_select=11)
X_selected = rfe.fit_transform(X_resampled, y_resampled)
selected_features = X.columns[rfe.support_]

In [85]:
from skopt.space import Real, Integer

# Modify the parameter space
param_space = [
    Integer(2, 6, name='max_depth'),  # Reduced the maximum depth
    Real(0.01, 0.1, prior='log-uniform', name='learning_rate'),
    Integer(50, 200, name='n_estimators'),
    Real(0.5, 1.0, prior='uniform', name='subsample'),
    Real(0.5, 1.0, prior='uniform', name='colsample_bytree'),
    Real(0, 5, prior='uniform', name='gamma'),  # Reduced the range of gamma
    Integer(1, 5, name='min_child_weight')  # Reduced the range of min_child_weight
]

In [86]:
from sklearn.model_selection import StratifiedKFold
from skopt.utils import use_named_args
import numpy as np

@use_named_args(param_space)
def objective(**params):
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    
    for train_index, val_index in kfold.split(X_selected, y_resampled):
        X_train, X_val = X_selected[train_index], X_selected[val_index]
        y_train, y_val = y_resampled[train_index], y_resampled[val_index]
        
        model = XGBClassifier(**params, random_state=42, reg_alpha=0.1, reg_lambda=1)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10, verbose=False)
        
        score = model.score(X_val, y_val)
        scores.append(score)
    
    return -np.mean(scores)


In [87]:
from skopt import gp_minimize

result = gp_minimize(objective, dimensions=param_space, n_calls=500, random_state=42)
best_params = dict(zip([dim.name for dim in param_space], result.x))



In [None]:
best_model = XGBClassifier(**best_params, random_state=42)
best_model.fit(X_selected, y_resampled)

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(best_model, X_selected, y_resampled, cv=5, scoring='accuracy')
print(f"Cross-validation scores: {scores}")
print(f"Mean accuracy: {np.mean(scores)}")

Cross-validation scores: [0.74486301 0.72244432 0.7961165  0.81096516 0.76185037]
Mean accuracy: 0.767247874013284


In [None]:
test_features = test_df[selected_features]
predictions = best_model.predict(test_features)

In [None]:
submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Transported': predictions})
submission['Transported'] = submission['Transported'].astype(bool)
submission.to_csv('submissionXGB500.csv', index=False)