In [149]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib

def preprocess_data(file_path, is_train=True):
    df = pd.read_csv(file_path)

    # Convert binary categorical features to 0 and 1
    binary_features = ['CryoSleep', 'VIP']
    df[binary_features] = df[binary_features].astype(bool).astype(int)

    # Feature Engineering
    df['TotalSpending'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)

    # Conditionally set spending-related features to 0 for passengers in cryosleep
    spending_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    df.loc[df['CryoSleep'] == 1, spending_features] = 0

    # Create interaction features
    df['HomePlanet_TotalSpending'] = df['HomePlanet'].astype(str) + '_' + df['TotalSpending'].astype(str)
    df['Destination_TotalSpending'] = df['Destination'].astype(str) + '_' + df['TotalSpending'].astype(str)

    # Extract components from 'Cabin'
    if 'Cabin' in df.columns:
        df[['Cabin_Deck', 'Cabin_Number', 'Cabin_Side']] = df['Cabin'].str.split('/', expand=True)
        df['Cabin_Side'] = df['Cabin_Side'].map({'P': 1, 'S': 0})
        df['Cabin_Number'] = pd.to_numeric(df['Cabin_Number'], errors='coerce')
        df.drop('Cabin', axis=1, inplace=True)

    # One-hot encode multi-category features
    multi_cat_features = ['HomePlanet', 'Destination']
    if is_train:
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
        encoded_features = one_hot_encoder.fit_transform(df[multi_cat_features])
        encoded_feature_names = one_hot_encoder.get_feature_names_out(multi_cat_features)
        joblib.dump(one_hot_encoder, 'one_hot_encoder.pkl')
    else:
        one_hot_encoder = joblib.load('one_hot_encoder.pkl')
        encoded_features = one_hot_encoder.transform(df[multi_cat_features])
        encoded_feature_names = one_hot_encoder.get_feature_names_out(multi_cat_features)

    encoded_features_df = pd.DataFrame(encoded_features.toarray(), columns=encoded_feature_names)
    df = pd.concat([df, encoded_features_df], axis=1)
    df.drop(multi_cat_features, axis=1, inplace=True)

    # Imputation and Scaling
    numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_Number', 'Cabin_Side', 'TotalSpending']
    if is_train:
        imputer = KNNImputer(n_neighbors=5)
        scaler = StandardScaler()
        df[numeric_features] = imputer.fit_transform(df[numeric_features])
        df[numeric_features] = scaler.fit_transform(df[numeric_features])
        joblib.dump(imputer, 'imputer.pkl')
        joblib.dump(scaler, 'scaler.pkl')
    else:
        imputer = joblib.load('imputer.pkl')
        scaler = joblib.load('scaler.pkl')
        df[numeric_features] = imputer.transform(df[numeric_features])
        df[numeric_features] = scaler.transform(df[numeric_features])

    if is_train:
        # Convert 'Transported' to integer (True=1, False=0) for modeling
        df['Transported'] = df['Transported'].astype(int)

    return df

In [150]:
train_df = preprocess_data('csv_files/train.csv', is_train=True)
test_df = preprocess_data('csv_files/test.csv', is_train=False)

In [151]:
features = [col for col in train_df.columns if col not in ['PassengerId', 'Name', 'Transported', 'Cabin_Deck', 'HomePlanet_TotalSpending', 'Destination_TotalSpending']]
X = train_df[features]
y = train_df['Transported']

In [152]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [153]:
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier

rfe = RFE(estimator=XGBClassifier(n_estimators=100, random_state=42), n_features_to_select=11)
X_selected = rfe.fit_transform(X_resampled, y_resampled)
selected_features = X.columns[rfe.support_]

In [154]:
from skopt.space import Real, Integer

# Modify the parameter space
param_space = [
    Integer(2, 6, name='max_depth'),  # Reduced the maximum depth
    Real(0.01, 0.1, prior='log-uniform', name='learning_rate'),
    Integer(50, 200, name='n_estimators'),
    Real(0.5, 1.0, prior='uniform', name='subsample'),
    Real(0.5, 1.0, prior='uniform', name='colsample_bytree'),
    Real(0, 5, prior='uniform', name='gamma'),  # Reduced the range of gamma
    Integer(1, 5, name='min_child_weight')  # Reduced the range of min_child_weight
]

In [155]:
from sklearn.model_selection import StratifiedKFold
from skopt.utils import use_named_args
import numpy as np

@use_named_args(param_space)
def objective(**params):
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    
    for train_index, val_index in kfold.split(X_selected, y_resampled):
        X_train, X_val = X_selected[train_index], X_selected[val_index]
        y_train, y_val = y_resampled[train_index], y_resampled[val_index]
        
        model = XGBClassifier(**params, random_state=42, early_stopping_rounds=10, eval_metric='error')
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10, verbose=False)
        
        score = model.score(X_val, y_val)
        scores.append(score)
    
    return -np.mean(scores)


In [156]:
from skopt import gp_minimize

result = gp_minimize(objective, dimensions=param_space, n_calls=10, random_state=42)
best_params = dict(zip([dim.name for dim in param_space], result.x))

ValueError: 2 different `early_stopping_rounds` are provided.  Use the one in constructor or `set_params` instead.

In [None]:
best_model = XGBClassifier(**best_params, random_state=42)
best_model.fit(X_selected, y_resampled)

In [159]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

def objective(trial):
    # Define the hyperparameters to tune
    params = {
        'max_depth': trial.suggest_int('max_depth', 2, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 8),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
    }
    
    # Create the XGBoost classifier with the sampled hyperparameters
    model = XGBClassifier(**params, random_state=42, early_stopping_rounds=10, eval_metric='error')
    
    # Perform k-fold cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []
    
    for train_index, val_index in cv.split(X_selected, y_resampled):
        X_train, X_val = X_selected[train_index], X_selected[val_index]
        y_train, y_val = y_resampled[train_index], y_resampled[val_index]
        
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)
        y_pred = model.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred)
        accuracies.append(accuracy)
    
    return np.mean(accuracies)

# Create an Optuna study
study = optuna.create_study(direction='maximize')

# Optimize the hyperparameters
study.optimize(objective, n_trials=500)

# Print the best hyperparameters and best score
print("Best hyperparameters:", study.best_params)
print("Best accuracy score:", study.best_value)

# Train the final model with the best hyperparameters
best_params = study.best_params
best_model = XGBClassifier(**best_params, random_state=42)
best_model.fit(X_selected, y_resampled)
test_features = test_df[selected_features]
# Make predictions on the test set
test_predictions = best_model.predict(test_features)

# Prepare the submission file
submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Transported': test_predictions})
submission['Transported'] = submission['Transported'].astype(bool)
submission.to_csv('submission.csv', index=False)

[I 2024-03-25 23:57:43,522] A new study created in memory with name: no-name-0bd00bd0-83e8-4bcf-b2a2-8cafac2aa6d1
[I 2024-03-25 23:57:43,703] Trial 0 finished with value: 0.7931703736155986 and parameters: {'max_depth': 7, 'learning_rate': 0.04625925169311859, 'n_estimators': 99, 'subsample': 0.8315094447993434, 'colsample_bytree': 0.961949633377021, 'gamma': 2.4790984671052803, 'min_child_weight': 2, 'reg_alpha': 0.7298626050043064, 'reg_lambda': 0.07742347297934027}. Best is trial 0 with value: 0.7931703736155986.
[I 2024-03-25 23:57:43,874] Trial 1 finished with value: 0.7955681554441167 and parameters: {'max_depth': 7, 'learning_rate': 0.09711864563217834, 'n_estimators': 50, 'subsample': 0.9459985619133829, 'colsample_bytree': 0.9433630875569725, 'gamma': 3.9059706301349117, 'min_child_weight': 7, 'reg_alpha': 0.8669122470136242, 'reg_lambda': 0.0059830120974212475}. Best is trial 1 with value: 0.7955681554441167.
[I 2024-03-25 23:57:43,939] Trial 2 finished with value: 0.76016242

Best hyperparameters: {'max_depth': 8, 'learning_rate': 0.09587111045798082, 'n_estimators': 99, 'subsample': 0.6880233814374614, 'colsample_bytree': 0.8872143553906651, 'gamma': 1.5897817476832896, 'min_child_weight': 2, 'reg_alpha': 0.14078266266415462, 'reg_lambda': 0.9357618772979329}
Best accuracy score: 0.8036769595456217


In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(best_model, X_selected, y_resampled, cv=5, scoring='accuracy')
print(f"Cross-validation scores: {scores}")
print(f"Mean accuracy: {np.mean(scores)}")

Cross-validation scores: [0.76027397 0.76299258 0.79782981 0.81324957 0.78526556]
Mean accuracy: 0.7839222988038147


In [None]:
test_features = test_df[selected_features]
predictions = best_model.predict(test_features)

In [None]:
submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Transported': predictions})
submission['Transported'] = submission['Transported'].astype(bool)
submission.to_csv('submissionXGB500.csv', index=False)

In [160]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

# Assuming you have already preprocessed your data and have X_selected and y_resampled

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_selected, y_resampled, test_size=0.2, random_state=42)

# Initialize LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Train and evaluate models
models, predictions = clf.fit(X_train, X_val, y_train, y_val)

# Print the model performance
print(models)

 90%|████████▉ | 26/29 [00:06<00:00,  3.69it/s]

[LightGBM] [Info] Number of positive: 3495, number of negative: 3509
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000422 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 7004, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499001 -> initscore=-0.003998
[LightGBM] [Info] Start training from score -0.003998


100%|██████████| 29/29 [00:06<00:00,  4.49it/s]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LGBMClassifier                     0.80               0.80     0.80      0.80   
XGBClassifier                      0.80               0.80     0.80      0.80   
AdaBoostClassifier                 0.79               0.79     0.79      0.78   
SVC                                0.78               0.78     0.78      0.78   
RandomForestClassifier             0.78               0.78     0.78      0.78   
SGDClassifier                      0.78               0.78     0.78      0.78   
KNeighborsClassifier               0.78               0.78     0.78      0.78   
LogisticRegression                 0.78               0.78     0.78      0.78   
NuSVC                              0.77               0.77     0.77      0.77   
BaggingClassifier                  0.77               0.77     0.77      0.77   
LabelSpreading              


