In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from scipy.stats import expon, reciprocal
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow import keras
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from tensorflow.keras.optimizers import Adam
import keras_tuner as kt
import deap
import skopt
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.models import save_model
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.regularizers import l1, l2, l1_l2
from sklearn.model_selection import ParameterGrid

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
import joblib
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

def preprocess_data(file_path, is_train=True):
    df = pd.read_csv(file_path)
    
    # Convert binary categorical features to 0 and 1
    binary_features = ['CryoSleep', 'VIP']
    df[binary_features] = df[binary_features].astype(bool).astype(int)
    
    # Feature Engineering
    df['TotalSpending'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
    
    # Conditionally set spending-related features to 0 for passengers in cryosleep
    spending_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    df.loc[df['CryoSleep'] == 1, spending_features] = 0
    
    # Create interaction features
    df['HomePlanet_TotalSpending'] = df['HomePlanet'].astype(str) + '_' + df['TotalSpending'].astype(str)
    df['Destination_TotalSpending'] = df['Destination'].astype(str) + '_' + df['TotalSpending'].astype(str)
    
    # Extract components from 'Cabin'
    if 'Cabin' in df.columns:
        df[['Cabin_Deck', 'Cabin_Number', 'Cabin_Side']] = df['Cabin'].str.split('/', expand=True)
        df['Cabin_Side'] = df['Cabin_Side'].map({'P': 1, 'S': 0})
        df['Cabin_Number'] = pd.to_numeric(df['Cabin_Number'], errors='coerce')
        df.drop('Cabin', axis=1, inplace=True)
    
    # One-hot encode multi-category features
    multi_cat_features = ['HomePlanet', 'Destination']
    if is_train:
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
        encoded_features = one_hot_encoder.fit_transform(df[multi_cat_features])
        encoded_feature_names = one_hot_encoder.get_feature_names_out(multi_cat_features)
        joblib.dump(one_hot_encoder, 'one_hot_encoder.pkl')
    else:
        one_hot_encoder = joblib.load('one_hot_encoder.pkl')
        encoded_features = one_hot_encoder.transform(df[multi_cat_features])
        encoded_feature_names = one_hot_encoder.get_feature_names_out(multi_cat_features)
    
    encoded_features_df = pd.DataFrame(encoded_features.toarray(), columns=encoded_feature_names)
    df = pd.concat([df, encoded_features_df], axis=1)
    df.drop(multi_cat_features, axis=1, inplace=True)
    
    # Imputation and Scaling
    numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_Number', 'Cabin_Side', 'TotalSpending']
    if is_train:
        imputer = KNNImputer(n_neighbors=5)
        scaler = RobustScaler()  # Use RobustScaler instead of StandardScaler
        df[numeric_features] = imputer.fit_transform(df[numeric_features])
        df[numeric_features] = scaler.fit_transform(df[numeric_features])
        joblib.dump(imputer, 'imputer.pkl')
        joblib.dump(scaler, 'scaler.pkl')
    else:
        imputer = joblib.load('imputer.pkl')
        scaler = joblib.load('scaler.pkl')
        df[numeric_features] = imputer.transform(df[numeric_features])
        df[numeric_features] = scaler.transform(df[numeric_features])
    

    if is_train:
        # Convert 'Transported' to integer (True=1, False=0) for modeling
        df['Transported'] = df['Transported'].astype(int)
        
        # Save the list of features used for training
        train_features = [col for col in df.columns if col not in ['PassengerId', 'Name', 'Transported', 'Cabin_Deck', 'HomePlanet_TotalSpending', 'Destination_TotalSpending']]
        joblib.dump(train_features, 'train_features.pkl')
    
    return df

In [13]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb

def objective(trial):
    # Define the hyperparameter search space
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'random_state': 42
    }
    
    # Create LightGBM datasets
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_val, label=y_val)
    
    # Train the LightGBM model
    model = lgb.train(params, train_data, num_boost_round=1000, valid_sets=[valid_data], 
                      callbacks=[lgb.early_stopping(50)])
    
    # Make predictions on the validation set
    y_pred = model.predict(X_val)
    y_pred = (y_pred > 0.5).astype(int)
    
    # Calculate the validation accuracy
    val_acc = accuracy_score(y_val, y_pred)
    
    return val_acc

# Prepare the data
train_df = preprocess_data('csv_files/train.csv', is_train=True)
test_df = preprocess_data('csv_files/test.csv', is_train=False)

features = [col for col in train_df.columns if col not in ['PassengerId', 'Name', 'Transported', 'Cabin_Deck', 'HomePlanet_TotalSpending', 'Destination_TotalSpending']]
X = train_df[features]
y = train_df['Transported']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an Optuna study
study = optuna.create_study(direction='maximize')

# Optimize the hyperparameters
study.optimize(objective, n_trials=10, timeout=None, n_jobs=-1, gc_after_trial=True, show_progress_bar=True)

# Get the best hyperparameters and performance
best_params = study.best_params
best_val_acc = study.best_value

print(f"Best Validation Accuracy: {best_val_acc}")
print(f"Best Hyperparameters: {best_params}")
# Train the final model with the best hyperparameters
best_model = lgb.train(best_params, lgb.Dataset(X, label=y))
# Make predictions on the test set
test_features = [col for col in test_df.columns if col not in ['PassengerId', 'Name', 'Cabin_Deck', 'HomePlanet_TotalSpending', 'Destination_TotalSpending']]
X_test = test_df[test_features]
y_pred = best_model.predict(X_test)
y_pred = y_pred > 0.5

# Create the submission DataFrame
submission_df = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Transported': y_pred})
submission_df.to_csv('submissionLGB.csv', index=False)

[I 2024-03-22 14:02:49,422] A new study created in memory with name: no-name-954c394d-021d-4b60-bb8b-705f22c111cf
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1.0),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1.0),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1.0),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.

[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003813 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003802 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=tru

                                      

Early stopping, best iteration is:
[35]	valid_0's binary_logloss: 0.41531
[I 2024-03-22 14:02:51,318] Trial 6 finished with value: 0.7924094307073031 and parameters: {'num_leaves': 153, 'max_depth': 8, 'learning_rate': 0.3265078937160492, 'min_child_samples': 87, 'subsample': 0.7970405755975023, 'colsample_bytree': 0.5270906755664913, 'reg_alpha': 0.03354211441886867, 'reg_lambda': 0.1546757922128888}. Best is trial 6 with value: 0.7924094307073031.


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1.0),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),


[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002155 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1916
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230
Training until validation scores don't improve for 50 rounds


Best trial: 6. Best value: 0.792409:  10%|█         | 1/10 [00:02<00:18,  2.09s/it]

Early stopping, best iteration is:
[71]	valid_0's binary_logloss: 0.413099
[I 2024-03-22 14:02:51,915] Trial 7 finished with value: 0.7981598619896493 and parameters: {'num_leaves': 99, 'max_depth': 6, 'learning_rate': 0.12435032003080158, 'min_child_samples': 7, 'subsample': 0.5813680367747637, 'colsample_bytree': 0.9979490776465224, 'reg_alpha': 0.003281118313074587, 'reg_lambda': 1.8390549847947741}. Best is trial 7 with value: 0.7981598619896493.


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1.0),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),


[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002933 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1916
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230
Training until validation scores don't improve for 50 rounds


Best trial: 7. Best value: 0.79816:  20%|██        | 2/10 [00:03<00:09,  1.15s/it]

Early stopping, best iteration is:
[588]	valid_0's binary_logloss: 0.411717
[I 2024-03-22 14:02:52,924] Trial 1 finished with value: 0.7981598619896493 and parameters: {'num_leaves': 35, 'max_depth': 3, 'learning_rate': 0.052890140818536144, 'min_child_samples': 18, 'subsample': 0.9595469469584578, 'colsample_bytree': 0.6195456251263739, 'reg_alpha': 0.002003127721179121, 'reg_lambda': 2.439372209603154}. Best is trial 7 with value: 0.7981598619896493.


Best trial: 7. Best value: 0.79816:  30%|███       | 3/10 [00:03<00:07,  1.09s/it]

Early stopping, best iteration is:
[295]	valid_0's binary_logloss: 0.415008


Best trial: 7. Best value: 0.79816:  40%|████      | 4/10 [00:12<00:25,  4.30s/it]

[I 2024-03-22 14:03:02,096] Trial 2 finished with value: 0.7958596894767107 and parameters: {'num_leaves': 188, 'max_depth': 11, 'learning_rate': 0.016371353080655845, 'min_child_samples': 63, 'subsample': 0.6638918927624473, 'colsample_bytree': 0.8573235558938428, 'reg_alpha': 0.002354115655425167, 'reg_lambda': 0.7463105759837583}. Best is trial 7 with value: 0.7981598619896493.


Best trial: 7. Best value: 0.79816:  40%|████      | 4/10 [00:16<00:25,  4.30s/it]

Early stopping, best iteration is:
[472]	valid_0's binary_logloss: 0.413787
[I 2024-03-22 14:03:06,289] Trial 9 finished with value: 0.79700977573318 and parameters: {'num_leaves': 82, 'max_depth': 12, 'learning_rate': 0.012793529680951984, 'min_child_samples': 89, 'subsample': 0.6505794475282356, 'colsample_bytree': 0.941995774278322, 'reg_alpha': 1.8988959301009314, 'reg_lambda': 0.03664162881772791}. Best is trial 7 with value: 0.7981598619896493.


Best trial: 7. Best value: 0.79816:  50%|█████     | 5/10 [00:16<00:21,  4.24s/it]

Early stopping, best iteration is:
[834]	valid_0's binary_logloss: 0.413719


Best trial: 7. Best value: 0.79816:  60%|██████    | 6/10 [00:17<00:12,  3.04s/it]

[I 2024-03-22 14:03:07,010] Trial 8 finished with value: 0.7906843013225991 and parameters: {'num_leaves': 96, 'max_depth': 6, 'learning_rate': 0.012384954501079229, 'min_child_samples': 17, 'subsample': 0.6634289478998068, 'colsample_bytree': 0.7120009766433112, 'reg_alpha': 0.027680756605919652, 'reg_lambda': 3.4337591810866543}. Best is trial 7 with value: 0.7981598619896493.
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.414609


Best trial: 7. Best value: 0.79816:  70%|███████   | 7/10 [00:20<00:08,  2.99s/it]

[I 2024-03-22 14:03:09,884] Trial 3 finished with value: 0.78953421506613 and parameters: {'num_leaves': 121, 'max_depth': 7, 'learning_rate': 0.005367507402949539, 'min_child_samples': 52, 'subsample': 0.8550090525146845, 'colsample_bytree': 0.8406009620529085, 'reg_alpha': 0.7146448093780772, 'reg_lambda': 0.011104649538810397}. Best is trial 7 with value: 0.7981598619896493.


Best trial: 7. Best value: 0.79816:  70%|███████   | 7/10 [00:21<00:08,  2.99s/it]

Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.431258
[I 2024-03-22 14:03:11,258] Trial 5 finished with value: 0.7843588269120184 and parameters: {'num_leaves': 76, 'max_depth': 9, 'learning_rate': 0.002801050497456012, 'min_child_samples': 95, 'subsample': 0.6598951094703764, 'colsample_bytree': 0.6078870853822859, 'reg_alpha': 0.17758784661012852, 'reg_lambda': 0.4709544622169498}. Best is trial 7 with value: 0.7981598619896493.


Best trial: 7. Best value: 0.79816:  80%|████████  | 8/10 [00:21<00:04,  2.47s/it]



Best trial: 7. Best value: 0.79816:  90%|█████████ | 9/10 [00:22<00:01,  1.89s/it]

Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.485455
[I 2024-03-22 14:03:11,855] Trial 0 finished with value: 0.7814836112708453 and parameters: {'num_leaves': 56, 'max_depth': 7, 'learning_rate': 0.001340112365726168, 'min_child_samples': 2, 'subsample': 0.5561110365568256, 'colsample_bytree': 0.7402118143736226, 'reg_alpha': 3.1401754577323024, 'reg_lambda': 4.06164570791949}. Best is trial 7 with value: 0.7981598619896493.


Best trial: 7. Best value: 0.79816:  90%|█████████ | 9/10 [00:23<00:01,  1.89s/it]

Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.47392
[I 2024-03-22 14:03:12,930] Trial 4 finished with value: 0.7883841288096607 and parameters: {'num_leaves': 181, 'max_depth': 12, 'learning_rate': 0.0013505681582659716, 'min_child_samples': 67, 'subsample': 0.8887687221354315, 'colsample_bytree': 0.6452306679847344, 'reg_alpha': 0.011196050365055738, 'reg_lambda': 0.8660164755772313}. Best is trial 7 with value: 0.7981598619896493.


Best trial: 7. Best value: 0.79816: 100%|██████████| 10/10 [00:23<00:00,  2.36s/it]


Best Validation Accuracy: 0.7981598619896493
Best Hyperparameters: {'num_leaves': 99, 'max_depth': 6, 'learning_rate': 0.12435032003080158, 'min_child_samples': 7, 'subsample': 0.5813680367747637, 'colsample_bytree': 0.9979490776465224, 'reg_alpha': 0.003281118313074587, 'reg_lambda': 1.8390549847947741}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000580 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1920
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 19
[LightGBM] [Info] Start training from score 0.503624
