In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from scipy.stats import expon, reciprocal
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow import keras
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from tensorflow.keras.optimizers import Adam
import keras_tuner as kt
import deap
import skopt
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.models import save_model
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.regularizers import l1, l2, l1_l2
from sklearn.model_selection import ParameterGrid

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
import joblib
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

def preprocess_data(file_path, is_train=True):
    df = pd.read_csv(file_path)
    
    # Convert binary categorical features to 0 and 1
    binary_features = ['CryoSleep', 'VIP']
    df[binary_features] = df[binary_features].astype(bool).astype(int)
    
    # Feature Engineering
    df['TotalSpending'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
    
    # Conditionally set spending-related features to 0 for passengers in cryosleep
    spending_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    df.loc[df['CryoSleep'] == 1, spending_features] = 0
    
    # Create interaction features
    df['HomePlanet_TotalSpending'] = df['HomePlanet'].astype(str) + '_' + df['TotalSpending'].astype(str)
    df['Destination_TotalSpending'] = df['Destination'].astype(str) + '_' + df['TotalSpending'].astype(str)
    
    # Extract components from 'Cabin'
    if 'Cabin' in df.columns:
        df[['Cabin_Deck', 'Cabin_Number', 'Cabin_Side']] = df['Cabin'].str.split('/', expand=True)
        df['Cabin_Side'] = df['Cabin_Side'].map({'P': 1, 'S': 0})
        df['Cabin_Number'] = pd.to_numeric(df['Cabin_Number'], errors='coerce')
        df.drop('Cabin', axis=1, inplace=True)
    
    # One-hot encode multi-category features
    multi_cat_features = ['HomePlanet', 'Destination']
    if is_train:
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
        encoded_features = one_hot_encoder.fit_transform(df[multi_cat_features])
        encoded_feature_names = one_hot_encoder.get_feature_names_out(multi_cat_features)
        joblib.dump(one_hot_encoder, 'one_hot_encoder.pkl')
    else:
        one_hot_encoder = joblib.load('one_hot_encoder.pkl')
        encoded_features = one_hot_encoder.transform(df[multi_cat_features])
        encoded_feature_names = one_hot_encoder.get_feature_names_out(multi_cat_features)
    
    encoded_features_df = pd.DataFrame(encoded_features.toarray(), columns=encoded_feature_names)
    df = pd.concat([df, encoded_features_df], axis=1)
    df.drop(multi_cat_features, axis=1, inplace=True)
    
    # Imputation and Scaling
    numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_Number', 'Cabin_Side', 'TotalSpending']
    if is_train:
        imputer = KNNImputer(n_neighbors=5)
        scaler = RobustScaler()  # Use RobustScaler instead of StandardScaler
        df[numeric_features] = imputer.fit_transform(df[numeric_features])
        df[numeric_features] = scaler.fit_transform(df[numeric_features])
        joblib.dump(imputer, 'imputer.pkl')
        joblib.dump(scaler, 'scaler.pkl')
    else:
        imputer = joblib.load('imputer.pkl')
        scaler = joblib.load('scaler.pkl')
        df[numeric_features] = imputer.transform(df[numeric_features])
        df[numeric_features] = scaler.transform(df[numeric_features])
    

    if is_train:
        # Convert 'Transported' to integer (True=1, False=0) for modeling
        df['Transported'] = df['Transported'].astype(int)
        
        # Save the list of features used for training
        train_features = [col for col in df.columns if col not in ['PassengerId', 'Name', 'Transported', 'Cabin_Deck', 'HomePlanet_TotalSpending', 'Destination_TotalSpending']]
        joblib.dump(train_features, 'train_features.pkl')
    
    return df

In [36]:
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers import Adam


class CustomPrintCallback(Callback):
    def __init__(self):
        super(CustomPrintCallback, self).__init__()
        self.best_acc = -np.inf
        self.best_loss = np.inf
        self.best_val_acc = -np.inf
        self.best_val_loss = np.inf

    def on_epoch_end(self, epoch, logs=None):
        current_loss = logs.get('loss')
        current_acc = logs.get('accuracy')
        current_val_loss = logs.get('val_loss')
        current_val_acc = logs.get('val_accuracy')

        # Check for improvements and update best metrics
        if current_acc > self.best_acc or current_val_acc > self.best_val_acc:
            self.best_acc = max(self.best_acc, current_acc)
            self.best_loss = min(self.best_loss, current_loss)
            self.best_val_acc = max(self.best_val_acc, current_val_acc)
            self.best_val_loss = min(self.best_val_loss, current_val_loss)

        # Print the best metrics so far
        print(f"After {epoch+1} epochs: Best Loss: {self.best_loss:.4f}, Best Accuracy: {self.best_acc:.4f}, "
                f"Best Val Loss: {self.best_val_loss:.4f}, Best Val Accuracy: {self.best_val_acc:.4f}")
def create_model(input_shape, layers, activation, dropout_rate, l1_reg, l2_reg, learning_rate):
    model = Sequential()
    for i, layer_size in enumerate(layers):
        if i == 0:
            model.add(Dense(layer_size, activation=activation, kernel_regularizer=l1_l2(l1=l1_reg, l2=l2_reg), input_shape=(input_shape,)))
        else:
            model.add(Dense(layer_size, activation=activation, kernel_regularizer=l1_l2(l1=l1_reg, l2=l2_reg)))
        model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

def train_and_evaluate(model, X_train, y_train, X_val, y_val, batch_size, epochs, class_weight, lr_schedule):
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    lr_scheduler = LearningRateScheduler(lr_schedule)
    
    custom_print_callback = CustomPrintCallback() 

    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, 
              validation_data=(X_val, y_val), class_weight=class_weight, 
              callbacks=[early_stopping, lr_scheduler], verbose=0)
    

    val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)
    return val_loss, val_acc

def lr_schedule(epoch):
    if epoch < 10:
        return 0.001
    elif epoch < 20:
        return 0.0005
    else:
        return 0.0001

In [40]:
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.regularizers import l1_l2
import numpy as np
from imblearn.over_sampling import SVMSMOTE

train_df = preprocess_data('csv_files/train.csv', is_train=True)
test_df = preprocess_data('csv_files/test.csv', is_train=False)

# Prepare features and target for the model
features = [col for col in train_df.columns if col not in ['PassengerId', 'Name', 'Transported', 'Cabin_Deck', 'HomePlanet_TotalSpending', 'Destination_TotalSpending']]
X = train_df[features]
y = train_df['Transported']

# Handling class imbalance
smote = SVMSMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Feature selection using RFE
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=10)
X_selected = rfe.fit_transform(X_resampled, y_resampled)
selected_features = X.columns[rfe.support_]

# Update the hyperparameter search space
dim_layers = Integer(low=2, high=6, name='layers')
dim_units = Integer(low=64, high=1024, name='units')
dim_activation = Categorical(categories=['relu', 'selu', 'elu'], name='activation')
dim_dropout = Real(low=0.2, high=0.6, prior='uniform', name='dropout_rate')
dim_l1_reg = Real(low=1e-6, high=1e-3, prior='log-uniform', name='l1_reg')
dim_l2_reg = Real(low=1e-6, high=1e-3, prior='log-uniform', name='l2_reg')
dim_learning_rate = Real(low=1e-5, high=1e-3, prior='log-uniform', name='learning_rate')
dimensions = [dim_layers, dim_units, dim_activation, dim_dropout, dim_l1_reg, dim_l2_reg, dim_learning_rate]

dimensions = [dim_layers, dim_units, dim_activation, dim_dropout, dim_l1_reg, dim_l2_reg, dim_learning_rate]
@use_named_args(dimensions=dimensions)
def fitness(layers, units, activation, dropout_rate, l1_reg, l2_reg, learning_rate):
    layers = [units] * layers
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    val_accuracies = []
    
    for train_index, val_index in kfold.split(X_selected, y_resampled):
        X_train, X_val = X_selected[train_index], X_selected[val_index]
        y_train, y_val = y_resampled[train_index], y_resampled[val_index]
        
        class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
        class_weight_dict = dict(enumerate(class_weights))
        
        model = create_model(input_shape=X_train.shape[1], layers=layers, activation=activation,
                             dropout_rate=dropout_rate, l1_reg=l1_reg, l2_reg=l2_reg, learning_rate=learning_rate)
        _, val_acc = train_and_evaluate(model, X_train, y_train, X_val, y_val, 
                                        batch_size=32, epochs=100, class_weight=class_weight_dict, lr_schedule=lr_schedule)
        val_accuracies.append(val_acc)
    
    return -np.mean(val_accuracies)

In [41]:
from skopt import gp_minimize
import os
import tensorflow as tf

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress most TensorFlow logs, except for errors
tf.get_logger().setLevel('ERROR')  # Suppress TensorFlow warnings

# Perform Bayesian Optimization
n_calls = 10
best_result = gp_minimize(func=fitness, dimensions=dimensions, n_calls=n_calls, 
                          random_state=42, verbose=True)

# Retrieve the best hyperparameters
best_params = {
    'layers': [best_result.x[1]] * best_result.x[0],
    'activation': best_result.x[2],
    'dropout_rate': best_result.x[3],
    'l1_reg': best_result.x[4],
    'l2_reg': best_result.x[5],
    'learning_rate': best_result.x[6]
}
best_val_acc = -best_result.fun

#Train with best params
model_enhanced = create_model(input_shape=X_selected.shape[1], **best_params)
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_resampled), y=y_resampled)
class_weight_dict = dict(enumerate(class_weights))
model_enhanced.fit(X_selected, y_resampled, batch_size=32, epochs=100, class_weight=class_weight_dict, callbacks=[LearningRateScheduler(lr_schedule)])


print(f"Best Validation Accuracy: {best_val_acc}")
print(f"Best Hyperparameters: {best_params}")

Iteration No: 1 started. Evaluating function at random point.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Iteration No: 1 ended. Evaluation done at random point.
Time taken: 270.7557
Function value obtained: -0.7904
Current minimum: -0.7904
Iteration No: 2 started. Evaluating function at random point.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Iteration No: 2 ended. Evaluation done at random point.
Time taken: 141.4288
Function value obtained: -0.7887
Current minimum: -0.7904
Iteration No: 3 started. Evaluating function at random point.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


KeyboardInterrupt: 


Run 1:


Not so advanced 
Best Validation Accuracy: 0.7998849749565125
Best Hyperparameters: {'layers': [379], 'activation': 'tanh', 'dropout_rate': 0.10031150633640573, 'learning_rate': 0.009647685075720105}


Run 2:


**Much more advanced**
Best Validation Accuracy: 0.8081
Best Hyperparameters: {'layers': [312, 312], 'activation': 'tanh', 'dropout_rate': 0.1, 'l1_reg': 5.778749032107547e-05, 'l2_reg': 1e-05, 'learning_rate': 0.01} what does this tell us? How can we incorperate these findings to enhancing performance?

Run 3:

**Honing In even More**
Best Validation Accuracy: 0.8092737674713135


Best Hyperparameters: {'layers': 5, 'units': 303, 'activation': 'swish', 'dropout_rate': 0.2193700315892974, 'l1_reg': 7.792297153882995e-06, 'l2_reg': 1.5847101210439079e-06, 'learning_rate': 0.002879047909793294}

Run 4:

Best Validation Accuracy: 0.8019639849662781

Best Hyperparameters: {'layers': [478], 'activation': 'relu', 'dropout_rate': 0.4243619745714856, 'l1_reg': 2.390809511930124e-05, 'l2_reg': 1.687015662272113e-05, 'learning_rate': 0.0023505874897344933}

In [None]:
from tensorflow.keras.models import load_model
import pandas as pd
import numpy as np
import joblib
print(f"Best Validation Accuracy: {best_val_acc}")
print(f"Best Hyperparameters: {best_params}")

# Preprocess the test data using the same steps as the training data
test_df = preprocess_data('csv_files/test.csv', is_train=False)

# Load the features used during training
train_features = joblib.load('train_features.pkl')

# Select the features used by the model, ensuring they are the same as those used in training
features = [col for col in train_features if col in test_df.columns]

# Ensure that X_test is a DataFrame with the correct numerical types
X_test = test_df[features].copy()

# Convert X_test to a NumPy array, which is the expected format for TensorFlow models
X_test_np = X_test.values

# Predict using the enhanced model
y_pred_test_proba = model_enhanced.predict(X_test_np)
y_pred_test = (y_pred_test_proba > 0.5).astype(int).flatten()

# Prepare the submission dataframe
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': y_pred_test
})

# Convert predictions back to boolean (True/False) if necessary
submission_df['Transported'] = submission_df['Transported'].astype(bool)

# Save the submission file
submission_df.to_csv('neuralnetbayesadvancedfocused.csv', index=False)

Best Validation Accuracy: 0.8092737674713135
Best Hyperparameters: {'layers': 5, 'units': 303, 'activation': 'swish', 'dropout_rate': 0.2193700315892974, 'l1_reg': 7.792297153882995e-06, 'l2_reg': 1.5847101210439079e-06, 'learning_rate': 0.002879047909793294}
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
