In [55]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from scipy.stats import expon, reciprocal
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow import keras
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from tensorflow.keras.optimizers import Adam
import keras_tuner as kt

In [56]:
# Load the dataset
train_df = pd.read_csv('csv_files/train.csv')

# Convert binary categorical features to 0 and 1
binary_features = ['CryoSleep', 'VIP']
train_df[binary_features] = train_df[binary_features].astype(bool).astype(int)

# Extract components from 'Cabin' and treat them as separate features
train_df[['Cabin_Deck', 'Cabin_Number', 'Cabin_Side']] = train_df['Cabin'].str.split('/', expand=True)
train_df['Cabin_Side'] = train_df['Cabin_Side'].map({'P': 1, 'S': 0})

# Explicitly convert 'Cabin_Number' to numeric, handling errors
train_df['Cabin_Number'] = pd.to_numeric(train_df['Cabin_Number'], errors='coerce')

# Drop the original 'Cabin' column as it's no longer needed
train_df.drop('Cabin', axis=1, inplace=True)

# One-hot encode multi-category features
multi_cat_features = ['HomePlanet', 'Destination']
one_hot_encoder = OneHotEncoder(drop='first')  # Adjusted here
encoded_features = one_hot_encoder.fit_transform(train_df[multi_cat_features])
encoded_feature_names = one_hot_encoder.get_feature_names_out(multi_cat_features)

# Convert to DataFrame and concatenate
encoded_features_df = pd.DataFrame(encoded_features.toarray(), columns=encoded_feature_names)

# Drop the original columns and concatenate the new one-hot encoded features
train_df.drop(multi_cat_features, axis=1, inplace=True)
train_df = pd.concat([train_df.reset_index(drop=True), encoded_features_df], axis=1)

# Convert 'Transported' to integer (True=1, False=0) for modeling
train_df['Transported'] = train_df['Transported'].astype(int)

# Define numeric features - ensure this list only contains numeric features that were not one-hot encoded
numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_Number', 'Cabin_Side']
# Imputer for numeric features
imputer = SimpleImputer(strategy='median')
train_df[numeric_features] = imputer.fit_transform(train_df[numeric_features])

# Normalize the numeric features
scaler = StandardScaler()
train_df[numeric_features] = scaler.fit_transform(train_df[numeric_features])

# Prepare features and target for the model
features = [col for col in train_df.columns if col not in ['PassengerId', 'Name', 'Transported', 'Cabin_Deck' ]]  # Exclude non-features
X = train_df[features]
y = train_df['Transported']

# Splitting the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Building the model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val), verbose=1)

# Evaluate on the validation set
val_loss, val_acc = model.evaluate(X_val, y_val, verbose=1)
print(f'Validation Accuracy: {val_acc}')


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 639us/step - accuracy: 0.6721 - loss: 0.5871 - val_accuracy: 0.7792 - val_loss: 0.4627
Epoch 2/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 391us/step - accuracy: 0.7919 - loss: 0.4507 - val_accuracy: 0.7792 - val_loss: 0.4502
Epoch 3/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 384us/step - accuracy: 0.7955 - loss: 0.4433 - val_accuracy: 0.7809 - val_loss: 0.4469
Epoch 4/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 406us/step - accuracy: 0.7963 - loss: 0.4408 - val_accuracy: 0.7803 - val_loss: 0.4408
Epoch 5/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 408us/step - accuracy: 0.7969 - loss: 0.4314 - val_accuracy: 0.7832 - val_loss: 0.4438
Epoch 6/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 408us/step - accuracy: 0.8023 - loss: 0.4138 - val_accuracy: 0.7803 - val_loss: 0.4355
Epoch 7/50
[1m218/218[0m 

#### Enhanced Feature Engineering and Model Enhancement Strategies

In [72]:
# Create a copy of the original DataFrame to maintain separation
enhanced_train_df = train_df.copy()

# Generate Interaction Features based on SVM analysis
# Focus on the best interaction pairs identified
enhanced_train_df['RoomService_FoodCourt_Interaction'] = enhanced_train_df['RoomService'] * enhanced_train_df['FoodCourt']
enhanced_train_df['FoodCourt_ShoppingMall_Interaction'] = enhanced_train_df['FoodCourt'] * enhanced_train_df['ShoppingMall']
enhanced_train_df['RoomService_Spa_Interaction'] = enhanced_train_df['RoomService'] * enhanced_train_df['Spa']

# Update the list of numeric features to include these new interaction features
enhanced_numeric_features = numeric_features.copy() + ['RoomService_FoodCourt_Interaction', 'FoodCourt_ShoppingMall_Interaction', 'RoomService_Spa_Interaction']

# Create new instances for imputation and scaling specifically for the enhanced dataset
enhanced_imputer = SimpleImputer(strategy='median')
enhanced_scaler = StandardScaler()

# Apply imputation and scaling to the numeric features of the enhanced dataset
enhanced_train_df[enhanced_numeric_features] = enhanced_imputer.fit_transform(enhanced_train_df[enhanced_numeric_features])
enhanced_train_df[enhanced_numeric_features] = enhanced_scaler.fit_transform(enhanced_train_df[enhanced_numeric_features])

# Continue with model preparation steps for the enhanced dataset
# Make sure to include the new interaction features in your feature list
X_enhanced = enhanced_train_df[features + ['RoomService_FoodCourt_Interaction', 'FoodCourt_ShoppingMall_Interaction', 'RoomService_Spa_Interaction']]
y_enhanced = enhanced_train_df['Transported']


# Split the dataset for the enhanced model
X_train_enhanced, X_val_enhanced, y_train_enhanced, y_val_enhanced = train_test_split(
    X_enhanced, y_enhanced, test_size=0.2, random_state=42
)
#SMOTE 
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_enhanced, y_train_enhanced)
# Define the Enhanced Model Architecture
model_enhanced = Sequential(name='EnhancedTitanicModel')
model_enhanced.add(Dense(256, activation='relu', input_shape=(X_train_enhanced.shape[1],)))
model_enhanced.add(Dropout(0.3))
model_enhanced.add(Dense(128, activation='relu'))
model_enhanced.add(Dropout(0.3))
model_enhanced.add(Dense(64, activation='relu'))
model_enhanced.add(Dense(1, activation='sigmoid'))

# Compile the Enhanced Model
model_enhanced.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])
# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_enhanced),
    y=y_enhanced)

class_weight_dict = dict(enumerate(class_weights))

# Train the Enhanced Model
history_enhanced = model_enhanced.fit(
    X_train_smote, 
    y_train_smote, 
    epochs=100, 
    batch_size=16, 
    validation_data=(X_val_enhanced, y_val_enhanced), 
    class_weight=class_weight_dict,
    callbacks=[EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)], 
    verbose=1
)

# Evaluate the Enhanced Model
val_loss_enhanced, val_acc_enhanced = model_enhanced.evaluate(X_val_enhanced, y_val_enhanced, verbose=1)
print(f'Enhanced Model Validation Accuracy: {val_acc_enhanced}')

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 844us/step - accuracy: 0.7202 - loss: 0.5372 - val_accuracy: 0.7803 - val_loss: 0.4476
Epoch 2/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 618us/step - accuracy: 0.7908 - loss: 0.4547 - val_accuracy: 0.7867 - val_loss: 0.4374
Epoch 3/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 802us/step - accuracy: 0.8018 - loss: 0.4206 - val_accuracy: 0.7838 - val_loss: 0.4352
Epoch 4/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 672us/step - accuracy: 0.8108 - loss: 0.4137 - val_accuracy: 0.7855 - val_loss: 0.4302
Epoch 5/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 567us/step - accuracy: 0.8072 - loss: 0.4038 - val_accuracy: 0.7867 - val_loss: 0.4325
Epoch 6/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 708us/step - accuracy: 0.8159 - loss: 0.4040 - val_accuracy: 0.7838 - val_loss: 0.4321
Epoch 7/100
[1m438/43

Notes: 
- Class Weight Optimisation decreased accuracy from 0.7958 to 0.7866
- SMOTE decreased it to 0.791 , next try both
- Using both Increased accuracy to 0.795859


In [78]:
import keras_tuner as kt

def build_model(hp):
    model = Sequential(name='EnhancedTitanicModel')
    model.add(Dense(units=hp.Int('units', min_value=32, max_value=512, step=32),
                    activation='relu',
                    input_shape=(X_train_smote.shape[1],)))
    model.add(Dropout(rate=hp.Float('dropout', min_value=0.0, max_value=0.5, step=0.1)))
    model.add(Dense(units=hp.Int('units', min_value=32, max_value=512, step=32), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=Adam(learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=20,  # Number of trials to run
    executions_per_trial=1,  # Runs per trial to average over
    directory='keras_tuner_dir',
    project_name='titanic_model_optimization'
)

# Start the search
tuner.search(X_train_smote, y_train_smote,
             epochs=50,
             validation_data=(X_val_enhanced, y_val_enhanced),
             callbacks=[EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)])
# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. The optimal number of units in the first dense layer is {best_hps.get('units')},
the optimal dropout rate is {best_hps.get('dropout')}, and the optimal learning rate for the optimizer is {best_hps.get('learning_rate')}.
""")




Reloading Tuner from keras_tuner_dir/titanic_model_optimization/tuner0.json

The hyperparameter search is complete. The optimal number of units in the first dense layer is 96,
the optimal dropout rate is 0.30000000000000004, and the optimal learning rate for the optimizer is 0.0010848539808730223.

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 496us/step - accuracy: 0.7091 - loss: 0.5501 - val_accuracy: 0.7815 - val_loss: 0.4558
Epoch 2/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 363us/step - accuracy: 0.7963 - loss: 0.4391 - val_accuracy: 0.7884 - val_loss: 0.4446
Epoch 3/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 356us/step - accuracy: 0.8044 - loss: 0.4244 - val_accuracy: 0.7855 - val_loss: 0.4430
Epoch 4/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 357us/step - accuracy: 0.8051 - loss: 0.4113 - val_accuracy: 0.7798 - val_loss: 0.4424
Epoch 5/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 359us/step - accuracy: 0.8092 - loss: 0.4142 - val_accuracy: 0.7769 - val_loss: 0.4407
Epoch 6/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 358us/step - accuracy: 0.8055 - loss: 0.4173 - val_accuracy: 0.7826 - val_loss: 0.4376
Epoch 7/100
[1m438/43

In [75]:
# Load the test dataset
test_df = pd.read_csv('csv_files/test.csv')

# Convert binary categorical features to 0 and 1 for the test data
test_df[binary_features] = test_df[binary_features].astype(bool).astype(int)

# Extract components from 'Cabin' and treat them as separate features for the test data
test_df[['Cabin_Deck', 'Cabin_Number', 'Cabin_Side']] = test_df['Cabin'].str.split('/', expand=True)
test_df['Cabin_Side'] = test_df['Cabin_Side'].map({'P': 1, 'S': 0})
test_df['Cabin_Number'] = pd.to_numeric(test_df['Cabin_Number'], errors='coerce')

# Drop the original 'Cabin' column as it's no longer needed for the test data
test_df.drop('Cabin', axis=1, inplace=True)

# One-hot encode multi-category features for the test data using the same encoder
encoded_features_test = one_hot_encoder.transform(test_df[multi_cat_features])
encoded_features_test_df = pd.DataFrame(encoded_features_test.toarray(), columns=encoded_feature_names)

# Drop the original multi-category columns and concatenate the new one-hot encoded features for the test data
test_df.drop(multi_cat_features, axis=1, inplace=True)
test_df = pd.concat([test_df.reset_index(drop=True), encoded_features_test_df.reset_index(drop=True)], axis=1)

# Impute missing values for numeric features only in the test data
test_df[numeric_features] = imputer.transform(test_df[numeric_features])

# Normalize the numeric features in the test data
test_df[numeric_features] = scaler.transform(test_df[numeric_features])

# Prepare the features for prediction
X_test = test_df[features]

# Make predictions with the TensorFlow model
y_pred_test_proba = model.predict(X_test)
y_pred_test = (y_pred_test_proba > 0.5).astype(int).flatten()  # Convert probabilities to binary predictions

# Prepare the submission dataframe
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': y_pred_test
})

# Convert predictions back to boolean (True/False) if necessary
submission_df['Transported'] = submission_df['Transported'].astype(bool)

# Save the submission file
submission_df.to_csv('tensorflow_result.csv', index=False)


ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense_14" is incompatible with the layer: expected axis -1 of input shape to have value 19, but received input with shape (32, 16)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(32, 16), dtype=float32)
  • training=False
  • mask=None

In [77]:
# Load the test dataset
test_df = pd.read_csv('csv_files/test.csv')

# Convert binary categorical features to 0 and 1
test_df[binary_features] = test_df[binary_features].astype(bool).astype(int)

# Extract components from 'Cabin' and treat them as separate features if 'Cabin' exists in the test data
if 'Cabin' in test_df.columns:
    test_df[['Cabin_Deck', 'Cabin_Number', 'Cabin_Side']] = test_df['Cabin'].str.split('/', expand=True)
    test_df['Cabin_Side'] = test_df['Cabin_Side'].map({'P': 1, 'S': 0})
    test_df['Cabin_Number'] = pd.to_numeric(test_df['Cabin_Number'], errors='coerce')
    test_df.drop('Cabin', axis=1, inplace=True)

# One-hot encode multi-category features similar to the training data
if 'HomePlanet' in test_df.columns and 'Destination' in test_df.columns:
    encoded_features = one_hot_encoder.transform(test_df[multi_cat_features])
    encoded_feature_names = one_hot_encoder.get_feature_names_out(multi_cat_features)
    encoded_features_df = pd.DataFrame(encoded_features.toarray(), columns=encoded_feature_names)
    test_df.drop(multi_cat_features, axis=1, inplace=True)
    test_df = pd.concat([test_df.reset_index(drop=True), encoded_features_df], axis=1)

# Impute missing values for numeric features
test_df[numeric_features] = imputer.transform(test_df[numeric_features])

# Normalize the numeric features
test_df[numeric_features] = scaler.transform(test_df[numeric_features])

# Generate Interaction Features based on the same pairs identified during training
test_df['RoomService_FoodCourt_Interaction'] = test_df['RoomService'] * test_df['FoodCourt']
test_df['FoodCourt_ShoppingMall_Interaction'] = test_df['FoodCourt'] * test_df['ShoppingMall']
test_df['RoomService_Spa_Interaction'] = test_df['RoomService'] * test_df['Spa']

# Prepare features for prediction
X_test = test_df[features + ['RoomService_FoodCourt_Interaction', 'FoodCourt_ShoppingMall_Interaction', 'RoomService_Spa_Interaction']]

# Make predictions with the TensorFlow model
y_pred_test_proba = model_enhanced.predict(X_test)
y_pred_test = (y_pred_test_proba > 0.5).astype(int).flatten()

# Prepare the submission dataframe
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': y_pred_test
})

# Convert predictions back to boolean (True/False) if necessary
submission_df['Transported'] = submission_df['Transported'].astype(bool)

# Save the submission file
submission_df.to_csv('enhanced_tensorflow_result.csv', index=False)


[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 368us/step
