In [56]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from scipy.stats import expon, reciprocal
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow import keras
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from tensorflow.keras.optimizers import Adam
import keras_tuner as kt
from tensorflow.keras.models import Sequential

In [76]:
# Load the dataset
train_df = pd.read_csv('csv_files/train.csv')

# Convert binary categorical features to 0 and 1
binary_features = ['CryoSleep', 'VIP']
train_df[binary_features] = train_df[binary_features].astype(bool).astype(int)

# Extract components from 'Cabin' and treat them as separate features
train_df[['Cabin_Deck', 'Cabin_Number', 'Cabin_Side']] = train_df['Cabin'].str.split('/', expand=True)
train_df['Cabin_Side'] = train_df['Cabin_Side'].map({'P': 1, 'S': 0})

# Explicitly convert 'Cabin_Number' to numeric, handling errors
train_df['Cabin_Number'] = pd.to_numeric(train_df['Cabin_Number'], errors='coerce')

# Drop the original 'Cabin' column as it's no longer needed
train_df.drop('Cabin', axis=1, inplace=True)

# One-hot encode multi-category features
multi_cat_features = ['HomePlanet', 'Destination']
one_hot_encoder = OneHotEncoder(drop='first')  # Adjusted here
encoded_features = one_hot_encoder.fit_transform(train_df[multi_cat_features])
encoded_feature_names = one_hot_encoder.get_feature_names_out(multi_cat_features)

# Convert to DataFrame and concatenate
encoded_features_df = pd.DataFrame(encoded_features.toarray(), columns=encoded_feature_names)

# Drop the original columns and concatenate the new one-hot encoded features
train_df.drop(multi_cat_features, axis=1, inplace=True)
train_df = pd.concat([train_df.reset_index(drop=True), encoded_features_df], axis=1)

# Convert 'Transported' to integer (True=1, False=0) for modeling
train_df['Transported'] = train_df['Transported'].astype(int)

# Define numeric features - ensure this list only contains numeric features that were not one-hot encoded
numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_Number', 'Cabin_Side']
# Imputer for numeric features
imputer = SimpleImputer(strategy='median')
train_df[numeric_features] = imputer.fit_transform(train_df[numeric_features])

# Normalize the numeric features
scaler = StandardScaler()
train_df[numeric_features] = scaler.fit_transform(train_df[numeric_features])

# Prepare features and target for the model
features = [col for col in train_df.columns if col not in ['PassengerId', 'Name', 'Transported', 'Cabin_Deck' ]]  # Exclude non-features
X = train_df[features]
y = train_df['Transported']

# Splitting the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Building the model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val), verbose=1)

# Evaluate on the validation set
val_loss, val_acc = model.evaluate(X_val, y_val, verbose=1)
print(f'Validation Accuracy: {val_acc}')


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 700us/step - accuracy: 0.7248 - loss: 0.5518 - val_accuracy: 0.7746 - val_loss: 0.4542
Epoch 2/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 425us/step - accuracy: 0.7919 - loss: 0.4378 - val_accuracy: 0.7780 - val_loss: 0.4557
Epoch 3/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 461us/step - accuracy: 0.8094 - loss: 0.4169 - val_accuracy: 0.7867 - val_loss: 0.4395
Epoch 4/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 530us/step - accuracy: 0.7960 - loss: 0.4174 - val_accuracy: 0.7798 - val_loss: 0.4455
Epoch 5/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 427us/step - accuracy: 0.8034 - loss: 0.4147 - val_accuracy: 0.7878 - val_loss: 0.4375
Epoch 6/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 416us/step - accuracy: 0.8049 - loss: 0.4071 - val_accuracy: 0.7878 - val_loss: 0.4326
Epoch 7/50
[1m218/218[0m 

#### Enhanced Feature Engineering and Model Enhancement Strategies

In [63]:
# Create a copy of the original DataFrame to maintain separation
enhanced_train_df = train_df.copy()

# Generate Interaction Features based on SVM analysis
# Focus on the best interaction pairs identified
# Apply square root transformation to FoodCourt and ShoppingMall
enhanced_train_df['sqrt_FoodCourt'] = np.sqrt(enhanced_train_df['FoodCourt'].clip(0))  # clip(0) to handle negative values
enhanced_train_df['sqrt_ShoppingMall'] = np.sqrt(enhanced_train_df['ShoppingMall'].clip(0))
enhanced_train_df['RoomService_FoodCourt_Interaction'] = enhanced_train_df['RoomService'] * enhanced_train_df['FoodCourt']
enhanced_train_df['FoodCourt_ShoppingMall_Interaction'] = enhanced_train_df['FoodCourt'] * enhanced_train_df['ShoppingMall']
enhanced_train_df['RoomService_Spa_Interaction'] = enhanced_train_df['RoomService'] * enhanced_train_df['Spa']

# Update the list of numeric features to include these new interaction features
enhanced_numeric_features = numeric_features.copy() + ['RoomService_FoodCourt_Interaction', 'FoodCourt_ShoppingMall_Interaction', 'RoomService_Spa_Interaction',  'sqrt_FoodCourt', 'sqrt_ShoppingMall']

# Create new instances for imputation and scaling specifically for the enhanced dataset
enhanced_imputer = SimpleImputer(strategy='median')
enhanced_scaler = StandardScaler()

# Apply imputation and scaling to the numeric features of the enhanced dataset
enhanced_train_df[enhanced_numeric_features] = enhanced_imputer.fit_transform(enhanced_train_df[enhanced_numeric_features])
enhanced_train_df[enhanced_numeric_features] = enhanced_scaler.fit_transform(enhanced_train_df[enhanced_numeric_features])

# Continue with model preparation steps for the enhanced dataset
# Make sure to include the new interaction features in your feature list
X_enhanced = enhanced_train_df[features + ['RoomService_FoodCourt_Interaction', 'FoodCourt_ShoppingMall_Interaction', 'RoomService_Spa_Interaction']]
y_enhanced = enhanced_train_df['Transported']


# Split the dataset for the enhanced model
X_train_enhanced, X_val_enhanced, y_train_enhanced, y_val_enhanced = train_test_split(
    X_enhanced, y_enhanced, test_size=0.2, random_state=42
)
#SMOTE 
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_enhanced, y_train_enhanced)
# Adjusted model architecture based on random search results
model_enhanced = Sequential(name='EnhancedTitanicModel')
model_enhanced.add(Dense(96, activation='tanh', input_shape=(X_train_smote.shape[1],)))  # Updated based on optimal units
model_enhanced.add(Dropout(0.3))  # Updated based on optimal dropout rate
model_enhanced.add(Dense(128, activation='tanh'))
model_enhanced.add(Dropout(0.3))  # Keeping consistent dropout rate for simplicity
model_enhanced.add(Dense(64, activation='tanh'))
model_enhanced.add(Dense(1, activation='sigmoid'))

# Compile the model with the optimal learning rate
optimal_lr = 0.0010848539808730223  # From random search results
model_enhanced.compile(optimizer=Adam(learning_rate=optimal_lr), loss='binary_crossentropy', metrics=['accuracy'])
# Compute class weights for handling imbalanced data
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_smote),
    y=y_train_smote)

class_weight_dict = dict(enumerate(class_weights))

# Train the model with the same settings, now using optimized architecture and hyperparameters
history_enhanced = model_enhanced.fit(
    X_train_smote, 
    y_train_smote, 
    epochs=100,  # Adjust epochs as needed
    batch_size=16, 
    validation_data=(X_val_enhanced, y_val_enhanced),
    class_weight=class_weight_dict,
    callbacks=[EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)],
    verbose=1
)

# Evaluate the model on the validation set to check for improvement
val_loss_enhanced, val_acc_enhanced = model_enhanced.evaluate(X_val_enhanced, y_val_enhanced, verbose=1)
print(f'Enhanced Model Validation Accuracy: {val_acc_enhanced}')

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 587us/step - accuracy: 0.7509 - loss: 0.5122 - val_accuracy: 0.7757 - val_loss: 0.4862
Epoch 2/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 457us/step - accuracy: 0.7853 - loss: 0.4696 - val_accuracy: 0.7723 - val_loss: 0.4846
Epoch 3/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 477us/step - accuracy: 0.7877 - loss: 0.4474 - val_accuracy: 0.7786 - val_loss: 0.4670
Epoch 4/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 474us/step - accuracy: 0.8004 - loss: 0.4346 - val_accuracy: 0.7838 - val_loss: 0.4539
Epoch 5/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 469us/step - accuracy: 0.7918 - loss: 0.4377 - val_accuracy: 0.7740 - val_loss: 0.4715
Epoch 6/100
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 452us/step - accuracy: 0.8100 - loss: 0.4201 - val_accuracy: 0.7642 - val_loss: 0.4490
Epoch 7/100
[1m438/43

Notes: 
- Class Weight Optimisation decreased accuracy from 0.7958 to 0.7866
- SMOTE decreased it to 0.791 , next try both
- Using both Increased accuracy to 0.795859


Best performing configuration: {'units': 64, 'activation': 'tanh', 'validation_accuracy': 0.7987349033355713}
Best performing configuration: {'units': 128, 'activation': 'tanh', 'batch_size': 32}, Validation Accuracy: 0.795859694480896
Best performing configuration: {'num_layers': 2, 'additional_units': 32, 'batch_size': 32}, Validation Accuracy: 0.7970097661018372

In [59]:
additional_units_options = [32, 64, 128]  # Options for units in additional layers
num_layers_options = [2, 3, 4]  # Including the first layer which is already defined
batch_size_options = [16, 32, 64]  # Explore a range of batch sizes
best_val_accuracy = 0
best_config = None

for num_layers in num_layers_options:
    for additional_units in additional_units_options:
        for batch_size in batch_size_options:
            print(f"Testing configuration: Layers={num_layers}, Additional Units={additional_units}, Batch Size={batch_size}")
            
            # Model building
            model = Sequential(name=f"Model_Layers_{num_layers}_Units_{additional_units}")
            model.add(Dense(64, activation='tanh', input_shape=(X_train_smote.shape[1],)))  # First layer as per best config
            model.add(Dropout(0.3))
            
            for _ in range(1, num_layers):  # Add additional layers
                model.add(Dense(additional_units, activation='tanh'))
                model.add(Dropout(0.3))
            
            model.add(Dense(1, activation='sigmoid'))  # Output layer
            
            # Model compilation
            model.compile(optimizer=Adam(learning_rate=0.0010848539808730223),
                          loss='binary_crossentropy',
                          metrics=['accuracy'])
            
            # Model training
            history = model.fit(X_train_smote, y_train_smote, 
                                validation_data=(X_val_enhanced, y_val_enhanced),
                                epochs=100,  # Adjust based on early stopping
                                batch_size=batch_size,
                                class_weight=class_weight_dict,
                                callbacks=[EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)],
                                verbose=0)  # Set to 1 for progress
            
            # Model evaluation
            val_loss, val_acc = model.evaluate(X_val_enhanced, y_val_enhanced, verbose=0)
            
            # Update best config if improvement
            if val_acc > best_val_accuracy:
                best_val_accuracy = val_acc
                best_config = {
                    'num_layers': num_layers,
                    'additional_units': additional_units,
                    'batch_size': batch_size
                }
                print(f"New best configuration found: {best_config}, Validation Accuracy: {best_val_accuracy}")

print(f"Best performing configuration: {best_config}, Validation Accuracy: {best_val_accuracy}")


Testing configuration: Layers=2, Additional Units=32, Batch Size=16


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


New best configuration found: {'num_layers': 2, 'additional_units': 32, 'batch_size': 16}, Validation Accuracy: 0.7901092767715454
Testing configuration: Layers=2, Additional Units=32, Batch Size=32
New best configuration found: {'num_layers': 2, 'additional_units': 32, 'batch_size': 32}, Validation Accuracy: 0.7906842827796936
Testing configuration: Layers=2, Additional Units=32, Batch Size=64
New best configuration found: {'num_layers': 2, 'additional_units': 32, 'batch_size': 64}, Validation Accuracy: 0.7912593483924866
Testing configuration: Layers=2, Additional Units=64, Batch Size=16
Testing configuration: Layers=2, Additional Units=64, Batch Size=32
New best configuration found: {'num_layers': 2, 'additional_units': 64, 'batch_size': 32}, Validation Accuracy: 0.7918344140052795
Testing configuration: Layers=2, Additional Units=64, Batch Size=64
New best configuration found: {'num_layers': 2, 'additional_units': 64, 'batch_size': 64}, Validation Accuracy: 0.7941345572471619
Testi

KeyboardInterrupt: 

In [81]:
# Reload the test dataset to ensure it's fresh
test_df = pd.read_csv('csv_files/test.csv')

# Process the test data similarly to how the training data was processed for the enhanced model

# Convert binary categorical features to 0 and 1
binary_features = ['CryoSleep', 'VIP']
test_df[binary_features] = test_df[binary_features].astype(bool).astype(int)

# Extract components from 'Cabin' and treat them as separate features
test_df[['Cabin_Deck', 'Cabin_Number', 'Cabin_Side']] = test_df['Cabin'].str.split('/', expand=True)
test_df['Cabin_Side'] = test_df['Cabin_Side'].map({'P': 1, 'S': 0}).fillna(-1).astype(int)
test_df['Cabin_Number'] = pd.to_numeric(test_df['Cabin_Number'], errors='coerce')
test_df.drop('Cabin', axis=1, inplace=True)

# Assuming one_hot_encoder is already fitted with the training data
multi_cat_features = ['HomePlanet', 'Destination']
encoded_features_test = one_hot_encoder.transform(test_df[multi_cat_features])
encoded_feature_names = one_hot_encoder.get_feature_names_out(multi_cat_features)
encoded_features_test_df = pd.DataFrame(encoded_features_test.toarray(), columns=encoded_feature_names)
test_df = pd.concat([test_df, encoded_features_test_df], axis=1)
test_df.drop(multi_cat_features, axis=1, inplace=True)

# Impute and scale numeric features using the enhanced model's imputer and scaler
numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_Number', 'Cabin_Side']
test_df[numeric_features] = enhanced_imputer.transform(test_df[numeric_features])
test_df[numeric_features] = enhanced_scaler.transform(test_df[numeric_features])

# Add interaction features and any other transformations specific to the enhanced model
test_df['sqrt_FoodCourt'] = np.sqrt(test_df['FoodCourt'].clip(0))
test_df['sqrt_ShoppingMall'] = np.sqrt(test_df['ShoppingMall'].clip(0))
test_df['RoomService_FoodCourt_Interaction'] = test_df['RoomService'] * test_df['FoodCourt']
test_df['FoodCourt_ShoppingMall_Interaction'] = test_df['FoodCourt'] * test_df['ShoppingMall']
test_df['RoomService_Spa_Interaction'] = test_df['RoomService'] * test_df['Spa']

# Prepare the features for prediction
X_test = test_df[features + ['sqrt_FoodCourt', 'sqrt_ShoppingMall', 'RoomService_FoodCourt_Interaction', 'FoodCourt_ShoppingMall_Interaction', 'RoomService_Spa_Interaction']]

# Use the enhanced model for prediction
y_pred_test_proba = model_enhanced.predict(X_test)
y_pred_test = (y_pred_test_proba > 0.5).astype(int).flatten()

# Prepare the submission dataframe
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': y_pred_test
})

# Convert predictions back to boolean (True/False) if necessary
submission_df['Transported'] = submission_df['Transported'].astype(bool)

# Save the submission file
submission_df.to_csv('enhanced_tensorflow_submission.csv', index=False)


ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense_340" is incompatible with the layer: expected axis -1 of input shape to have value 19, but received input with shape (32, 26)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(32, 26), dtype=float32)
  • training=False
  • mask=None

In [80]:
# Load the test dataset
test_df = pd.read_csv('csv_files/test.csv')

# Convert binary categorical features to 0 and 1
binary_features = ['CryoSleep', 'VIP']
test_df[binary_features] = test_df[binary_features].astype(bool).astype(int)

# Extract components from 'Cabin' and treat them as separate features
test_df[['Cabin_Deck', 'Cabin_Number', 'Cabin_Side']] = test_df['Cabin'].str.split('/', expand=True)
test_df['Cabin_Side'] = test_df['Cabin_Side'].map({'P': 1, 'S': 0}).fillna(-1).astype(int)
test_df['Cabin_Number'] = pd.to_numeric(test_df['Cabin_Number'], errors='coerce')
test_df.drop('Cabin', axis=1, inplace=True)

# One-hot encode multi-category features
multi_cat_features = ['HomePlanet', 'Destination']
if set(multi_cat_features).issubset(test_df.columns):
    encoded_features_test = one_hot_encoder.transform(test_df[multi_cat_features])
    encoded_feature_names = one_hot_encoder.get_feature_names_out(multi_cat_features)
    encoded_features_test_df = pd.DataFrame(encoded_features_test.toarray(), columns=encoded_feature_names)
    test_df = pd.concat([test_df, encoded_features_test_df], axis=1)
    test_df.drop(multi_cat_features, axis=1, inplace=True)
numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_Number', 'Cabin_Side']

# Impute and scale numeric features
enhanced_imputer = SimpleImputer(strategy='median')
enhanced_scaler = StandardScaler()
enhanced_imputer.fit(train_df[numeric_features])  # Fit imputer to the training data
enhanced_scaler.fit(train_df[numeric_features])  # Fit scaler to the training data

test_df[numeric_features] = enhanced_imputer.transform(test_df[numeric_features])
test_df[numeric_features] = enhanced_scaler.transform(test_df[numeric_features])

# Generate interaction features and square root transformations
test_df['sqrt_FoodCourt'] = np.sqrt(test_df['FoodCourt'].clip(0))
test_df['sqrt_ShoppingMall'] = np.sqrt(test_df['ShoppingMall'].clip(0))
test_df['RoomService_FoodCourt_Interaction'] = test_df['RoomService'] * test_df['FoodCourt']
test_df['FoodCourt_ShoppingMall_Interaction'] = test_df['FoodCourt'] * test_df['ShoppingMall']
test_df['RoomService_Spa_Interaction'] = test_df['RoomService'] * test_df['Spa']

# Prepare the feature matrix for prediction
features = [col for col in test_df.columns if col not in ['PassengerId', 'Name', 'Cabin_Deck']]
X_test = test_df[features]

# Load the trained model and make predictions
y_pred_test_proba = model_enhanced.predict(X_test)
y_pred_test = (y_pred_test_proba > 0.5).astype(int)

# Prepare the submission dataframe
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': y_pred_test.flatten().astype(bool)
})

# Save the submission file
submission_df.to_csv('enhanced_tensorflow_submission.csv', index=False)


ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense_340" is incompatible with the layer: expected axis -1 of input shape to have value 19, but received input with shape (32, 21)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(32, 21), dtype=float32)
  • training=False
  • mask=None