In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from scipy.stats import expon, reciprocal
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow import keras
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from tensorflow.keras.optimizers import Adam
import keras_tuner as kt

In [2]:
# Load the dataset
train_df = pd.read_csv('csv_files/train.csv')

# Convert binary categorical features to 0 and 1
binary_features = ['CryoSleep', 'VIP']
train_df[binary_features] = train_df[binary_features].astype(bool).astype(int)

# Extract components from 'Cabin' and treat them as separate features
train_df[['Cabin_Deck', 'Cabin_Number', 'Cabin_Side']] = train_df['Cabin'].str.split('/', expand=True)
train_df['Cabin_Side'] = train_df['Cabin_Side'].map({'P': 1, 'S': 0})

# Explicitly convert 'Cabin_Number' to numeric, handling errors
train_df['Cabin_Number'] = pd.to_numeric(train_df['Cabin_Number'], errors='coerce')

# Drop the original 'Cabin' column as it's no longer needed
train_df.drop('Cabin', axis=1, inplace=True)

# One-hot encode multi-category features
multi_cat_features = ['HomePlanet', 'Destination']
one_hot_encoder = OneHotEncoder(drop='first')  # Adjusted here
encoded_features = one_hot_encoder.fit_transform(train_df[multi_cat_features])
encoded_feature_names = one_hot_encoder.get_feature_names_out(multi_cat_features)

# Convert to DataFrame and concatenate
encoded_features_df = pd.DataFrame(encoded_features.toarray(), columns=encoded_feature_names)

# Drop the original columns and concatenate the new one-hot encoded features
train_df.drop(multi_cat_features, axis=1, inplace=True)
train_df = pd.concat([train_df.reset_index(drop=True), encoded_features_df], axis=1)

# Convert 'Transported' to integer (True=1, False=0) for modeling
train_df['Transported'] = train_df['Transported'].astype(int)

# Define numeric features - ensure this list only contains numeric features that were not one-hot encoded
numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_Number', 'Cabin_Side']
# Imputer for numeric features
imputer = SimpleImputer(strategy='median')
train_df[numeric_features] = imputer.fit_transform(train_df[numeric_features])

# Normalize the numeric features
scaler = StandardScaler()
train_df[numeric_features] = scaler.fit_transform(train_df[numeric_features])

# Prepare features and target for the model
features = [col for col in train_df.columns if col not in ['PassengerId', 'Name', 'Transported', 'Cabin_Deck' ]]  # Exclude non-features
X = train_df[features]
y = train_df['Transported']

# Splitting the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Building the model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val), verbose=1)

# Evaluate on the validation set
val_loss, val_acc = model.evaluate(X_val, y_val, verbose=1)
print(f'Validation Accuracy: {val_acc}')


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 651us/step - accuracy: 0.7078 - loss: 0.5604 - val_accuracy: 0.7763 - val_loss: 0.4604
Epoch 2/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 406us/step - accuracy: 0.7995 - loss: 0.4430 - val_accuracy: 0.7855 - val_loss: 0.4482
Epoch 3/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 399us/step - accuracy: 0.8070 - loss: 0.4277 - val_accuracy: 0.7786 - val_loss: 0.4455
Epoch 4/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 400us/step - accuracy: 0.7929 - loss: 0.4340 - val_accuracy: 0.7832 - val_loss: 0.4445
Epoch 5/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 400us/step - accuracy: 0.8045 - loss: 0.4256 - val_accuracy: 0.7849 - val_loss: 0.4380
Epoch 6/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 408us/step - accuracy: 0.8137 - loss: 0.4103 - val_accuracy: 0.7752 - val_loss: 0.4353
Epoch 7/50
[1m218/218[0m 

In [5]:
import keras_tuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

def build_model(hp):
    model = Sequential([
        Dense(
            units=hp.Int('units_1', min_value=32, max_value=256, step=32),
            activation='tanh',
            input_shape=(X_train.shape[1],)
        ),
        Dropout(hp.Float('dropout_1', min_value=0.0, max_value=0.3, step=0.1)),
        Dense(
            units=hp.Int('units_2', min_value=16, max_value=128, step=16),
            activation='relu'
        ),
        Dense(1, activation='relu')
    ])
    
    model.compile(
        optimizer=Adam(hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=200,  # Number of random configurations to try
    executions_per_trial=2,  # Number of times to train the model per trial
    directory='random_search',
    project_name='titanic_optimization'
)

tuner.search(
    X_train, y_train,
    epochs=30,
    validation_data=(X_val, y_val),
    callbacks=[EarlyStopping(monitor='val_loss', patience=5)]
)

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
Best number of units in the first dense layer: {best_hps.get('units_1')}
Best dropout rate in the first dropout layer: {best_hps.get('dropout_1')}
Best number of units in the second dense layer: {best_hps.get('units_2')}
Best learning rate for the optimizer: {best_hps.get('learning_rate')}
""")

# Build the model with the best hyperparameters
model = tuner.hypermodel.build(best_hps)
history = model.fit(
    X_train, y_train,
    epochs=50,
    validation_data=(X_val, y_val),
    callbacks=[EarlyStopping(monitor='val_loss', patience=10)]
)

# Evaluate on the validation set
val_loss, val_acc = model.evaluate(X_val, y_val, verbose=1)
print(f'Validation Accuracy with Random Search Tuning: {val_acc}')


Trial 200 Complete [00h 00m 06s]
val_accuracy: 0.7889591753482819

Best val_accuracy So Far: 0.8010351061820984
Total elapsed time: 03h 55m 44s

Best number of units in the first dense layer: 224
Best dropout rate in the first dropout layer: 0.2
Best number of units in the second dense layer: 80
Best learning rate for the optimizer: 0.0005668473081877774

Epoch 1/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 693us/step - accuracy: 0.6593 - loss: 1.0566 - val_accuracy: 0.7671 - val_loss: 0.5038
Epoch 2/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 473us/step - accuracy: 0.7545 - loss: 0.6247 - val_accuracy: 0.7378 - val_loss: 0.5999
Epoch 3/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 464us/step - accuracy: 0.7344 - loss: 0.5877 - val_accuracy: 0.7729 - val_loss: 0.5218
Epoch 4/50
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 469us/step - accuracy: 0.7789 - loss: 0.5492 - val_accuracy: 0.7821 - val

In [4]:
# Load the test dataset
test_df = pd.read_csv('csv_files/test.csv')

# Convert binary categorical features to 0 and 1 for the test data
test_df[binary_features] = test_df[binary_features].astype(bool).astype(int)

# Extract components from 'Cabin' and treat them as separate features for the test data
test_df[['Cabin_Deck', 'Cabin_Number', 'Cabin_Side']] = test_df['Cabin'].str.split('/', expand=True)
test_df['Cabin_Side'] = test_df['Cabin_Side'].map({'P': 1, 'S': 0})
test_df['Cabin_Number'] = pd.to_numeric(test_df['Cabin_Number'], errors='coerce')

# Drop the original 'Cabin' column as it's no longer needed for the test data
test_df.drop('Cabin', axis=1, inplace=True)

# One-hot encode multi-category features for the test data using the same encoder
encoded_features_test = one_hot_encoder.transform(test_df[multi_cat_features])
encoded_features_test_df = pd.DataFrame(encoded_features_test.toarray(), columns=encoded_feature_names)

# Drop the original multi-category columns and concatenate the new one-hot encoded features for the test data
test_df.drop(multi_cat_features, axis=1, inplace=True)
test_df = pd.concat([test_df.reset_index(drop=True), encoded_features_test_df.reset_index(drop=True)], axis=1)

# Impute missing values for numeric features only in the test data
test_df[numeric_features] = imputer.transform(test_df[numeric_features])

# Normalize the numeric features in the test data
test_df[numeric_features] = scaler.transform(test_df[numeric_features])

# Prepare the features for prediction
X_test = test_df[features]

# Make predictions with the TensorFlow model
y_pred_test_proba = model.predict(X_test)
y_pred_test = (y_pred_test_proba > 0.5).astype(int).flatten()  # Convert probabilities to binary predictions

# Prepare the submission dataframe
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': y_pred_test
})

# Convert predictions back to boolean (True/False) if necessary
submission_df['Transported'] = submission_df['Transported'].astype(bool)

# Save the submission file
submission_df.to_csv('tensorflow_resultsimple.csv', index=False)


[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 303us/step
