In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from scipy.stats import expon, reciprocal
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow import keras
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from tensorflow.keras.optimizers import Adam
import keras_tuner as kt
import deap
import skopt
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.models import save_model
from tensorflow.keras.optimizers import SGD
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE

In [20]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
import joblib
import optuna 

def preprocess_data(file_path, is_train=True):
    df = pd.read_csv(file_path)
    
    # Convert binary categorical features to 0 and 1
    binary_features = ['CryoSleep', 'VIP']
    df[binary_features] = df[binary_features].astype(bool).astype(int)
    
    # Feature Engineering
    df['TotalSpending'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
    
    # Conditionally set spending-related features to 0 for passengers in cryosleep
    spending_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    df.loc[df['CryoSleep'] == 1, spending_features] = 0
    
    # Create interaction features
    df['HomePlanet_TotalSpending'] = df['HomePlanet'].astype(str) + '_' + df['TotalSpending'].astype(str)
    df['Destination_TotalSpending'] = df['Destination'].astype(str) + '_' + df['TotalSpending'].astype(str)
    
    # Extract components from 'Cabin'
    if 'Cabin' in df.columns:
        df[['Cabin_Deck', 'Cabin_Number', 'Cabin_Side']] = df['Cabin'].str.split('/', expand=True)
        df['Cabin_Side'] = df['Cabin_Side'].map({'P': 1, 'S': 0})
        df['Cabin_Number'] = pd.to_numeric(df['Cabin_Number'], errors='coerce')
        df.drop('Cabin', axis=1, inplace=True)
    
    # One-hot encode multi-category features
    multi_cat_features = ['HomePlanet', 'Destination']
    if is_train:
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
        encoded_features = one_hot_encoder.fit_transform(df[multi_cat_features])
        encoded_feature_names = one_hot_encoder.get_feature_names_out(multi_cat_features)
        joblib.dump(one_hot_encoder, 'one_hot_encoder.pkl')
    else:
        one_hot_encoder = joblib.load('one_hot_encoder.pkl')
        encoded_features = one_hot_encoder.transform(df[multi_cat_features])
        encoded_feature_names = one_hot_encoder.get_feature_names_out(multi_cat_features)
    
    encoded_features_df = pd.DataFrame(encoded_features.toarray(), columns=encoded_feature_names)
    df = pd.concat([df, encoded_features_df], axis=1)
    df.drop(multi_cat_features, axis=1, inplace=True)
    
    # Imputation and Scaling
    numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_Number', 'Cabin_Side', 'TotalSpending']
    if is_train:
        imputer = KNNImputer(n_neighbors=5)
        scaler = StandardScaler()
        df[numeric_features] = imputer.fit_transform(df[numeric_features])
        df[numeric_features] = scaler.fit_transform(df[numeric_features])
        joblib.dump(imputer, 'imputer.pkl')
        joblib.dump(scaler, 'scaler.pkl')
    else:
        imputer = joblib.load('imputer.pkl')
        scaler = joblib.load('scaler.pkl')
        df[numeric_features] = imputer.transform(df[numeric_features])
        df[numeric_features] = scaler.transform(df[numeric_features])
    
    if is_train:
        # Convert 'Transported' to integer (True=1, False=0) for modeling
        df['Transported'] = df['Transported'].astype(int)
        
        # Save the list of features used for training
        train_features = [col for col in df.columns if col not in ['PassengerId', 'Name', 'Transported', 'Cabin_Deck', 'HomePlanet_TotalSpending', 'Destination_TotalSpending']]
        joblib.dump(train_features, 'train_features.pkl')
    
    return df


#  2. PyTorch Model Def

In [67]:
import optuna
from optuna.trial import TrialState
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.callbacks import LearningRateScheduler
import numpy as np

train_df = preprocess_data('csv_files/train.csv', is_train=True)
test_df = preprocess_data('csv_files/test.csv', is_train=False)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class Net(nn.Module):
    def __init__(self, input_shape, layers, units, activation, dropout_rate):
        super(Net, self).__init__()
        self.layers = nn.ModuleList()

        # Reduce the number of layers and units per layer
        layers = max(1, layers // 2)  # Ensure at least one layer
        units = max(32, units // 2)   # Ensure a minimum number of units

        for i in range(layers):
            if i == 0:
                self.layers.append(nn.Linear(input_shape, units))
            else:
                self.layers.append(nn.Linear(units, units))
            if activation == 'relu':
                self.layers.append(nn.ReLU())
            elif activation == 'tanh':
                self.layers.append(nn.Tanh())
            elif activation == 'elu':
                self.layers.append(nn.ELU())
            # Increase dropout rate to prevent overfitting
            self.layers.append(nn.Dropout(min(0.5, dropout_rate + 0.1)))

        self.out = nn.Linear(units, 1)

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return torch.sigmoid(self.out(x))

# Objective Function with Optuna and PyTorch

In [81]:
from sklearn.model_selection import train_test_split

# Define a global variable to keep track of the best validation accuracy
best_validation_accuracy = 0
best_model_params = None
def objective(trial):
    global best_validation_accuracy, best_model_params
    # Load and preprocess data
    features = [col for col in train_df.columns if col not in ['PassengerId', 'Name', 'Transported', 'Cabin_Deck', 'HomePlanet_TotalSpending', 'Destination_TotalSpending']]
    X = train_df[features].values
    y = train_df['Transported'].values

    # Split the data into training and validation sets (80/20 split)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Handling class imbalance
    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

    # Convert to tensors
    X_train_tensor = torch.tensor(X_train_res, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_res, dtype=torch.float32).unsqueeze(1)
    X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
    y_val_tensor = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)

    # Model Configuration
    input_shape = X_train_tensor.shape[1]
    layers = trial.suggest_int('layers', 1, 2)
    units = trial.suggest_int('units', 32, 256)
    dropout_rate = trial.suggest_float('dropout_rate', 0.2, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    # Instantiate the model with 'tanh' activation
    model = Net(input_shape, layers, units, 'tanh', dropout_rate)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # DataLoader for PyTorch
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    # Training loop
    model.train()
    for epoch in range(100):
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            output = model(batch_x)
            loss = F.binary_cross_entropy(output, batch_y)
            loss.backward()
            optimizer.step()

    # Validation
    model.eval()
    with torch.no_grad():
        output = model(X_val_tensor)
        prediction = output.round()
        accuracy = (prediction.eq(y_val_tensor).sum() / float(y_val_tensor.size(0))).item()

    if accuracy > best_validation_accuracy:
        best_validation_accuracy = accuracy
        best_model_params = {
            'input_shape': input_shape,
            'layers': layers,
            'units': units,
            'activation': 'tanh',
            'dropout_rate': dropout_rate,
            'learning_rate': learning_rate
        }
        torch.save(model.state_dict(), 'model_enhanced.pth')
        
    return accuracy


# Running the Optuna Study

In [88]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

print(f"Best Validation Accuracy: {study.best_value}")
print(f"Best Hyperparameters: {study.best_params}")


[I 2024-03-21 20:25:27,564] A new study created in memory with name: no-name-906132a7-03be-490b-a483-4667ca60e38e
[I 2024-03-21 20:25:33,881] Trial 0 finished with value: 0.7734330296516418 and parameters: {'layers': 2, 'units': 105, 'dropout_rate': 0.44916995743269655, 'learning_rate': 0.0002674333559558307}. Best is trial 0 with value: 0.7734330296516418.
[I 2024-03-21 20:25:41,688] Trial 1 finished with value: 0.7837837934494019 and parameters: {'layers': 2, 'units': 176, 'dropout_rate': 0.3857429461137464, 'learning_rate': 0.0006224958347891364}. Best is trial 1 with value: 0.7837837934494019.
[I 2024-03-21 20:25:49,210] Trial 2 finished with value: 0.7941345572471619 and parameters: {'layers': 1, 'units': 175, 'dropout_rate': 0.288263676896526, 'learning_rate': 0.004949347635465978}. Best is trial 2 with value: 0.7941345572471619.
[I 2024-03-21 20:25:55,380] Trial 3 finished with value: 0.7780333757400513 and parameters: {'layers': 2, 'units': 116, 'dropout_rate': 0.42080071817997

Best Validation Accuracy: 0.7941345572471619
Best Hyperparameters: {'layers': 1, 'units': 175, 'dropout_rate': 0.288263676896526, 'learning_rate': 0.004949347635465978}


In [86]:
# After Optuna study
def retrain_best_model_on_full_data(best_model_params):
    # Combine your training and validation data
    features = [col for col in train_df.columns if col not in ['PassengerId', 'Name', 'Transported', 'Cabin_Deck', 'HomePlanet_TotalSpending', 'Destination_TotalSpending']]
    X = train_df[features].values
    y = train_df['Transported'].values
    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

    # Handling class imbalance on the full dataset
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X, y)
    X_resampled = torch.tensor(X_res, dtype=torch.float32)
    y_resampled = torch.tensor(y_res, dtype=torch.float32).unsqueeze(1)

    # DataLoader for PyTorch
    full_dataset = TensorDataset(X_resampled, y_resampled)
    full_loader = DataLoader(full_dataset, batch_size=32, shuffle=True)

    # Model instantiation using best model parameters
    model = Net(
        input_shape=best_model_params['input_shape'],
        layers=best_model_params['layers'],
        units=best_model_params['units'],
        activation='tanh',
        dropout_rate=best_model_params['dropout_rate']
    )

    optimizer = optim.Adam(model.parameters(), lr=best_model_params['learning_rate'])

    # Retrain loop on the full dataset
    model.train()
    for epoch in range(100):
        for batch_x, batch_y in full_loader:
            optimizer.zero_grad()
            output = model(batch_x)
            loss = F.binary_cross_entropy(output, batch_y)
            loss.backward()
            optimizer.step()

    # Save the retrained model
    torch.save(model.state_dict(), 'model_enhanced_full.pth')
if best_model_params:
    retrain_best_model_on_full_data(best_model_params)

# Exoprt

In [87]:
import torch
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the best hyperparameters and model architecture from the Optuna study
best_params = study.best_params
# Assuming 'train_df' is your DataFrame and you want to exclude certain columns to get the input shape
excluded_columns = ['PassengerId', 'Name', 'Transported', 'Cabin_Deck', 'HomePlanet_TotalSpending', 'Destination_TotalSpending']
input_shape = len([col for col in train_df.columns if col not in excluded_columns])


# Define the model with the best hyperparameters
model_enhanced = Net(input_shape=input_shape, layers=best_params['layers'], units=best_params['units'],
                     activation='tanh', dropout_rate=best_params['dropout_rate'])

# Load the trained model weights
model_enhanced.load_state_dict(torch.load('model_enhanced_full.pth'))
model_enhanced.eval()  # Set the model to evaluation mode


# Preprocess the test data
test_df = preprocess_data('csv_files/test.csv', is_train=False)

# Select the features used by the model, ensuring they are the same as those used in training
features = [col for col in train_df.columns if col not in ['PassengerId', 'Name', 'Transported', 'Cabin_Deck', 'HomePlanet_TotalSpending', 'Destination_TotalSpending']]
numeric_features = [col for col in features if test_df[col].dtype != 'object']

# Prepare the test data for the PyTorch model
X_test = test_df[features].copy()

# Scale the numeric features using the StandardScaler
scaler = StandardScaler()
X_test[numeric_features] = scaler.fit_transform(X_test[numeric_features])

X_test_np = X_test.values
X_test_tensor = torch.tensor(X_test_np, dtype=torch.float32)

# Disable gradient computation for inference
with torch.no_grad():
    y_pred_test_tensor = model_enhanced(X_test_tensor)
    y_pred_test_proba = torch.sigmoid(y_pred_test_tensor).numpy()  # sigmoid applied if not in the model's forward method
    y_pred_test = (y_pred_test_proba > 0.5).astype(int).flatten()

# Prepare the submission dataframe
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': y_pred_test
})

# Convert predictions back to boolean (True/False) for final submission
submission_df['Transported'] = submission_df['Transported'].astype(bool)

# Save the submission file
submission_df.to_csv('pytorchandoptunavalac.csv', index=False)