In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from scipy.stats import expon, reciprocal
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow import keras
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from tensorflow.keras.optimizers import Adam
import keras_tuner as kt
import deap
import skopt
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.models import save_model
from tensorflow.keras.optimizers import SGD
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
import joblib

def preprocess_data(file_path, is_train=True):
    df = pd.read_csv(file_path)
    
    # Convert binary categorical features to 0 and 1
    binary_features = ['CryoSleep', 'VIP']
    df[binary_features] = df[binary_features].astype(bool).astype(int)
    
    # Feature Engineering
    df['TotalSpending'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
    
    # Conditionally set spending-related features to 0 for passengers in cryosleep
    spending_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    df.loc[df['CryoSleep'] == 1, spending_features] = 0
    
    # Create interaction features
    df['HomePlanet_TotalSpending'] = df['HomePlanet'].astype(str) + '_' + df['TotalSpending'].astype(str)
    df['Destination_TotalSpending'] = df['Destination'].astype(str) + '_' + df['TotalSpending'].astype(str)
    
    # Extract components from 'Cabin'
    if 'Cabin' in df.columns:
        df[['Cabin_Deck', 'Cabin_Number', 'Cabin_Side']] = df['Cabin'].str.split('/', expand=True)
        df['Cabin_Side'] = df['Cabin_Side'].map({'P': 1, 'S': 0})
        df['Cabin_Number'] = pd.to_numeric(df['Cabin_Number'], errors='coerce')
        df.drop('Cabin', axis=1, inplace=True)
    
    # One-hot encode multi-category features
    multi_cat_features = ['HomePlanet', 'Destination']
    if is_train:
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
        encoded_features = one_hot_encoder.fit_transform(df[multi_cat_features])
        encoded_feature_names = one_hot_encoder.get_feature_names_out(multi_cat_features)
        joblib.dump(one_hot_encoder, 'one_hot_encoder.pkl')
    else:
        one_hot_encoder = joblib.load('one_hot_encoder.pkl')
        encoded_features = one_hot_encoder.transform(df[multi_cat_features])
        encoded_feature_names = one_hot_encoder.get_feature_names_out(multi_cat_features)
    
    encoded_features_df = pd.DataFrame(encoded_features.toarray(), columns=encoded_feature_names)
    df = pd.concat([df, encoded_features_df], axis=1)
    df.drop(multi_cat_features, axis=1, inplace=True)
    
    # Imputation and Scaling
    numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_Number', 'Cabin_Side', 'TotalSpending']
    if is_train:
        imputer = KNNImputer(n_neighbors=5)
        scaler = StandardScaler()
        df[numeric_features] = imputer.fit_transform(df[numeric_features])
        df[numeric_features] = scaler.fit_transform(df[numeric_features])
        joblib.dump(imputer, 'imputer.pkl')
        joblib.dump(scaler, 'scaler.pkl')
    else:
        imputer = joblib.load('imputer.pkl')
        scaler = joblib.load('scaler.pkl')
        df[numeric_features] = imputer.transform(df[numeric_features])
        df[numeric_features] = scaler.transform(df[numeric_features])
    
    if is_train:
        # Convert 'Transported' to integer (True=1, False=0) for modeling
        df['Transported'] = df['Transported'].astype(int)
        
        # Save the list of features used for training
        train_features = [col for col in df.columns if col not in ['PassengerId', 'Name', 'Transported', 'Cabin_Deck', 'HomePlanet_TotalSpending', 'Destination_TotalSpending']]
        joblib.dump(train_features, 'train_features.pkl')
    
    return df


# Start The TORCH

In [5]:
class Net(nn.Module):
    def __init__(self, input_shape, layers, activation, dropout_rate):
        super(Net, self).__init__()
        self.layers = nn.ModuleList()
        for i, layer_size in enumerate(layers):
            if i == 0:
                self.layers.append(nn.Linear(input_shape, layer_size))
            else:
                self.layers.append(nn.Linear(layers[i-1], layer_size))
            self.layers.append(nn.Dropout(dropout_rate))
            if activation == 'relu':
                self.layers.append(nn.ReLU())
            elif activation == 'tanh':
                self.layers.append(nn.Tanh())
            elif activation == 'elu':
                self.layers.append(nn.ELU())
        self.layers.append(nn.Linear(layers[-1], 1))
        self.layers.append(nn.Sigmoid())

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

def train_and_evaluate(model, X_train, y_train, X_val, y_val, batch_size, epochs, class_weight, learning_rate):
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.BCELoss(weight=class_weight)

    X_train_tensor = torch.from_numpy(X_train.values).float()
    y_train_tensor = torch.from_numpy(y_train.values).float().unsqueeze(1)
    X_val_tensor = torch.from_numpy(X_val.values).float()
    y_val_tensor = torch.from_numpy(y_val.values).float().unsqueeze(1)

    train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    with torch.no_grad():
        outputs = model(X_val_tensor)
        predicted = (outputs > 0.5).float()
        val_acc = (predicted == y_val_tensor).float().mean().item()

    return -val_acc

if __name__ == "__main__":
    # Preprocess the data
    train_df = preprocess_data('csv_files/train.csv', is_train=True)
    test_df = preprocess_data('csv_files/test.csv', is_train=False)

    # Prepare features and target for the model
    features = joblib.load('train_features.pkl')
    X = train_df[features]
    y = train_df['Transported']

    # Handling class imbalance
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y.values.ravel())
    # Define the hyperparameter search space
    dim_layers = Integer(low=1, high=4, name='layers')
    dim_units = Integer(low=32, high=512, name='units')
    dim_activation = Categorical(categories=['relu', 'tanh', 'elu'], name='activation')
    dim_dropout = Real(low=0.1, high=0.5, prior='uniform', name='dropout_rate')
    dim_learning_rate = Real(low=1e-4, high=1e-2, prior='log-uniform', name='learning_rate')

    dimensions = [dim_layers, dim_units, dim_activation, dim_dropout, dim_learning_rate]

    @use_named_args(dimensions=dimensions)
    def fitness(layers, units, activation, dropout_rate, learning_rate):
        layers = [units] * layers
        
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        val_accuracies = []
        
        for train_index, val_index in kfold.split(X_resampled, y_resampled):
            X_train, X_val = X_resampled.iloc[train_index], X_resampled.iloc[val_index]
            y_train, y_val = y_resampled[train_index], y_resampled[val_index]
            
            class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
            class_weight_tensor = torch.from_numpy(class_weights).float()
            
            model = Net(input_shape=X_train.shape[1], layers=layers, activation=activation, dropout_rate=dropout_rate)
            val_acc = train_and_evaluate(model, X_train, y_train, X_val, y_val, 
                                         batch_size=32, epochs=100, class_weight=class_weight_tensor, learning_rate=learning_rate)
            val_accuracies.append(val_acc)
        
        return np.mean(val_accuracies)

    # Perform Bayesian Optimization
    n_calls = 50
    best_result = gp_minimize(func=fitness, dimensions=dimensions, n_calls=n_calls, 
                              random_state=42, verbose=True)

    # Retrieve the best hyperparameters
    best_params = {
        'layers': [best_result.x[1]] * best_result.x[0],
        'activation': best_result.x[2],
        'dropout_rate': best_result.x[3],
        'learning_rate': best_result.x[4]
    }
    best_val_acc = -best_result.fun

    # Train the final model with the best hyperparameters
    model_enhanced = Net(input_shape=X_resampled.shape[1], **best_params)
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_resampled), y=y_resampled)
    class_weight_tensor = torch.from_numpy(class_weights).float()
    train_and_evaluate(model_enhanced, X_resampled, y_resampled, X_resampled, y_resampled, 
                       batch_size=32, epochs=100, class_weight=class_weight_tensor, learning_rate=best_params['learning_rate'])

    print(f"Best Validation Accuracy: {best_val_acc}")
    print(f"Best Hyperparameters: {best_params}")

Iteration No: 1 started. Evaluating function at random point.


AttributeError: 'numpy.ndarray' object has no attribute 'values'

# Exoprt

In [None]:
from tensorflow.keras.models import load_model
import pandas as pd
import numpy as np
import joblib
print(f"Best Validation Accuracy: {best_val_acc}")
print(f"Best Hyperparameters: {best_params}")

# Preprocess the test data using the same steps as the training data
test_df = preprocess_data('csv_files/test.csv', is_train=False)

# Load the features used during training
train_features = joblib.load('train_features.pkl')

# Select the features used by the model, ensuring they are the same as those used in training
features = [col for col in train_features if col in test_df.columns]

# Ensure that X_test is a DataFrame with the correct numerical types
X_test = test_df[features].copy()

# Convert X_test to a NumPy array, which is the expected format for TensorFlow models
X_test_np = X_test.values

# Predict using the enhanced model
y_pred_test_proba = model_enhanced.predict(X_test_np)
y_pred_test = (y_pred_test_proba > 0.5).astype(int).flatten()

# Prepare the submission dataframe
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': y_pred_test
})

# Convert predictions back to boolean (True/False) if necessary
submission_df['Transported'] = submission_df['Transported'].astype(bool)

# Save the submission file
submission_df.to_csv('neuralnetbayesadvancedfocused.csv', index=False)