In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [11]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [12]:
# Load the training split of the Spaceship Titanic data into a DataFrame
TRAIN_CSV_PATH = "/kaggle/input/spaceship-titanic/train.csv"
data_original = pd.read_csv(TRAIN_CSV_PATH)

In [13]:
# Split the 'Cabin' column on '/' into three parts and replace the original
# column with them. The guard keeps this cell re-runnable: once 'Cabin' has
# been replaced, running the cell again skips the split instead of raising a
# KeyError on the missing column.
if 'Cabin' in data_original.columns:
    cabin_parts = data_original['Cabin'].str.split('/', expand=True)
    cabin_parts.columns = ['Cabin Section', 'Cabin Number', 'Cabin Type']
    # drop + join builds a new frame rather than mutating in place; the three
    # new columns end up at the right-hand side of the frame.
    data_original = data_original.drop(columns='Cabin').join(cabin_parts)

# Map 'Cabin Number' to integer codes numbered in order of first appearance.
# Re-applying the mapping to already-coded values yields the same codes.
# NOTE(review): rows without a cabin are NaN here and NaN is one of the values
# returned by unique(), so it is a key of the mapping too -- confirm that your
# pandas version resolves NaN keys in Series.map the way you expect.
cabin_number_mapping = {
    number: code
    for code, number in enumerate(data_original['Cabin Number'].unique())
}
data_original['Cabin Number'] = data_original['Cabin Number'].map(cabin_number_mapping)

In [14]:
# One-hot encode all categorical columns in a single pass. get_dummies appends
# the indicator columns after the untouched ones, grouped in the order the
# source columns are listed here.
categorical_cols = ['Cabin Section', 'Cabin Type', 'HomePlanet']
data_with_dummies = pd.get_dummies(data_original, columns=categorical_cols)

# Disabled idea: a single combined 'Spending' total over the five spend columns
#data_with_dummies['Spending'] = data_with_dummies[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)

In [15]:
# Amount-spent columns, reused later when imputing numerical values
spending_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Indicator columns created by the one-hot encoding, matched by substring
dummy_markers = ('Cabin Section_', 'Cabin Type_', 'HomePlanet_')
dummy_features = [
    name for name in data_with_dummies.columns
    if any(marker in name for marker in dummy_markers)
]

features = spending_features + ['Cabin Number', 'Age', 'CryoSleep', 'VIP'] + dummy_features

# Feature matrix (an independent copy) and the target column
X = data_with_dummies.loc[:, features].copy()
y = data_original['Transported']

In [16]:
# Fill missing values for numerical columns with the column median.
# The filled Series is assigned back to X: calling
# `X[col].fillna(..., inplace=True)` operates on a temporary selection, which
# pandas warns about and which leaves X unchanged under Copy-on-Write.
numerical_cols = ['Age', 'Cabin Number'] + spending_features
for col in numerical_cols:
    median_val = X[col].median()
    X[col] = X[col].fillna(median_val)

# Impute CryoSleep and VIP with their most frequent value.
# infer_objects() turns an all-boolean object column into a real bool column
# explicitly instead of relying on fillna's deprecated silent downcast.
boolean_cols = ['CryoSleep', 'VIP']
for col in boolean_cols:
    mode_val = X[col].mode()[0]
    X[col] = X[col].fillna(mode_val).infer_objects()

# After filling, verify if there are any more missing values
nan_in_X = X.isna().sum().sum()

nan_in_X


0

In [17]:
# Total number of missing entries in the feature matrix
nan_in_X = X.isna().sum().sum()

# Total number of +inf / -inf entries; an entry can match at most one of the
# two comparisons, so counting the combined mask equals adding both counts.
is_infinite = (X == np.inf) | (X == -np.inf)
inf_in_X = is_infinite.sum().sum()

nan_in_X, inf_in_X

(0, 0)

In [19]:
# Names of the columns that still contain at least one missing value
nan_columns_X = [name for name, has_nan in X.isna().any().items() if has_nan]

# Missing-value count for each of those columns (empty when nothing is missing)
nan_counts_X = X[nan_columns_X].isna().sum()

nan_counts_X

Series([], dtype: float64)

In [20]:
# Convert True to 1 and False to 0 and give every column one numeric dtype.
# An explicit cast is used instead of `X * 1`: multiplying leaves object-dtype
# columns as object, and an object column cannot be converted to a tensor when
# the frame is later passed to Keras. The cast raises if a column holds
# anything that is not numeric or boolean.
X = X.astype('float32')

# Verify the conversion by checking unique values in the dataset
unique_values = X.nunique()

unique_values

RoomService          1273
FoodCourt            1507
ShoppingMall         1115
Spa                  1327
VRDeck               1306
Cabin Number         1818
Age                    80
CryoSleep               2
VIP                     2
Cabin Section_A         2
Cabin Section_B         2
Cabin Section_C         2
Cabin Section_D         2
Cabin Section_E         2
Cabin Section_F         2
Cabin Section_G         2
Cabin Section_T         2
Cabin Type_P            2
Cabin Type_S            2
HomePlanet_Earth        2
HomePlanet_Europa       2
HomePlanet_Mars         2
dtype: int64

In [28]:
# Look at the features: print the column index of X to confirm which columns
# make up the feature matrix and in what order.
print(X.columns)

Index(['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Cabin Number', 'Age', 'CryoSleep', 'VIP', 'Cabin Section_A',
       'Cabin Section_B', 'Cabin Section_C', 'Cabin Section_D',
       'Cabin Section_E', 'Cabin Section_F', 'Cabin Section_G',
       'Cabin Section_T', 'Cabin Type_P', 'Cabin Type_S', 'HomePlanet_Earth',
       'HomePlanet_Europa', 'HomePlanet_Mars'],
      dtype='object')


In [21]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Embedding, Flatten, concatenate
from keras.optimizers import Adam

def build_model(input_shape_other_features, num_unique_cabin_numbers, embedding_dim=50):
    """
    Build and compile a two-input binary classifier.

    One input carries the integer cabin-number code, which is passed through
    an embedding layer; the other carries all remaining features. Both are
    concatenated and fed through two Dense+Dropout stages into a single
    sigmoid output.

    Parameters:
    - input_shape_other_features: Shape of the input excluding the
      'Cabin Number' feature. Either a shape tuple such as (21,) or a bare
      int feature count, which is treated as (count,).
    - num_unique_cabin_numbers: Size of the embedding table (`input_dim`).
      Every cabin-number code fed to the model must be smaller than this.
    - embedding_dim: Dimension of the embedding layer (default is 50).

    Returns:
    - A compiled Keras model (Adam, binary cross-entropy, accuracy metric)
      that expects its inputs as [cabin_number, other_features].
    """
    # `Input(shape=...)` is documented to take a shape tuple and not every
    # Keras release accepts a bare int, so normalise an int feature count.
    if isinstance(input_shape_other_features, (int, np.integer)):
        other_features_shape = (int(input_shape_other_features),)
    else:
        other_features_shape = input_shape_other_features

    # Define input layers
    input_cabin_number = Input(shape=(1,))
    input_other_features = Input(shape=other_features_shape)

    # Embedding layer for cabin number; Flatten removes the length-1 sequence
    # axis so the vector can be concatenated with the other features.
    embedding = Embedding(input_dim=num_unique_cabin_numbers, output_dim=embedding_dim)(input_cabin_number)
    embedding = Flatten()(embedding)

    # Dense layers with dropout after each hidden layer
    x = concatenate([embedding, input_other_features])
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(1, activation='sigmoid')(x)

    # Compile the model
    model = Model(inputs=[input_cabin_number, input_other_features], outputs=output)
    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Stratified K Fold and Dropout

In [27]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Seed the fold assignment and the numpy / TensorFlow random generators so
# repeated runs are comparable (best effort: some backend ops may still be
# non-deterministic).
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Define number of splits
n_splits = 5

# Define KFold object
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
cvscores = []

# Size of the embedding table. It has to be larger than the biggest code that
# is looked up, so it is derived from the maximum code rather than from the
# number of distinct codes (the two only agree when codes are dense 0..n-1).
num_unique_cabin_numbers = int(X['Cabin Number'].max()) + 1


def _split_inputs(frame):
    """Return the model inputs [cabin number column, all remaining columns]."""
    return [frame['Cabin Number'], frame.drop(columns='Cabin Number')]


# NOTE(review): the features are fed to the network unscaled; consider
# standardising the numerical columns inside each fold.
for fold_number, (train_index, val_index) in enumerate(kfold.split(X, y), start=1):
    X_train_fold = X.iloc[train_index]
    y_train_fold = y.iloc[train_index]
    X_val_fold = X.iloc[val_index]
    y_val_fold = y.iloc[val_index]

    train_inputs = _split_inputs(X_train_fold)
    val_inputs = _split_inputs(X_val_fold)

    # Drop the state left behind by the previous fold's model before building
    # a fresh one, so models do not pile up in memory across folds.
    tf.keras.backend.clear_session()

    # Build a new, untrained model for this fold; the second input's shape is
    # passed as a tuple (number of non-cabin feature columns,).
    model = build_model((train_inputs[1].shape[1],), num_unique_cabin_numbers)

    # Train the model
    model.fit(train_inputs, y_train_fold, epochs=30, batch_size=32, verbose=0)

    # Evaluate the model on the validation data; scores is [loss, accuracy]
    scores = model.evaluate(val_inputs, y_val_fold, verbose=0)
    print(f"Fold {fold_number}: {model.metrics_names[1]}: {scores[1]*100:.2f}%")
    cvscores.append(scores[1] * 100)

print(f"{n_splits}-fold cross-validation accuracy: {np.mean(cvscores):.2f}% (+/- {np.std(cvscores):.2f}%)")


Fold 1: accuracy: 75.91%
Fold 2: accuracy: 73.55%
Fold 3: accuracy: 75.39%
Fold 4: accuracy: 76.70%
Fold 5: accuracy: 74.97%
5-fold cross-validation accuracy: 75.30% (+/- 1.05%)
