In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

# Required libraries for linear algebra and data processing
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# For displaying outputs in IPython environments
from IPython.display import display, HTML

# This block is used to list all data files in the "../input/" directory in Kaggle environments
import os
# Lazily build every full path under /kaggle/input, then print them one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

def create_scrollable_table(df, table_id, title, height=200):
    """
    Create a scrollable HTML table from a pandas DataFrame.

    Parameters:
    - df (pandas.DataFrame): The DataFrame to be converted to an HTML table.
    - table_id (str): HTML ID attribute for the div containing the table. This is used for potential CSS or JS targeting.
    - title (str): Title to be displayed above the table.
    - height (int, optional): Fixed height of the scrollable area in pixels. Defaults to 200.

    Returns:
    - str: An HTML string consisting of an <h3> title followed by a fixed-height,
      scrollable <div> that wraps the output of df.to_html().
    """
    # Local import: keeps this helper self-contained without touching the notebook's import cell.
    import html as _html

    # Escape caller-supplied text so characters such as '<', '&' or '"' cannot break the markup.
    safe_title = _html.escape(str(title))
    safe_id = _html.escape(str(table_id), quote=True)

    return (
        f'<h3>{safe_title}</h3>'
        f'<div id="{safe_id}" style="height:{int(height)}px; overflow:auto;">'
        f'{df.to_html()}'
        '</div>'
    )


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Reading the "train.csv" file from the "spaceship-titanic" directory and storing it in a pandas DataFrame named 'df'
df = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")

In [None]:
# Getting the dimensions (number of rows and columns) of the 'df' DataFrame
df.shape

In [None]:
# Displaying the first 5 rows of the 'df' DataFrame for a quick overview
df.head(5)

In [None]:
def preprocessing(df):
    """
    Build the engineered feature frame from a raw passenger DataFrame.

    The input is expected to contain the columns 'PassengerId', 'Name', 'HomePlanet',
    'CryoSleep', 'Cabin', 'Destination', 'RoomService', 'FoodCourt', 'ShoppingMall',
    'Spa' and 'VRDeck'. Any other columns are carried through unchanged.

    Parameters:
    - df (pandas.DataFrame): Raw passenger data. It is NOT modified; a copy is used.

    Returns:
    - pandas.DataFrame: A new frame without 'Name', 'PassengerId' and 'Cabin', and with
      the added columns 'Group_no', 'Spending', 'Percentage<amenity>', the planet/cryo
      flags, 'Pair1'..'Pair9', 'side', 'deck' and the 'sq_*' square-root columns.
    """
    # Work on a copy so the caller's frame never gains a stray 'Group_no' column.
    df = df.copy()

    # Group number is the part of 'PassengerId' before the first '_'.
    df["Group_no"] = df["PassengerId"].str.split("_").str[0].astype(int)

    # Drop the identifier columns that are not used as features.
    df = df.drop(columns=["Name", "PassengerId"])

    # Total spending and each amenity's share of it (in percent).
    # skipna=False keeps the result missing whenever any amenity is missing.
    # A total of 0 gives 0/0 = NaN for every share.
    amenities = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    df["Spending"] = df[amenities].sum(axis=1, skipna=False)
    for amenity in amenities:
        df[f"Percentage{amenity}"] = (df[amenity] / df["Spending"]) * 100

    # Home planet x cryo-sleep indicator columns.
    # 'CryoSleep' may hold real booleans (as read from CSV) or their string form, so it is
    # normalised to text before comparing; comparing booleans with "True" never matches.
    home = df["HomePlanet"]
    cryo = df["CryoSleep"].astype(str)
    # NOTE: 'Europa_Cryro' is misspelled but kept, since other cells select columns by this name.
    cryo_flags = [
        ("Earth_Cryo", "Earth", "True"),
        ("Earth_Cryo2", "Earth", "False"),
        ("Europa_Cryro", "Europa", "True"),
        ("Europa_Cryo2", "Europa", "False"),
        ("Mars_Cryo", "Mars", "True"),
        ("Mars_Cryo2", "Mars", "False"),
    ]
    for column, planet, state in cryo_flags:
        df[column] = ((home == planet) & (cryo == state)).astype(int)

    # Home planet x destination indicator columns, numbered Pair1..Pair9 planet by planet.
    planets = ["Earth", "Europa", "Mars"]
    destinations = ["TRAPPIST-1e", "55 Cancri e", "PSO J318.5-22"]
    routes = [(planet, dest) for planet in planets for dest in destinations]
    for pair_no, (planet, dest) in enumerate(routes, start=1):
        df[f"Pair{pair_no}"] = ((home == planet) & (df["Destination"] == dest)).astype(int)

    # 'side' is the last character of the cabin text and 'deck' the first.
    # The cabin is stringified first, so a missing cabin is handled as text too.
    cabin = df["Cabin"].astype(str)
    df["side"] = np.array([text[-1] for text in cabin], dtype=object)
    df["deck"] = np.array([text[0] for text in cabin], dtype=object)
    df = df.drop(columns=["Cabin"])

    # Square-root transforms of selected spending columns.
    for column in ["RoomService", "Spa", "VRDeck", "Spending"]:
        df[f"sq_{column}"] = np.sqrt(df[column])
    return df

In [None]:
# Displaying the first 15 rows of the 'df' DataFrame for a detailed overview
df.head(15)

In [None]:
# Keep only the numeric columns of 'df' and summarise them, one row per feature
numerical_features = df.select_dtypes(include=[np.number])
summary_stats = numerical_features.describe().transpose()

# Render the summary as a titled, scrollable HTML table and show it in the notebook
html_numerical = create_scrollable_table(
    summary_stats,
    "numerical_features",
    "Summary statistics for numerical features",
)
display(HTML(html_numerical))

In [None]:
# Keep only the object- and boolean-typed columns of 'df' and summarise them, one row per feature
categorical_features = df.select_dtypes(include=[object, np.bool_])
summary_stats = categorical_features.describe().transpose()

# Render the summary as a titled, scrollable HTML table and show it in the notebook
html_categorical = create_scrollable_table(
    summary_stats,
    "categorical_features",
    "Summary statistics for categorical features",
)
display(HTML(html_categorical))


In [None]:
# Calculating the number of missing values for each column in the 'df' DataFrame
null_values = df.isnull().sum()

# Using the create_scrollable_table function to generate a scrollable HTML table for the count of missing values
html_null_values = create_scrollable_table(null_values.to_frame(), 'null_values', 'Null values in the dataset')

# Calculating the percentage of missing values for each column in relation to the total number of rows
# (fraction of missing rows multiplied by 100; the counts computed above are reused)
missing_percentage = (null_values / len(df)) * 100

# Using the create_scrollable_table function to generate a scrollable HTML table for the percentage of missing values
html_missing_percentage = create_scrollable_table(missing_percentage.to_frame(), 'missing_percentage', 'Percentage of missing values in the dataset')

# Combining and displaying both HTML tables in the notebook
display(HTML(html_null_values + html_missing_percentage))

In [None]:
# Applying the preprocessing function to the 'df' DataFrame and storing the result in 'X'
X = preprocessing(df)


In [None]:
# Importing necessary modules and functions for data preprocessing and transformation

# FunctionTransformer allows you to create a transformer from any callable Python object
from sklearn.preprocessing import FunctionTransformer

# Pipeline and make_pipeline help in sequentially applying a list of transforms and a final estimator
from sklearn.pipeline import Pipeline, make_pipeline

# SimpleImputer and KNNImputer are used for filling missing values using various strategies
from sklearn.impute import SimpleImputer, KNNImputer

# StandardScaler standardizes features by removing the mean and scaling to unit variance
# OneHotEncoder is used for encoding categorical variables as a one-hot numeric array
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# ColumnTransformer is for applying transformers to columns of an array or pandas DataFrame
from sklearn.compose import ColumnTransformer


In [None]:
# Apply the same feature engineering to 'df' itself by calling preprocessing() (defined in an
# earlier cell) instead of repeating its body line by line, so the two cannot drift apart.
# The guard makes the cell safe to re-run: once 'PassengerId' has been dropped, 'df' is
# already in its engineered form and is left untouched.
if "PassengerId" in df.columns:
    df = preprocessing(df)

In [None]:
# Numerical branch, applied in order:
#   "imputer" - replace missing values with the column median
#   "scaler"  - standardise with StandardScaler
numerical_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

# Categorical branch, applied in order:
#   "imputer" - replace missing values with the most frequent value of the column
#   "onehot"  - one-hot encode into a dense array, ignoring categories unseen during fit
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)


In [None]:
# Identifying categorical columns in the 'df' DataFrame.
# We select columns that have data type 'object' or 'category'.
categorical_columns = df.select_dtypes(include=["object", "category"]).columns

# Identifying numerical columns in the 'df' DataFrame.
# We select columns that have data type 'int64' or 'float64'.
numerical_columns = df.select_dtypes(include=["int64", "float64"]).columns


In [None]:
categorical_columns

In [None]:
numerical_columns

In [None]:
# Converting all values in the categorical columns of 'df' DataFrame to string type
# NOTE(review): only 'df' is converted here. The feature frame 'X' that is later passed to
# pipeline.fit_transform was built earlier by preprocessing(df) and is not affected by this
# line — confirm whether 'X' was meant to receive the same conversion.
df[categorical_columns] = df[categorical_columns].astype(str)

In [None]:
# Setting up the ColumnTransformer that will apply the specified preprocessing steps to the specified columns.
# The 'num' transformer applies the 'numerical_transformer' to the 'numerical_columns'.
# The 'cat' transformer applies the 'categorical_transformer' to the 'categorical_columns'.
# Any other columns not specified will be 'passed through' without any changes (due to 'remainder="passthrough"').
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, numerical_columns),
    ("cat", categorical_transformer, categorical_columns)
], remainder="passthrough")


In [None]:
# Creating a pipeline that applies the 'preprocessor' (ColumnTransformer) to the data.
# This pipeline can be extended with other steps such as a machine learning model in the future if needed.
pipeline = Pipeline(steps=[("preprocessor", preprocessor)])


In [None]:
pipeline

In [None]:
print(X)

In [None]:
X.shape

In [None]:
X.columns

In [None]:
# Assigning the 'Transported' column as the target variable 'y' and converting it to integer type (0/1)
y = df["Transported"].astype(int)

# Removing the 'Transported' column from the feature matrix 'X' as it's the target variable.
# errors="ignore" keeps this cell re-runnable: on a second run the column is already gone.
X = X.drop(columns=["Transported"], errors="ignore")

# Applying the previously defined preprocessing pipeline to the feature matrix 'X'
# This will impute missing values, scale numerical features, and one-hot encode categorical features.
# NOTE(review): the pipeline is fitted on every row before the train/test split further down,
# so the imputation and scaling statistics also see the rows that end up in the hold-out set.
X_preprocessed = pipeline.fit_transform(X)


In [None]:
print(X_preprocessed.shape)

In [None]:
X_preprocessed

In [None]:
# Importing machine learning classifiers/algorithms
from sklearn.linear_model import LogisticRegression  # Logistic Regression model
from sklearn.neighbors import KNeighborsClassifier  # K-Nearest Neighbors classifier
from sklearn.svm import SVC  # Support Vector Machine classifier
from sklearn.ensemble import RandomForestClassifier  # Random Forest classifier
from sklearn.ensemble import GradientBoostingClassifier  # Gradient Boosting classifier

# Importing tools for model selection and evaluation
from sklearn.model_selection import GridSearchCV  # Tool for hyperparameter tuning using cross-validation
from sklearn.model_selection import KFold  # Provides train/test indices to split data into train/test sets
from sklearn.model_selection import cross_val_score  # Evaluate a score by cross-validation


In [None]:
# Importing the function to split datasets
from sklearn.model_selection import train_test_split

# Splitting the preprocessed data into training and testing sets
# 80% of the data will be used for training and 20% for testing.
# A fixed random_state makes the split (and every score derived from it) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)


In [None]:
# Defining a dictionary of classifiers/algorithms.
# This dictionary can be easily extended to add more models or be iterated over for model evaluation.
# The randomised estimators get a fixed random_state so repeated runs give the same results.
models = {
    "RandomForest": RandomForestClassifier(random_state=42),  # Random Forest classifier
    "SupportVectorMachine": SVC(),  # Support Vector Machine classifier
    # scikit-learn's GradientBoostingClassifier. The "XGBoost" key is only a label that other
    # cells look up; the separate xgboost library is not used here.
    "XGBoost": GradientBoostingClassifier(random_state=42)
}


In [None]:
# Defining a dictionary of hyperparameter grids for each classifier
param_grids = {
    "RandomForest": {
        "n_estimators": [100, 150, 200],
        # "logloss" is not an accepted criterion; the valid spelling is "log_loss".
        "criterion": ["gini", "entropy", "log_loss"]
    },
    "SupportVectorMachine": {
        "kernel": ["poly", "rbf", "sigmoid"],
        "gamma": ["scale", "auto"],
        "C": [0.5, 1.0, 5.0, 10.0]
    },
    "XGBoost": {
        "loss": ["log_loss", "exponential"],
        "learning_rate": [0.1, 0.01, 0.05],
        "n_estimators": [200, 250, 300]
    }
}

# Initializing a KFold cross-validator with 3 shuffled splits.
# The fixed random_state gives every model the same folds and makes the search repeatable.
cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Dictionary to store the GridSearchCV objects for each model
grids = {}

# Looping over the defined classifiers to perform hyperparameter tuning
for model_name, model in models.items():
    # Applying GridSearchCV for hyperparameter tuning.
    # The target is 0/1, so each squared error is 0 or 1 and the mean squared error equals
    # the misclassification rate; ranking by it is the same as ranking by accuracy.
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        cv=cv,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=2,
    )
    search.fit(X_train, y_train)
    grids[model_name] = search

    # Extracting the best parameters and best RMSE for each model
    # (best_score_ is the negated MSE, hence the sign flip before the square root)
    best_params = search.best_params_
    best_score = np.sqrt(-1 * search.best_score_)

    # Printing the results
    print(f'Best parameters for {model_name}: {best_params}')
    print(f'Best RMSE for {model_name}: {best_score}\n')


In [None]:
# Importing the mean_squared_error metric to evaluate the model's performance
from sklearn.metrics import mean_squared_error

# Score every tuned search object on the held-out split and report its RMSE
for model_name, fitted_search in grids.items():
    # Predict with the search object's refitted best estimator
    predictions = fitted_search.predict(X_test)

    # Root of the mean squared difference between predictions and true labels
    rmse = np.sqrt(mean_squared_error(predictions, y_test))

    print(f'{model_name}: {rmse}')


In [None]:
# Loading the test dataset from the specified path into a pandas DataFrame 'df_test'
df_test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")


In [None]:
df_test

In [None]:
# One submission frame per model, each starting as an independent copy of the
# test set's "PassengerId" column.
passenger_ids = df_test[["PassengerId"]]

Logistic_df = passenger_ids.copy()               # Logistic Regression predictions
SupportVectorMachine_df = passenger_ids.copy()   # Support Vector Machine predictions
XGBoost_df = passenger_ids.copy()                # predictions of the model stored under the "XGBoost" key
RandomForest_df = passenger_ids.copy()           # Random Forest predictions


In [None]:
# Applying the preprocessing function to the 'df_test' DataFrame to prepare it for predictions
df_test = preprocessing(df_test)


In [None]:
df_test.head()

In [None]:
# Applying the previously defined preprocessing pipeline to the 'df_test' DataFrame 
# This will impute missing values, scale numerical features, and one-hot encode categorical features.
df_test_preprocessed = pipeline.transform(df_test)


In [None]:
# Predict on the preprocessed test set with the tuned RandomForest search object and
# store the labels as booleans (True/False)
y_RandomForest = grids['RandomForest'].predict(df_test_preprocessed).astype(bool)


In [None]:
# Predict on the preprocessed test set with the tuned search object stored under the
# 'XGBoost' key and store the labels as booleans (True/False)
y_XGBoostRegression = grids['XGBoost'].predict(df_test_preprocessed).astype(bool)


In [None]:
# Predict on the preprocessed test set with the tuned Support Vector Machine search object
# and store the labels as booleans (True/False)
y_SupportVectorMachine = grids["SupportVectorMachine"].predict(df_test_preprocessed).astype(bool)


In [None]:
# Assigning the predictions from the RandomForest model to the 'Transported' column of the 'RandomForest_df' DataFrame
RandomForest_df["Transported"] = y_RandomForest


In [None]:
# Assigning the predictions from the Support Vector Machine model to the 'Transported' column of the 'SupportVectorMachine_df' DataFrame
SupportVectorMachine_df["Transported"] = y_SupportVectorMachine


In [None]:
# Assigning the predictions from the XGBoost model to the 'Transported' column of the 'XGBoost_df' DataFrame
XGBoost_df["Transported"] = y_XGBoostRegression


In [None]:
SupportVectorMachine_df

In [None]:
XGBoost_df

In [None]:
SupportVectorMachine_df.to_csv("submission_support_vector_machine.csv",index=False)

In [None]:
XGBoost_df.to_csv("submission_XGBoost.csv",index=False)

In [None]:
RandomForest_df.to_csv("submission_RandomForest.csv",index=False)