<a href="https://www.kaggle.com/code/mesutssmn/titanic-spaceship-ml-ann?scriptVersionId=167043514" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from tabulate import tabulate
from sklearn.preprocessing import MinMaxScaler,StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,confusion_matrix,mean_squared_error,accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings("ignore")

In [None]:
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [None]:
train = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
df = pd.concat([train, test], ignore_index=True).reset_index(drop=True)
df.head()

# *File and Data Field Descriptions*


*   **train.csv** - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.
*   **PassengerId** - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
*   **HomePlanet** - The planet the passenger departed from, typically their planet of permanent residence.
*   **CryoSleep** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
*   **Cabin** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
*   **Destination** - The planet the passenger will be debarking to.
*   **Age** - The age of the passenger.
*   **VIP** - Whether the passenger has paid for special VIP service during the voyage.
*   **RoomService, FoodCourt, ShoppingMall, Spa, VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
*   **Name** - The first and last names of the passenger.
*   **Transported** - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.





# *Data Information & Manipulation*

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
def check_df(dataframe):
    """Print a quick structural overview of ``dataframe`` to stdout.

    The report consists of six banner-separated sections, in this order:
    shape, column dtypes, first three rows, last three rows, per-column
    null counts, and selected quantiles of the numeric columns.

    Returns nothing.
    """
    def _numeric_quantiles():
        # Only numeric columns have meaningful quantiles.
        numeric_names = dataframe.select_dtypes(include=['number']).columns
        return dataframe[numeric_names].quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T

    # Each section is a (title, producer) pair; producers are evaluated lazily,
    # right before their banner's content is printed.
    sections = (
        ("Shape", lambda: dataframe.shape),
        ("Types", lambda: dataframe.dtypes),
        ("Head", lambda: dataframe.head(3)),
        ("Tail", lambda: dataframe.tail(3)),
        ("NA", lambda: dataframe.isnull().sum()),
        ("Quantiles", _numeric_quantiles),
    )
    for title, produce in sections:
        print(f"##################### {title} #####################")
        print(produce())

In [None]:
check_df(df)

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical / cardinal / numeric.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    cat_th : int
        A non-object column with fewer than ``cat_th`` distinct values is
        treated as categorical ("numeric but categorical").
    car_th : int
        An object column with more than ``car_th`` distinct values is treated
        as cardinal ("categorical but cardinal") and excluded from cat_cols.

    Returns
    -------
    (cat_cols, cat_but_car, num_cols) : tuple of lists of column names
        ``cat_cols`` lists object columns first, then numeric-but-categorical
        columns, each in original column order.

    A short summary of the counts is printed as a side effect.
    """
    object_cols = []
    other_cols = []
    for name in dataframe.columns:
        if dataframe[name].dtypes == "O":
            object_cols.append(name)
        else:
            other_cols.append(name)

    num_but_cat = [name for name in other_cols if dataframe[name].nunique() < cat_th]
    cat_but_car = [name for name in object_cols if dataframe[name].nunique() > car_th]

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [name for name in other_cols if name not in num_but_cat]

    n_rows, n_cols = dataframe.shape[0], dataframe.shape[1]
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_cols}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, cat_but_car, num_cols

In [None]:
cat_cols, cat_but_car, num_cols = grab_col_names(df)

In [None]:
def cat_summary(dataframe, col_name, plot=False):
    """Print the value counts and percentage share of a categorical column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    col_name : str
        Column to summarise.
    plot : bool
        When true, additionally draw a count plot (left) and a pie chart
        (right) of the same distribution and show the figure.
    """
    summary = pd.DataFrame({
        col_name: dataframe[col_name].value_counts(),
        "Ratio": 100 * dataframe[col_name].value_counts() / len(dataframe),
    })
    print(summary)

    if not plot:
        return

    plt.subplots(1, 2, figsize=(8, 6))
    heading = "Frequency of " + col_name

    # Left panel: bar chart of category frequencies.
    plt.subplot(1, 2, 1)
    sns.countplot(x=dataframe[col_name], data=dataframe)
    plt.title(heading)
    plt.xticks(rotation=90)

    # Right panel: pie chart annotated with percentage and absolute count.
    plt.subplot(1, 2, 2)
    values = dataframe[col_name].value_counts()
    total = sum(values)

    def _wedge_text(pct):
        return '{:.2f}% ({:.0f})'.format(pct, pct / 100 * total)

    plt.pie(x=values, labels=values.index, autopct=_wedge_text)
    plt.title(heading)
    legend_entries = [
        '{} - {:.2f}%'.format(category, count / total * 100)
        for category, count in zip(values.index, values)
    ]
    plt.legend(labels=legend_entries, loc='upper center', bbox_to_anchor=(0.5, -0.2),
               fancybox=True, shadow=True, ncol=1)
    plt.show(block=True)

In [None]:
for col in cat_cols:
    cat_summary(df, col, True)

In [None]:
def num_summary(dataframe, numerical_col, plot=False):
    """Print descriptive statistics of a numeric column at fine-grained quantiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    numerical_col : str
        Column to describe.
    plot : bool
        When true, also show a 50-bin histogram of the column.
    """
    # 5%, every decile from 10% to 90%, then the 95% / 99% tail.
    quantile_levels = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    column = dataframe[numerical_col]
    print(column.describe(quantile_levels).T)

    if plot:
        dataframe[numerical_col].hist(bins=50)
        plt.xlabel(numerical_col)
        plt.title(numerical_col)
        plt.show()

    print("#####################################")

In [None]:
for col in num_cols:
    num_summary(df, col, True)

In [None]:
def target_summary_with_cat(dataframe, target, categorical_col):
    """Print the mean of ``target`` within each level of ``categorical_col``."""
    target_mean_by_level = dataframe.groupby(categorical_col)[target].mean()
    report = pd.DataFrame({"TARGET_MEAN": target_mean_by_level})
    print(report, end="\n\n\n")

In [None]:
for col in cat_cols:
    target_summary_with_cat(df,"Transported",col)

In [None]:
corr = df[num_cols].corr()
corr
sns.set(rc={'figure.figsize': (10, 6)})
sns.heatmap(corr, cmap="RdBu", annot=True, fmt=".2f")
plt.show()

In [None]:
def outlier_thresholds(dataframe, variable, low_quantile=0.10, up_quantile=0.90):
    """Return ``(low_limit, up_limit)`` outlier fences for a column.

    The fences sit 1.5 inter-quantile ranges below the ``low_quantile`` value
    and above the ``up_quantile`` value of ``dataframe[variable]``.
    """
    lower_q, upper_q = (dataframe[variable].quantile(level)
                        for level in (low_quantile, up_quantile))
    spread = upper_q - lower_q
    margin = 1.5 * spread
    return lower_q - margin, upper_q + margin

In [None]:
def check_outlier(dataframe, col_name):
    """Return True if ``dataframe[col_name]`` has any value outside its fences.

    The fences come from ``outlier_thresholds(dataframe, col_name)``. Missing
    values compare as False on both sides and are therefore never outliers.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    column = dataframe[col_name]
    outside = (column > up_limit) | (column < low_limit)
    # Test the mask itself. Checking `.any()` on the selected rows would ask
    # whether any *cell* in those rows is truthy, which misses outlier rows
    # whose values are all 0/False.
    return bool(outside.any())

In [None]:
for col in num_cols:
    if col != "Transported":
      print(col, check_outlier(df, col))

In [None]:
def replace_with_thresholds(dataframe, variable):
    """Cap ``dataframe[variable]`` in place at its outlier fences.

    Values below the lower fence are set to the lower fence, values above the
    upper fence are set to the upper fence. The fences come from
    ``outlier_thresholds(dataframe, variable)``. Returns nothing.
    """
    lower_fence, upper_fence = outlier_thresholds(dataframe, variable)

    too_small = dataframe[variable] < lower_fence
    dataframe.loc[too_small, variable] = lower_fence

    # Evaluated after the lower cap has been written, as a separate pass.
    too_large = dataframe[variable] > upper_fence
    dataframe.loc[too_large, variable] = upper_fence

In [None]:
for col in num_cols:
    if col != "Transported":
        replace_with_thresholds(df,col)

In [None]:
def missing_values_table(dataframe, na_name=False):
    """Print a table of missing-value counts and percentages per column.

    Only columns with at least one missing value are listed, sorted from most
    to least missing.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    na_name : bool
        When true, return the list of column names that contain missing
        values (in original column order); otherwise return None.
    """
    na_columns = [name for name in dataframe.columns if dataframe[name].isnull().sum() > 0]

    missing_counts = dataframe[na_columns].isnull().sum()
    n_miss = missing_counts.sort_values(ascending=False)
    ratio = (missing_counts / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")

    return na_columns if na_name else None

In [None]:
missing_values_table(df)


In [None]:
df[(df['CryoSleep'] == True) & ((df['RoomService'] > 0) | (df['FoodCourt'] > 0) | (df['ShoppingMall'] > 0) | (df['Spa'] > 0) | (df['VRDeck'] > 0))]

In [None]:
df.loc[(df['CryoSleep'] == True) & ((df['RoomService'] > 0) | (df['FoodCourt'] > 0) | (df['ShoppingMall'] > 0) | (df['Spa'] > 0) | (df['VRDeck'] > 0)),['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']] = 0

In [None]:
missing_values_table(df)

In [None]:
def quick_missing_imp(data, num_method="median", cat_length=20, target="Transported"):
    """Impute missing values and return the imputed copy of ``data``.

    * Object-dtype columns with at most ``cat_length`` distinct values
      (missing counted as one value) are filled with their mode.
    * Non-object columns are filled with their mean or median, depending on
      ``num_method``. Any other ``num_method`` leaves them untouched.
    * The ``target`` column is restored to its original values afterwards, so
      missing targets are never imputed.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    num_method : {"median", "mean"}
    cat_length : int
    target : str
        Name of the target column; must exist in ``data``.

    Returns
    -------
    pandas.DataFrame

    Null counts before and after imputation are printed as a side effect.
    """
    variables_with_na = [col for col in data.columns if data[col].isnull().sum() > 0]
    temp_target = data[target]

    print("# BEFORE")
    print(data[variables_with_na].isnull().sum(), "\n\n")

    def _fill_categorical(column):
        if column.dtype != "O" or len(column.unique()) > cat_length:
            return column
        modes = column.mode()
        # An all-missing column has no mode; leave it alone instead of
        # failing on `modes[0]`.
        if modes.empty:
            return column
        return column.fillna(modes.iloc[0])

    data = data.apply(_fill_categorical, axis=0)

    if num_method == "mean":
        data = data.apply(lambda x: x.fillna(x.mean()) if x.dtype != "O" else x, axis=0)
    elif num_method == "median":
        data = data.apply(lambda x: x.fillna(x.median()) if x.dtype != "O" else x, axis=0)

    # Undo any imputation applied to the target column.
    data[target] = temp_target

    print("# AFTER \n Imputation method is 'MODE' for categorical variables!")
    print(" Imputation method is '" + num_method.upper() + "' for numeric variables! \n")
    print(data[variables_with_na].isnull().sum(), "\n\n")

    return data

In [None]:
df = quick_missing_imp(df, num_method="median", cat_length=16)

In [None]:
# Missing cabins are deliberately left as NaN here. The previous
# `fillna(df["Cabin"].mode)` passed the bound method itself (no call), which
# stored method objects in the column and hid the real number of missing cabins.
# Deck / Num / Side are imputed separately once the cabin string is split.
df["Cabin"].isna().sum()

In [None]:
df.drop(['Name'],axis=1,inplace=True)

In [None]:
df.isna().sum()

In [None]:
def rare_analyser(dataframe, target, cat_cols):
    """Print, for each categorical column, the per-level count, share of rows
    and mean of ``target``.

    Intended for spotting rare levels. Returns nothing.
    """
    for name in cat_cols:
        level_counts = dataframe[name].value_counts()
        print(name, ":", len(level_counts))

        report = pd.DataFrame({
            "COUNT": dataframe[name].value_counts(),
            "RATIO": dataframe[name].value_counts() / len(dataframe),
            "TARGET_MEAN": dataframe.groupby(name)[target].mean(),
        })
        print(report, end="\n\n\n")

rare_analyser(df, "Transported", cat_cols)

In [None]:
df.isna().sum()

In [None]:
def Cabines(df):
    """Split each ``Cabin`` entry of the form ``deck/num/side`` into its parts.

    Parameters
    ----------
    df : mapping or DataFrame with a ``'Cabin'`` entry that can be iterated.

    Returns
    -------
    (deck, num, side) : three lists aligned with the input order.
        ``num`` holds ints. Entries whose cabin is not a string (e.g. NaN or
        None) yield ``None`` in all three lists.
    """
    decks, numbers, sides = [], [], []
    for raw_cabin in df['Cabin']:
        if not isinstance(raw_cabin, str):
            # Missing cabin: keep the three lists aligned with placeholders.
            decks.append(None)
            numbers.append(None)
            sides.append(None)
            continue

        pieces = raw_cabin.split('/')
        decks.append(pieces[0])
        numbers.append(int(pieces[1]))
        sides.append(pieces[-1])
    return decks, numbers, sides

deck, num, side = Cabines(df)

In [None]:
cabin = pd.DataFrame({'Deck':deck,'Num':num,'Side':side})
cabin

In [None]:
cabin.isna().sum()

In [None]:
cabin["Deck"].fillna(cabin["Deck"].mode()[0], inplace=True)
cabin["Side"].fillna(cabin["Side"].mode()[0], inplace=True)
cabin["Num"].fillna(cabin["Num"].median(), inplace=True)
cabin["Num"] = cabin["Num"].astype("int")

In [None]:
cabin.isna().sum()

In [None]:
df.drop('Cabin',axis=1,inplace=True)
df = pd.concat([df,cabin],axis=1)
df.tail()

In [None]:
df.info()

In [None]:
df['total_spent']= df['RoomService']+ df['FoodCourt']+ df['ShoppingMall']+ df['Spa']+ df['VRDeck']

In [None]:
# Bucket ages by decade: 0 -> 0-9, 1 -> 10-19, ..., 4 -> 40-49, 5 -> 50 and over.
# The previous loop only assigned groups for ages below 60, so older passengers
# silently kept the initial value 0 and were merged with the children.
# Capping at 5 keeps the same six levels as before; a missing age falls back to 0.
df['AgeGroup'] = (df["Age"] // 10).clip(lower=0, upper=5).fillna(0).astype(int)

In [None]:
target_column = df['Transported']
df.drop("Transported", axis=1, inplace=True)

In [None]:
cat_cols, cat_but_car, num_cols = grab_col_names(df)

def label_encoder(dataframe, binary_col):
    """Replace ``dataframe[binary_col]`` in place with integer label codes.

    A fresh ``LabelEncoder`` is fitted on the column for each call. The same
    (mutated) dataframe is returned for convenience.
    """
    encoded_values = LabelEncoder().fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded_values
    return dataframe

In [None]:
df.head()

In [None]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
    """Return a copy of ``dataframe`` with ``categorical_cols`` one-hot encoded.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is not modified.
    categorical_cols : list of str
        Columns to expand into indicator columns.
    drop_first : bool
        Drop the first level of each column to avoid redundant indicators.
    """
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)
dfd = one_hot_encoder(df, cat_cols, drop_first=True)

In [None]:
binary_cols = [col for col in dfd.columns if dfd[col].dtypes == "O" and len(dfd[col].unique()) == 2]

for col in binary_cols:
    label_encoder(dfd, col)

In [None]:
binary_cols

In [None]:
dfd = pd.concat([dfd, target_column], axis=1)

In [None]:
dfd.info()

In [None]:
dfd.head()

In [None]:
for i in dfd.columns:
    if dfd[i].dtype == bool:  
        dfd[i] = dfd[i].astype(int) 

In [None]:
dfd.head()

In [None]:
dfd[dfd['Transported'].notnull()]

In [None]:
dfd[dfd['Transported'].isnull()]

In [None]:
# Rows with a known target form the training set; the rest is the Kaggle test set.
# Explicit copies make later column assignments on these frames unambiguous,
# instead of writing to a slice of `dfd`.
train = dfd[dfd['Transported'].notnull()].copy()
test = dfd[dfd['Transported'].isnull()].copy()

In [None]:
train['Transported'] = train['Transported'].astype(int)

In [None]:
train.head()

In [None]:
y = train['Transported']
X = train.drop(["PassengerId", "Transported"], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
models = [('LR', LogisticRegression()),
          ('KNN', KNeighborsClassifier()),
          ('CART', DecisionTreeClassifier()),
          ('RF', RandomForestClassifier(n_estimators=200)),
          ('SVC', SVC()),
          ('GBM', GradientBoostingClassifier(n_estimators=200, learning_rate=0.06)),
          ("XGBoost", XGBClassifier(learning_rate=0.06)),
          ("LightGBM", LGBMClassifier(learning_rate=0.06, verbose=0)),
          ("CatBoost", CatBoostClassifier(learning_rate=0.06,verbose=False))]

In [None]:
acclist=[]
for name, model in models:
    acc = np.mean(cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy"))
    acclist.append([name,acc])
    print(f"ACC: {round(acc, 4)} ({name}) ")

In [None]:
train_df = dfd[dfd['Transported'].notnull()]

# `Transported` is a binary class label, not a skewed regression target, so it
# must not be log-transformed. Encode it as 0/1 integers.
target_column = train_df['Transported']
y = target_column.astype(int)

In [None]:
X = train_df.drop(["PassengerId","Transported"], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
catboost = CatBoostClassifier(verbose=False, learning_rate=0.09).fit(X_train, y_train)
y_pred = catboost.predict(X_test)
catboost.score(X_train, y_train)

In [None]:
# The classifier already returns class labels, so there is nothing to
# back-transform. Only flatten in case the labels come back as a column vector.
y_pred = np.asarray(y_pred).ravel()

# Hold-out accuracy: the metric that matches a binary classification target.
accuracy_score(y_test, y_pred)

In [None]:
catboost_model = CatBoostClassifier(verbose=False, random_state=46)

# 5-fold cross-validated accuracy. A squared-error score is not meaningful for
# predicted class labels, so the classifier is scored with accuracy instead.
cv_accuracy = np.mean(cross_val_score(catboost_model, X, y, cv=5, scoring="accuracy"))
cv_accuracy

In [None]:
catboost_params = {"learning_rate": [0.01, 0.04, 0.06, 0.08, 0.1],
               "n_estimators": [200, 400, 500, 1000, 1500]}
catboost_gs_best = GridSearchCV(catboost_model,
                            catboost_params,
                            cv=3,
                            n_jobs=-1,
                            verbose=False).fit(X_train, y_train)

In [None]:
final_model = catboost_model.set_params(**catboost_gs_best.best_params_).fit(X, y)

In [None]:
def plot_importance(model, features, num=None, save=False):
    """Draw a horizontal bar chart of the model's feature importances.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    features : pandas.DataFrame
        Frame whose columns name the features, in the order the model saw them.
    num : int or None
        Number of top features to show; ``None`` (default) shows all of them.
    save : bool
        When true, also write the chart to ``importances.png``.
    """
    feature_imp = pd.DataFrame({"Value": model.feature_importances_, "Feature": features.columns})
    # Slicing with `None` as the upper bound keeps every row.
    top_features = feature_imp.sort_values(by="Value", ascending=False)[0:num]

    plt.figure(figsize=(10, 30))
    sns.set(font_scale=0.5)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title("Features")
    plt.tight_layout()
    # Save before showing: after `plt.show()` the current figure may already be
    # released, in which case `savefig` would write an empty image.
    if save:
        plt.savefig("importances.png")
    plt.show()

In [None]:
model = CatBoostClassifier(verbose=0)
model.fit(X, y)

plot_importance(model, X)

# Final Submission with the Best ML Model

In [None]:
model = CatBoostClassifier(**catboost_gs_best.best_params_, verbose=False)
model.fit(X_train, y_train)
predictions = model.predict(test.drop(["PassengerId","Transported"], axis=1))

In [None]:
model.score(X_train, y_train)

In [None]:
predictions.shape

In [None]:
# Predictions are class labels and need no inverse transform. The hold-out
# labels in `y_test` are left untouched as well, because later evaluation
# cells reuse them and an exponential transform would corrupt them.
predictions = np.asarray(predictions).ravel()

In [None]:
predictions = [True if p >= 0.5 else False for p in predictions]

In [None]:
predictions[:10]

In [None]:
sub=pd.read_csv("/kaggle/input/spaceship-titanic/sample_submission.csv")
sub.head()

In [None]:
sub["Transported"] = predictions
sub.head(20)

In [None]:
sub =sub.set_index("PassengerId")
sub

In [None]:
sub.to_csv("titanic_space_catboost.csv")

# ANN Model

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization

In [None]:
#X_train.shape[1]

In [None]:
model = tf.keras.Sequential()
input_layer = tf.keras.layers.Dense(21, input_shape=(X_train.shape[1],), activation="relu")
model.add(input_layer)

model.add(tf.keras.layers.Dense(128, activation="relu"))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.BatchNormalization())

model.add(tf.keras.layers.Dense(512, activation="relu"))
model.add(tf.keras.layers.Dropout(0.6))
model.add(tf.keras.layers.BatchNormalization())

model.add(tf.keras.layers.Dense(2048, activation="relu"))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.BatchNormalization())

model.add(tf.keras.layers.Dense(1024, activation="relu"))
model.add(tf.keras.layers.Dropout(0.8))
model.add(tf.keras.layers.BatchNormalization())

model.add(tf.keras.layers.Dense(256, activation="relu"))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.BatchNormalization())

model.add(tf.keras.layers.Dense(64, activation="relu"))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.BatchNormalization())

model.add(tf.keras.layers.Dense(8, activation="relu"))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.BatchNormalization())

model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=18,
    restore_best_weights=True)
model.summary()


In [None]:
history = model.fit(X_train, y_train,
                    callbacks=[early_stopping],
                    epochs=100, verbose=1, batch_size=100, validation_split=0.2)

In [None]:
loss_and_metrics = model.evaluate(X_test, y_test)
print(loss_and_metrics)
print('Loss = ',loss_and_metrics[0])
print('Accuracy = ',loss_and_metrics[1])

In [None]:
plt.figure(figsize=(10, 6))

train_loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(train_loss) + 1)

plt.plot(epochs, train_loss, label="Training Loss")
plt.plot(epochs, val_loss, label="Validation Loss")

plt.title("Training and Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

plt.show()

In [None]:
predicted = model.predict(X_test)
predicted = tf.squeeze(predicted)
predicted = np.array([1 if x >= 0.5 else 0 for x in predicted])
actual = np.array(y_test)

In [None]:
predicted

# Final Submission with the ANN Model

In [None]:
test.head()

In [None]:
predicted = model.predict(test.drop(["PassengerId","Transported"], axis=1))
predicted = tf.squeeze(predicted)
predicted = np.array([True if p >= 0.5 else False for p in predicted])
actual = np.array(y_test)
predicted[:10]

In [None]:
sub = pd.read_csv("/kaggle/input/spaceship-titanic/sample_submission.csv")

In [None]:
sub["Transported"] = predicted
sub.head(20)

In [None]:
sub =sub.set_index("PassengerId")
sub

In [None]:
sub.to_csv("titanic_space_ann.csv")

# CNN Model

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
# One "time step" per feature with a single channel. Deriving the length from
# the training matrix keeps the network in sync with the engineered feature set.
input_shape = (X_train.shape[1], 1)
# Single sigmoid output unit for the binary target.
num_classes = 1

In [None]:
model = Sequential([
    Conv1D(128, kernel_size=1, activation='relu', input_shape=input_shape),
    MaxPooling1D(pool_size=2),
    BatchNormalization(),
    Conv1D(256, kernel_size=1, activation='relu'),
    MaxPooling1D(pool_size=2),
    BatchNormalization(),
    Conv1D(512, kernel_size=1, activation='relu'),
    MaxPooling1D(pool_size=2),
    BatchNormalization(),
    Conv1D(1024, kernel_size=1, activation='relu'),
    MaxPooling1D(pool_size=2),
    BatchNormalization(),
    Flatten(),
    Dense(1024, activation='relu'),
    Dropout(0.5),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='sigmoid') 
])


In [None]:
model.compile(optimizer=Adam(learning_rate=0.001), 
              loss=BinaryCrossentropy(),
              metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=16, restore_best_weights=True)

model.summary()

In [None]:
print(X_train.shape)


In [None]:
history = model.fit(X_train, y_train,
                    callbacks=[early_stopping],
                    epochs=200, verbose=1, batch_size=200, validation_split=0.2)

In [None]:
loss_and_metrics = model.evaluate(X_test, y_test)
print(loss_and_metrics)
print('Loss = ',loss_and_metrics[0])
print('Accuracy = ',loss_and_metrics[1])

In [None]:
predicted = model.predict(X_test)
predicted = tf.squeeze(predicted)
predicted = np.array([1 if x >= 0.5 else 0 for x in predicted])
actual = np.array(y_test)

In [None]:
predicted

In [None]:
predicted = model.predict(test.drop(["PassengerId","Transported"], axis=1))
predicted = tf.squeeze(predicted)
predicted = np.array([True if p >= 0.5 else False for p in predicted])
actual = np.array(y_test)
predicted[:10]

In [None]:
sub = pd.read_csv("/kaggle/input/spaceship-titanic/sample_submission.csv")
sub

In [None]:
sub["Transported"] = predicted
sub.head()

In [None]:
sub =sub.set_index("PassengerId")
sub.head(15)

In [None]:
sub.to_csv("titanic_space_cnn.csv")