In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from category_encoders import BinaryEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_log_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
import optuna

plt.style.use('fivethirtyeight')

# Input data files are available in the read-only "/share/dutta/eyao/dataset/kaggle/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the input directory lazily and print one full path per file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/share/dutta/eyao/dataset/kaggle')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory () that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Spaceship Titanic
## Introduction 
- In this notebook we will be building a model for the Kaggle competition Spaceship Titanic. This notebook will go through the steps necessary to enrich your dataset and build a good classification model. In this iteration we will be focusing on a Random Forest Classifier (combined with XGBoost in a voting ensemble); however, you can experiment with others (e.g. Gradient Boosting, Logistic Regression). This notebook will include some commented-out code which will assist you in implementing the other aforementioned models.

## Table of Contents
- [Exploratory Data Analysis (EDA)](#Exploratory-Data-Analysis)
- [Data Preprocessing](#Data-Preprocessing)
    - [Feature Engineering](#Feature-Engineering)
    - [Encoding](#Encoding)
- [Model Building](#Model-Building)
    - [Submission](#Submission)

## Exploratory Data Analysis
- In this section we will take a quick look at our dataset to get a feel for it. Along the way we will devise strategies that we will carry out during the data preprocessing phase.

[🔝](#Table-of-Contents)

In [None]:
# Load the training and test splits of the competition data.
train_csv = '/share/dutta/eyao/dataset/kaggle/spaceship-titanic/train.csv'
test_csv = '/share/dutta/eyao/dataset/kaggle/spaceship-titanic/test.csv'

df = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [None]:
# Preview the first rows of the training frame.
df.head()

From the preview above we notice that we have both categorical and numerical features, so we should think about encoding strategies. We have a few columns that contain purchases, which we could use for feature engineering. Within the categorical columns we can see that many of them are made of parts joined by characters like ' ', '/', '-'. It would be easy to separate these columns to gain better results while encoding.

In [None]:
# Print the column dtypes and non-null counts of the training frame.
df.info()

We can see the shape of our dataset is (8693, 14) which is a nice size for models like Logistic Regression, Gradient Boosting and Decision Trees.

In [None]:
# Pie chart of the target balance.
#
# value_counts() orders its result by frequency, not by value, so the counts
# are reindexed to a fixed [False, True] order. Without this, the hard-coded
# labels and colors below get attached to the wrong wedge whenever `True`
# happens to be the more frequent class.
labels = ['Not Transported', 'Transported']
fate_counts = df['Transported'].value_counts().reindex([False, True], fill_value=0)

fig, ax = plt.subplots(figsize=(10, 6))
fate_counts.plot(kind='pie', ax=ax, labels=labels, autopct='%1.1f%%', colors=['coral', 'teal'])

ax.set_xlabel('Passenger Fate')
ax.set_ylabel('')  # hide the default y-label pandas derives from the series name
ax.legend(title='Fates')

plt.show()

In [None]:
# Pairwise correlations between the int64/float64 columns, drawn as an annotated heatmap.
numeric_df = df.select_dtypes(include=['int64', 'float64'])
corr = numeric_df.corr()

# Custom colormap: coral at the low end, white in the middle, teal at the high end.
colors = ["coral", "white", "teal"]
cmap = LinearSegmentedColormap.from_list("custom_coral_teal", colors)

heatmap_options = dict(
    annot=True,
    fmt=".2f",
    cmap=cmap,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    cbar_kws={'label': 'Correlation coefficient'},
)

plt.figure(figsize=(10, 8))
sns.heatmap(corr, **heatmap_options)

plt.title('Correlation Heatmap')
for rotate_ticks in (plt.xticks, plt.yticks):
    rotate_ticks(rotation=45)
plt.show()


In [None]:
# Count the missing values in each column.
df.isna().sum()

We can see that we have quite a few missing values in almost every column, so we should be thinking about imputation strategies.

In [None]:
# Number of distinct values (a missing value counts as one) in each raw categorical column.
unique_counts = {
    col: len(df[col].unique())
    for col in ('HomePlanet', 'Cabin', 'Destination', 'Name')
}
print(
    f"HomePlanet: {unique_counts['HomePlanet']} \n"
    f"Cabin: {unique_counts['Cabin']}\n"
    f"Destination:{unique_counts['Destination']}\n"
    f"Name:{unique_counts['Name']}"
)

Investigating the categorical columns, we can immediately see that `Name` and `Cabin` would greatly benefit from being split. It isn't necessary to reduce the dimensionality of the other columns, so we will encode them as is.

## Data Preprocessing
- In this stage we will be doing some feature engineering as discussed previously. We will be leaving some steps out here as we will cover them in our machine learning pipeline later. This allows us to tune some of the preprocessors like the imputer.

[🔝](#Table-of-Contents)





### Feature Engineering
The feature engineering steps will help you build the best model possible. The steps we will be doing are the following:
- Splitting Cabin into 3 columns composing of the parts
- Split Name into First Name and Last Name to correlate families
- Calculate a total column that comprises total spent on services

[🔝](#Table-of-Contents)

In [None]:
# Split Cabin on '/' into its three components.
df[['Cabin1', 'Cabin2', 'Cabin3']] = df['Cabin'].str.split('/', expand=True)

# n=1 splits on the first space only, so the result always has exactly two
# columns. An unbounded split produces three or more columns as soon as one
# name has more than two words, and the two-column assignment then raises.
df[['FirstName', 'LastName']] = df['Name'].str.split(' ', n=1, expand=True)

# Total spend across all services; sum() skips NaN, so a missing amount counts as 0.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df['total'] = df[spend_cols].sum(axis=1)

# Quintile bin index of Age, with missing ages filled by the most frequent age first.
age_filled = df['Age'].fillna(df['Age'].mode()[0])
df['AgeBin'] = pd.qcut(age_filled, q=5, labels=False)

# The raw columns are no longer needed once they have been split.
df = df.drop(columns=['Cabin', 'Name'])

df.head()

In [None]:
# Number of distinct values (a missing value counts as one) in each column produced by the splits.
split_counts = {
    col: len(df[col].unique())
    for col in ('Cabin1', 'Cabin2', 'Cabin3', 'FirstName', 'LastName')
}
print(
    f"Cabin1: {split_counts['Cabin1']} \n"
    f"Cabin2: {split_counts['Cabin2']}\n"
    f"Cabin3: {split_counts['Cabin3']}\n"
    f"FirstName: {split_counts['FirstName']}\n"
    f"LastName: {split_counts['LastName']}"
)

We can see that we reduced the dimensionality pretty significantly for all of the columns. Cabin split very well, with the high-cardinality part being a numerical feature. First Name and Last Name are still high cardinality, so we will need to take that into account when choosing an encoding strategy.

In [None]:
# Pull the target out of the feature frame (pop removes the column in place)
# and store it as 0/1 integers.
y = df.pop('Transported').astype(int)

Here we drop the `Transported` column before encoding so that we can later transform our submission dataframe with the same encoder without dimension issues.

### Encoding
- Here we are going to encode our features with the BinaryEncoder. For low-cardinality features it will be pretty similar to one-hot encoding; however, it won't blow up our column count with high-cardinality features. There are a lot of encoders out there, so I would recommend experimenting with some for potential improvements.

[🔝](#Table-of-Contents)

In [None]:
# Binary-encode the categorical columns. The fitted encoder is kept so that
# the test frame can be transformed with it later.
binary_cols = ['FirstName', 'LastName', 'Cabin1', 'Cabin3', 'Destination', 'VIP', 'HomePlanet']
encoder = BinaryEncoder(cols=binary_cols, return_df=True)

# NOTE(review): VIP_1 is discarded right after encoding; the reason is not
# stated in the notebook — confirm this is intentional.
df = encoder.fit_transform(df).drop(columns=['VIP_1'])

df.columns

## Model Building
- In this section we will be building a machine learning pipeline where we perform imputing and scaling before passing the data into our model. This will be paired with a GridSearch to find the optimal parameters. I have commented out the parameters I've tried with other models for future use.

[🔝](#Table-of-Contents)

In [None]:
# scaled_cols = ['Age', 'RoomService', 'FoodCourt']

# Hold out 10% of the rows for evaluation; the seed keeps the split reproducible.
X = df
split_parts = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
# Seed for both estimators (same value as the train/test split) so that a
# re-run reproduces the reported metrics; the Random Forest was unseeded before.
SEED = 42

# Fixed hyperparameters for the two base learners.
xgb_params = {'n_estimators': 248, 'learning_rate': 0.08276477030425759, 'max_depth': 4, 'reg_lambda': 9.144307734410582, 'subsample': 0.9761017636523421}
rf_params = {'n_estimators': 829, 'max_depth': 19, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}

xgb_model = XGBClassifier(**xgb_params, random_state=SEED)
rf_model = RandomForestClassifier(**rf_params, random_state=SEED)

# NOTE(review): the imputer runs before the scaler, so KNN distances are
# computed on unscaled features — confirm this ordering is intended.
pipeline = Pipeline([
    ('impute', KNNImputer(weights='distance', n_neighbors=3)),
    ('scale', MinMaxScaler()),
    # The step keeps its original name 'xgb' although it holds the whole ensemble.
    ('xgb', VotingClassifier(
        estimators=[
            ('xgb', xgb_model),
            ('rf', rf_model)
        ],
        # Soft voting averages the predicted probabilities. With only two
        # estimators, the default hard voting resolves every disagreement to
        # the lower class label (0), and it does not expose predict_proba.
        voting='soft',
    )),
])

best_model = pipeline.fit(X_train, Y_train)

Y_pred = best_model.predict(X_test)
Y_proba = best_model.predict_proba(X_test)[:, 1]  # probability of class 1

accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy}")

precision = precision_score(Y_test, Y_pred)
print(f"Precision: {precision}")

recall = recall_score(Y_test, Y_pred)
print(f"Recall: {recall}")

f1 = f1_score(Y_test, Y_pred)
print(f"F1 Score: {f1}")

roc_auc = roc_auc_score(Y_test, Y_proba)
print(f"ROC-AUC Score: {roc_auc}")

### Submission
- Here we apply all of the preprocessing steps that we performed on the training data. It's important that you perform these steps in the same order. It's also important that you use `transform` with the encoder and not `fit`. We also need to add an all-zero `LastName_11` column, since the encoded test set has fewer columns than the pipeline was fitted on.

[🔝](#Table-of-Contents)

In [None]:
# Re-apply the training-time feature engineering to the test frame, creating
# the new columns in the same order as for the training frame.
df_test[['Cabin1', 'Cabin2', 'Cabin3']] = df_test['Cabin'].str.split('/', expand=True)
# n=1 guarantees exactly two output columns even if a name has extra spaces.
df_test[['FirstName', 'LastName']] = df_test['Name'].str.split(' ', n=1, expand=True)
df_test['total'] = df_test[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)

# Bin test ages with the TRAINING quantile edges and fill value. Re-running
# qcut on the test set would give different edges, so the same AgeBin index
# would mean a different age range than the one the model was trained on.
# `df` (the encoded training frame) still carries the untouched Age column.
train_age = df['Age']
train_age_mode = train_age.mode()[0]
_, age_edges = pd.qcut(train_age.fillna(train_age_mode), q=5, labels=False, retbins=True)
# Open the outer edges so test ages outside the training range still get a bin.
age_edges[0], age_edges[-1] = -np.inf, np.inf
df_test['AgeBin'] = pd.cut(df_test['Age'].fillna(train_age_mode), bins=age_edges, labels=False)

df_test = df_test.drop(columns=['Cabin', 'Name'])
df_test = encoder.transform(df_test)

# Align the test columns with the training columns: same set, same order.
# This drops VIP_1, adds any encoder column absent from the test output
# (e.g. LastName_11) as zeros in its training position, and never overwrites
# a column the encoder did produce. Appending the column at the end instead
# would give the test frame a different column order than the fitted pipeline.
missing_cols = [col for col in df.columns if col not in df_test.columns]
if missing_cols:
    print(f"Columns missing from the encoded test set, filled with 0: {missing_cols}")
df_test = df_test.reindex(columns=df.columns, fill_value=0)

preds = best_model.predict(df_test)

In [None]:
# The predictions are 0/1; convert them to booleans before writing them out.
preds = preds.astype(bool)
df_test['Transported'] = preds

# Keep only the two submission columns and write them without the index.
submission_df = df_test.loc[:, ['PassengerId', 'Transported']]
submission_df.to_csv('submission.csv', index=False)