# Titanic spaceship kaggle competition

## Initialisation

In [3]:
import numpy as np
import pandas as pd
import kaggle
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [4]:
REPO_DATA = 'data'
REPO_MODEL = 'model'
REPO_SUBMISSION = 'submission'

SUBMIT_ON_KAGGLE = True

In [5]:
if not os.path.exists(REPO_DATA):
    os.mkdir(REPO_DATA)
if not os.path.exists(REPO_MODEL):
    os.mkdir(REPO_MODEL)
if not os.path.exists(REPO_SUBMISSION):
    os.mkdir(REPO_SUBMISSION)

## Load data

In [6]:
import zipfile

if not os.path.exists(os.path.join(REPO_DATA, 'spaceship-titanic.zip')) \
or not os.path.exists(os.path.join(REPO_DATA, 'train.csv')) \
or not os.path.exists(os.path.join(REPO_DATA, 'test.csv')) \
or not os.path.exists(os.path.join(REPO_DATA, 'sample_submission.csv')):
    
    !kaggle competitions download -c spaceship-titanic -p $REPO_DATA
    
    with zipfile.ZipFile(os.path.join(REPO_DATA, 'spaceship-titanic.zip'), 'r') as zip_ref:
        zip_ref.extractall(REPO_DATA)
        
df_train = pd.read_csv(os.path.join(REPO_DATA, 'train.csv'))
df_test = pd.read_csv(os.path.join(REPO_DATA, 'test.csv'))

In [7]:
df_train.head(4)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False


In [8]:
df_test.head(4)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter


In [9]:
print("There is {n_rows_train} rows and {n_cols_train} columns in the train dataframe."
      .format(n_rows_train=df_train.shape[0], n_cols_train=df_train.shape[1]))
print("There is {n_rows_test} rows and {n_cols_test} columns in the test dataframe."
      .format(n_rows_test=df_test.shape[0], n_cols_test=df_test.shape[1]))

There is 8693 rows and 14 columns in the train dataframe.
There is 4277 rows and 13 columns in the test dataframe.


## Preprocessing

### PassengerId

In [10]:
df_train["GroupId"] = df_train.PassengerId.apply(lambda passenger_id: passenger_id.split('_')[0])
df_train["PassengerGroupNumber"] = df_train.PassengerId.apply(lambda passenger_id: passenger_id.split('_')[1])

df_test["GroupId"] = df_test.PassengerId.apply(lambda passenger_id: passenger_id.split('_')[0])
df_test["PassengerGroupNumber"] = df_test.PassengerId.apply(lambda passenger_id: passenger_id.split('_')[1])

### Cabin

In [11]:
df_train["CabinDeck"] = df_train.Cabin.apply(lambda cabin: cabin.split('/')[0] if pd.notna(cabin) else cabin)
df_train["CabinSide"] = df_train.Cabin.apply(lambda cabin: cabin.split('/')[2] if pd.notna(cabin) else cabin)

df_test["CabinDeck"] = df_test.Cabin.apply(lambda cabin: cabin.split('/')[0] if pd.notna(cabin) else cabin)
df_test["CabinSide"] = df_test.Cabin.apply(lambda cabin: cabin.split('/')[2] if pd.notna(cabin) else cabin)

### HomePlanet

In [12]:
group_home_planet_mapping = {row.GroupId: row.HomePlanet for _, row in pd.concat([df_train, df_test]) \
                                                                         .groupby(by=["GroupId"], axis=0, as_index=False)["HomePlanet"] \
                                                                         .first()
                                                                         .iterrows()}

group_home_planet_mapping = {key: value if value != None else np.nan for key, value in group_home_planet_mapping.items()}

df_train["HomePlanet"] = df_train.apply(lambda row: row.HomePlanet if pd.notna(row.HomePlanet) else group_home_planet_mapping.get(row.GroupId, row.HomePlanet), axis=1)
df_test["HomePlanet"] = df_test.apply(lambda row: row.HomePlanet if pd.notna(row.HomePlanet) else group_home_planet_mapping.get(row.GroupId, row.HomePlanet), axis=1)

In [13]:
def complete_missing_home_planet_using_deck(row):
    if pd.notna(row.HomePlanet):
        home_planet = row.HomePlanet
    elif pd.isna(row.CabinDeck):
        home_planet = row.HomePlanet
    elif row.CabinDeck in ['A', 'B', 'C', 'T']:
        home_planet = 'Europa'
    elif row.CabinDeck in ['G']:
        home_planet = 'Earth'
    else:
        home_planet = row.HomePlanet
    return home_planet

df_train.HomePlanet = df_train.apply(complete_missing_home_planet_using_deck, axis=1)
df_test.HomePlanet = df_test.apply(complete_missing_home_planet_using_deck, axis=1)

### Split features and target

In [14]:
X_train = df_train.drop(columns=["PassengerId", "Cabin", "GroupId", "PassengerGroupNumber", "Name", "Transported"])
X_test = df_test.drop(columns=["PassengerId", "Cabin", "GroupId", "PassengerGroupNumber", "Name"])
y_train = df_train["Transported"]
list_passenger_id = df_test["PassengerId"]

## Function model tuning

In [15]:
from sklearn.model_selection import GridSearchCV
import json

def hyperparameter_tuning(pipeline, param_grid, model_name, repo_model, X_train, y_train):
    
    clf = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring="accuracy",
        n_jobs=-1,
        refit=True,
        cv=5
    )
    
    clf = clf.fit(X_train, y_train)
    
    data = {
        "model_name": model_name,
        "best_score": clf.best_score_,
        "best_params": clf.best_params_,
    }
    
    with open(os.path.join(repo_model, model_name + ".json"), "w+") as file:
        json.dump(data, file)

In [46]:
def make_submission(pipeline, model_name, repo_model, repo_submission, submit_on_kaggle, X_train, y_train, X_test, list_passenger_id):
    
    with open(os.path.join(repo_model, model_name + ".json"), 'r') as file:
        data = json.load(file)
        
    print("Model name: {}".format(data["model_name"]))
    print("Cross validation score: {}".format(data["best_score"]))
    print("Best params: {}".format(data["best_params"]))
    
    pipeline = pipeline.set_params(**data["best_params"])
    pipeline = pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    submission = pd.DataFrame({'PassengerId': list_passenger_id, 'Transported': y_pred})
    submission.to_csv(os.path.join(repo_submission, model_name + ".csv"), index=False)
    
    if submit_on_kaggle == True:
        
        path = os.path.join(repo_submission, model_name + ".csv")
        request = !kaggle competitions submit -c spaceship-titanic -f $path -m $model_name
        
        response = !kaggle competitions submissions -c spaceship-titanic --csv
        score_kaggle = float(response[2].split(',')[4])
        
        data["score_kaggle"] = score_kaggle
        
        with open(os.path.join(repo_model, model_name + ".json"), 'w+') as file:
            json.dump(data, file)
        
        print("Kaggle submission score: {}".format(score_kaggle))

## Preprocessor pipeline

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

features_1 = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
features_2 = ["Age"]
features_3 = ["CryoSleep", "VIP"]
features_4 = ["HomePlanet", "Destination", "CabinDeck", "CabinSide"]

transformer_1 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('log', FunctionTransformer(func=lambda x: np.log(x+1))),
    ('scaler', StandardScaler()),
])

transformer_2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('yeo_jonhson', PowerTransformer(method='yeo-johnson', standardize=True)),
])

transformer_3 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=False)),
    ('caster', FunctionTransformer(func=lambda x: np.int64(x))),
])

transformer_4 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

preprocessor = ColumnTransformer(transformers=[
    ('features_1', transformer_1, features_1),
    ('features_2', transformer_2, features_2),
    ('features_3', transformer_3, features_3),
    ('features_4', transformer_4, features_4),
])

## Pipeline

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

def get_pipeline(model_name):

    if model_name == "logistic_regression_v1":
        
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('logistic', LogisticRegression(max_iter=1000, random_state=42)),
        ])
        
    elif model_name == "knn_v1":
        
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('knn', KNeighborsClassifier()),
        ])
        
    elif model_name == "svm_v1":
        
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('svm', SVC(random_state=42)),
        ])
    
    else:
        raise NotImplementedError
        
    return pipeline

## Param grid

In [19]:
def get_param_grid(model_name):
    
    if model_name == "logistic_regression_v1":
        
        param_grid = [
            {
                "logistic__solver": ["liblinear"],
                "logistic__penalty": ["l1", "l2"],
                "logistic__C": list(np.logspace(-5, 5, 31)),
            },
            {
                "logistic__solver": ["saga"],
                "logistic__penalty": ["elasticnet"],
                "logistic__C": list(np.logspace(-5, 5, 31)),
                "logistic__l1_ratio": list(np.linspace(0.1, 0.9, 9)),
            }
        ]
        
    elif model_name == "knn_v1":
        
        param_grid = {
            "knn__n_neighbors": list(map(int, np.linspace(1, 10, 10))),
            "knn__weights": ["uniform", "distance"],
            "knn__p": [1, 2]
        }
        
    elif model_name == "svm_v1":
        
        param_grid = [
            {
                "svm__C": list(np.logspace(-5, 5, 1)), 
                "svm__kernel": ["linear", "rbf", "sigmoid"]
            },
            {
                "svm__C":  list(np.logspace(-5, 5, 1)),
                "svm__kernel": ["poly"],
                "svm__degree": list(map(int, np.linspace(1, 5, 1)))
            }
        ]
        
    else:
        raise NotImplementedError
        
    return param_grid

## Pipeline v1

### Logistic regression

In [18]:
MODEL_NAME = "logistic_regression_v1"

pipeline = get_pipeline(
    model_name=MODEL_NAME
)

param_grid = get_param_grid(
    model_name=MODEL_NAME
)

In [19]:
hyperparameter_tuning(
    pipeline=pipeline,
    param_grid=param_grid,
    model_name=MODEL_NAME,
    repo_model=REPO_MODEL,
    X_train=X_train,
    y_train=y_train
)

In [54]:
make_submission(
    pipeline=pipeline, 
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    submit_on_kaggle=SUBMIT_ON_KAGGLE, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: logistic_regression_v1
Cross validation score: 0.7747624886596068
Best params: {'logistic__C': 0.0031622776601683794, 'logistic__penalty': 'l2', 'logistic__solver': 'liblinear'}
Kaggle submission score: 0.78021


### SVM

In [20]:
MODEL_NAME = "svm_v1"

pipeline = get_pipeline(
    model_name=MODEL_NAME
)

param_grid = get_param_grid(
    model_name=MODEL_NAME
)

In [21]:
hyperparameter_tuning(
    pipeline=pipeline,
    param_grid=param_grid,
    model_name=MODEL_NAME,
    repo_model=REPO_MODEL,
    X_train=X_train,
    y_train=y_train
)

In [47]:
make_submission(
    pipeline=pipeline, 
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    submit_on_kaggle=SUBMIT_ON_KAGGLE, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: svm_v1
Cross validation score: 0.5036235657835443
Best params: {'svm__C': 1e-05, 'svm__kernel': 'linear'}
Kaggle submission score: 0.50689
