# Spaceship Titanic Kaggle competition

## Initialisation

In [1]:
import numpy as np
import pandas as pd
import kaggle
import os
from titanic_spaceship_package import dataframe_preprocessing, get_pipeline

In [2]:
# Working directories, relative to the notebook's current working directory.
# Raw competition files (train.csv, test.csv, sample_submission.csv) and the downloaded zip.
REPO_DATA = 'data'
# Cached output of dataframe_preprocessing (X_train.csv, X_test.csv, y_train.csv, list_passenger_id.csv).
REPO_DATA_PREPROCESSED = 'data_preprocessed'
# One "<model_name>.json" per model, read (and updated with the Kaggle score) by make_submission.
REPO_MODEL = 'model'
# One "<model_name>.csv" submission file per model.
REPO_SUBMISSION = 'submission'

# Passed to make_submission as `submit_on_kaggle`: when True, each model cell uploads its submission file.
SUBMIT_ON_KAGGLE = True

In [3]:
# Create every working directory used by the notebook.
# `exist_ok=True` makes the call idempotent (safe under Restart & Run All) and
# removes the check-then-create race of `os.path.exists` followed by `os.mkdir`.
# Unlike the previous check, a regular *file* sitting at one of these paths now
# raises FileExistsError here instead of failing later on the first write.
for repo in (REPO_DATA, REPO_DATA_PREPROCESSED, REPO_MODEL, REPO_SUBMISSION):
    os.makedirs(repo, exist_ok=True)

## Load data

In [4]:
import zipfile

if not os.path.exists(os.path.join(REPO_DATA, 'train.csv')) \
or not os.path.exists(os.path.join(REPO_DATA, 'test.csv')) \
or not os.path.exists(os.path.join(REPO_DATA, 'sample_submission.csv')):
    
    !kaggle competitions download -c spaceship-titanic -p $REPO_DATA
    
    with zipfile.ZipFile(os.path.join(REPO_DATA, 'spaceship-titanic.zip'), 'r') as zip_ref:
        zip_ref.extractall(REPO_DATA)
        
df_train = pd.read_csv(os.path.join(REPO_DATA, 'train.csv'))
df_test = pd.read_csv(os.path.join(REPO_DATA, 'test.csv'))

In [5]:
# Cache file for each artefact returned by dataframe_preprocessing.
x_train_csv = os.path.join(REPO_DATA_PREPROCESSED, "X_train.csv")
x_test_csv = os.path.join(REPO_DATA_PREPROCESSED, "X_test.csv")
y_train_csv = os.path.join(REPO_DATA_PREPROCESSED, "y_train.csv")
passenger_id_csv = os.path.join(REPO_DATA_PREPROCESSED, "list_passenger_id.csv")

cache_is_complete = all(
    os.path.exists(cache_file)
    for cache_file in (x_train_csv, x_test_csv, y_train_csv, passenger_id_csv)
)

if cache_is_complete:

    # Reuse the cached artefacts; the two single-column files are read back as Series.
    X_train = pd.read_csv(x_train_csv)
    X_test = pd.read_csv(x_test_csv)
    y_train = pd.read_csv(y_train_csv)["Transported"]
    list_passenger_id = pd.read_csv(passenger_id_csv)["PassengerId"]

else:

    # At least one cache file is missing: recompute everything and rewrite the whole cache.
    X_train, X_test, y_train, list_passenger_id = dataframe_preprocessing(df_train, df_test)
    X_train.to_csv(x_train_csv, index=False)
    X_test.to_csv(x_test_csv, index=False)
    y_train.to_csv(y_train_csv, index=False)
    list_passenger_id.to_csv(passenger_id_csv, index=False)

## Kaggle submission function

In [6]:
import json
import time

def make_submission(model_name, repo_model, repo_submission, submit_on_kaggle, X_train, y_train, X_test, list_passenger_id):
    
    with open(os.path.join(repo_model, model_name + ".json"), 'r') as file:
        data = json.load(file)
        
    print("Model name: {}".format(data["model_name"]))
    print("Cross validation score: {}".format(data["best_score"]))
    print("Best params: {}".format(data["best_params"]))
    
    pipeline = get_pipeline(model_name=model_name)
    pipeline = pipeline.set_params(**data["best_params"])
    pipeline = pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
        
    submission = pd.DataFrame({'PassengerId': list_passenger_id, 'Transported': y_pred})
    submission.to_csv(os.path.join(repo_submission, model_name + ".csv"), index=False)
    
    if submit_on_kaggle == True:
        
        path = os.path.join(repo_submission, model_name + ".csv")
        !kaggle competitions submit -c spaceship-titanic -f $path -m $model_name
        
        time.sleep(5)
        
        response = !kaggle competitions submissions -c spaceship-titanic --csv
        score_kaggle = float(response[2].split(',')[4])
        
        data["score_kaggle"] = score_kaggle
        
        with open(os.path.join(repo_model, model_name + ".json"), 'w+') as file:
            json.dump(data, file)
        
        print("Kaggle submission score: {}".format(score_kaggle))

## Pipeline v1

### Logistic regression

In [7]:
MODEL_NAME = "logistic_regression__v01"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

make_submission(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    submit_on_kaggle=SUBMIT_ON_KAGGLE, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: logistic_regression__v01
Cross validation score: 0.7794786363867969
Best params: {'logistic__C': 0.01, 'logistic__penalty': 'l2', 'logistic__solver': 'liblinear'}
Successfully submitted to Spaceship Titanic



  0%|          | 0.00/60.5k [00:00<?, ?B/s]
 13%|#3        | 8.00k/60.5k [00:00<00:01, 42.4kB/s]
100%|##########| 60.5k/60.5k [00:02<00:00, 30.8kB/s]


Kaggle submission score: 0.78442


### KNN

In [None]:
MODEL_NAME = "knn__v01"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

make_submission(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    submit_on_kaggle=SUBMIT_ON_KAGGLE, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

### SVM

In [None]:
MODEL_NAME = "svm__v01"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

make_submission(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    submit_on_kaggle=SUBMIT_ON_KAGGLE, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)