# Spaceship Titanic Kaggle competition

## Initialisation

In [1]:
import numpy as np
import pandas as pd
import kaggle
import os
import json
import time
from titanic_spaceship_package import dataframe_preprocessing, get_pipeline

In [2]:
# Working directories, relative to the notebook.
REPO_DATA = "data"                            # raw competition files (train/test/sample_submission CSVs)
REPO_DATA_PREPROCESSED = "data_preprocessed"  # cached output of dataframe_preprocessing
REPO_MODEL = "model"                          # one "<model_name>.json" tuning result per model
REPO_SUBMISSION = "submission"                # one "<model_name>.csv" prediction file per model

In [3]:
# Make sure every working directory exists before anything is read or written.
# os.makedirs(..., exist_ok=True) checks and creates in a single call, so there
# is no gap between a separate "exists?" test and the creation.
for _repo in (REPO_DATA, REPO_DATA_PREPROCESSED, REPO_MODEL, REPO_SUBMISSION):
    os.makedirs(_repo, exist_ok=True)

## Load data

In [4]:
import zipfile

if not os.path.exists(os.path.join(REPO_DATA, 'train.csv')) \
or not os.path.exists(os.path.join(REPO_DATA, 'test.csv')) \
or not os.path.exists(os.path.join(REPO_DATA, 'sample_submission.csv')):
    
    !kaggle competitions download -c spaceship-titanic -p $REPO_DATA
    
    with zipfile.ZipFile(os.path.join(REPO_DATA, 'spaceship-titanic.zip'), 'r') as zip_ref:
        zip_ref.extractall(REPO_DATA)
        
df_train = pd.read_csv(os.path.join(REPO_DATA, 'train.csv'))
df_test = pd.read_csv(os.path.join(REPO_DATA, 'test.csv'))

In [5]:
# Reuse the cached preprocessing outputs when all four files exist;
# otherwise run the preprocessing once and write the cache.
if all(
    os.path.exists(os.path.join(REPO_DATA_PREPROCESSED, cached_name))
    for cached_name in ("X_train.csv", "X_test.csv", "y_train.csv", "list_passenger_id.csv")
):

    X_train = pd.read_csv(os.path.join(REPO_DATA_PREPROCESSED, "X_train.csv"))
    X_test = pd.read_csv(os.path.join(REPO_DATA_PREPROCESSED, "X_test.csv"))
    # The target and the passenger ids were saved as one-column CSVs; pull that column back out.
    y_train = pd.read_csv(os.path.join(REPO_DATA_PREPROCESSED, "y_train.csv"))["Transported"]
    list_passenger_id = pd.read_csv(os.path.join(REPO_DATA_PREPROCESSED, "list_passenger_id.csv"))["PassengerId"]

else:

    X_train, X_test, y_train, list_passenger_id = dataframe_preprocessing(df_train, df_test)

    X_train.to_csv(os.path.join(REPO_DATA_PREPROCESSED, "X_train.csv"), index=False)
    X_test.to_csv(os.path.join(REPO_DATA_PREPROCESSED, "X_test.csv"), index=False)
    y_train.to_csv(os.path.join(REPO_DATA_PREPROCESSED, "y_train.csv"), index=False)
    list_passenger_id.to_csv(os.path.join(REPO_DATA_PREPROCESSED, "list_passenger_id.csv"), index=False)

## Kaggle submission function

In [6]:
def _latest_public_score(lines):
    """Extract the public score of the first listed submission from a Kaggle CSV listing.

    ``lines`` is the output of ``kaggle competitions submissions --csv`` split
    into lines. The header row is found through its ``publicScore`` column, so
    extra lines printed before it do not shift the result; the score is taken
    from the first data row after the header (presumably the newest
    submission -- TODO confirm the CLI ordering). If no such header is present,
    the original positional rule is used: third line, fifth comma-separated field.

    Raises ``ValueError`` when the score field is not a number and ``IndexError``
    when the listing has no data row.
    """
    import csv

    for position, line in enumerate(lines):
        if "publicScore" in line.split(","):
            rows = list(csv.DictReader(lines[position:]))
            return float(rows[0]["publicScore"])

    return float(lines[2].split(',')[4])


def analyze_model(model_name, repo_model, repo_submission, X_train, y_train, X_test, list_passenger_id):
    """Print the saved tuning summary of a model and make sure it has a Kaggle score.

    Reads ``<repo_model>/<model_name>.json`` and prints the stored model name,
    cross-validation score, best parameters and tuning time. If the file holds
    no ``score_kaggle`` value, the pipeline returned by ``get_pipeline`` is set
    to the stored best parameters, fitted on the training data and used to
    predict the test set. The predictions are written to
    ``<repo_submission>/<model_name>.csv`` and submitted with the ``kaggle``
    command-line tool; the score read back from the submission listing is then
    saved into the JSON file. The Kaggle score is printed in both cases.

    Parameters
    ----------
    model_name : str
        Base name of the JSON result file and of the submission CSV; also used
        as the submission message and passed to ``get_pipeline``.
    repo_model : str
        Directory containing ``<model_name>.json``.
    repo_submission : str
        Directory in which the submission CSV is written.
    X_train, y_train
        Training features and target handed to ``pipeline.fit``.
    X_test
        Test features handed to ``pipeline.predict``.
    list_passenger_id
        Values for the ``PassengerId`` column of the submission file.

    Raises
    ------
    subprocess.CalledProcessError
        If a ``kaggle`` command exits with a non-zero status.
    """
    # Local import so the function does not depend on the notebook's import cell.
    import subprocess

    path_json = os.path.join(repo_model, model_name + ".json")

    with open(path_json, 'r') as file:
        data = json.load(file)

    print("Model name: {}".format(data.get("model_name")))
    print("Cross validation score: {:.5f}".format(data.get("best_score")))
    print("Best params: {}".format(data.get("best_params")))
    print("Time to tune: {} seconds".format(int(data.get("time_to_tune"))))

    score_kaggle = data.get("score_kaggle")

    if score_kaggle is None:

        pipeline = get_pipeline(model_name=model_name)
        pipeline = pipeline.set_params(**data["best_params"])
        pipeline = pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        path = os.path.join(repo_submission, model_name + ".csv")
        submission = pd.DataFrame({'PassengerId': list_passenger_id, 'Transported': y_pred})
        submission.to_csv(path, index=False)

        # An argument list (no shell) keeps the path and message intact whatever
        # characters they contain; the previous shell line also carried a stray
        # closing quote after the message.
        subprocess.run(
            ["kaggle", "competitions", "submit", "-c", "spaceship-titanic", "-f", path, "-m", model_name],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )

        # Leave Kaggle a few seconds before asking for the listing.
        time.sleep(5)

        listing = subprocess.run(
            ["kaggle", "competitions", "submissions", "-c", "spaceship-titanic", "--csv"],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )
        score_kaggle = _latest_public_score(listing.stdout.splitlines())

        data["score_kaggle"] = score_kaggle

        # Persist the score so the model is not submitted again on the next run.
        with open(path_json, 'w') as file:
            json.dump(data, file)

    print("Kaggle submission score: {}".format(score_kaggle))

## Basic features

### Pipeline v1

#### Logistic regression

In [18]:
MODEL_NAME = "logistic_regression__v01"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME
    
analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: logistic_regression__v01
Cross validation score: 0.77948
Best params: {'logistic__C': 0.01, 'logistic__penalty': 'l2', 'logistic__solver': 'liblinear'}
Time to tune: 1476 seconds
Kaggle submission score: 0.78442


#### KNN

In [19]:
MODEL_NAME = "knn__v01"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: knn__v01
Cross validation score: 0.78880
Best params: {'knn__n_neighbors': 47, 'knn__p': 2, 'knn__weights': 'uniform'}
Time to tune: 5311 seconds
Kaggle submission score: 0.78793


#### SVM

In [20]:
MODEL_NAME = "svm__v01"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: svm__v01
Cross validation score: 0.80053
Best params: {'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}
Time to tune: 2623 seconds
Kaggle submission score: 0.80032


#### GNB

In [21]:
MODEL_NAME = "gnb__v01"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: gnb__v01
Cross validation score: 0.75774
Best params: {'gnb__var_smoothing': 0.811130830789689}
Time to tune: 42 seconds
Kaggle submission score: 0.77016


### Pipeline v2

#### Logistic regression

In [14]:
MODEL_NAME = "logistic_regression__v02"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: logistic_regression__v02
Cross validation score: 0.78155
Best params: {'feature_selection__k': 15, 'logistic__C': 0.021544346900318822, 'logistic__l1_ratio': 0.1, 'logistic__penalty': 'elasticnet', 'logistic__solver': 'saga'}
Time to tune: 12903 seconds
Kaggle submission score: 0.78349


#### KNN

In [15]:
MODEL_NAME = "knn__v02"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: knn__v02
Cross validation score: 0.79029
Best params: {'feature_selection__k': 18, 'knn__n_neighbors': 31, 'knn__p': 1, 'knn__weights': 'uniform'}
Time to tune: 64065 seconds
Kaggle submission score: 0.78442


#### SVM

In [16]:
MODEL_NAME = "svm__v02"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: svm__v02
Cross validation score: 0.80122
Best params: {'feature_selection__k': 20, 'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}
Time to tune: 307954 seconds
Kaggle submission score: 0.79892


#### GNB

In [17]:
MODEL_NAME = "gnb__v02"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: gnb__v02
Cross validation score: 0.76993
Best params: {'feature_selection__k': 6, 'gnb__var_smoothing': 1.0}
Time to tune: 1177 seconds
Kaggle submission score: 0.76712


### Pipeline v3

#### Logistic regression

In [11]:
MODEL_NAME = "logistic_regression__v03"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: logistic_regression__v03
Cross validation score: 0.78167
Best params: {'feature_selection__k': 16, 'logistic__C': 0.21544346900318823, 'logistic__l1_ratio': 0.2, 'logistic__penalty': 'elasticnet', 'logistic__solver': 'saga'}
Time to tune: 57133 seconds
Kaggle submission score: 0.77928


#### KNN

In [12]:
MODEL_NAME = "knn__v03"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: knn__v03
Cross validation score: 0.79593
Best params: {'feature_selection__k': 13, 'knn__n_neighbors': 48, 'knn__p': 1, 'knn__weights': 'uniform'}
Time to tune: 75335 seconds
Kaggle submission score: 0.78887


#### SVM

In [None]:
MODEL_NAME = "svm__v03"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

#### GNB

In [13]:
MODEL_NAME = "gnb__v03"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: gnb__v03
Cross validation score: 0.76579
Best params: {'feature_selection__k': 4, 'gnb__var_smoothing': 1.0}
Time to tune: 13949 seconds
Kaggle submission score: 0.7739


### Pipeline v4

#### Logistic regression

In [7]:
MODEL_NAME = "logistic_regression__v04"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: logistic_regression__v04
Cross validation score: 0.78063
Best params: {'feature_selection__k': 24, 'logistic__C': 0.01, 'logistic__penalty': 'l2', 'logistic__solver': 'liblinear'}
Time to tune: 58302 seconds
Kaggle submission score: 0.78512


#### KNN

In [8]:
MODEL_NAME = "knn__v04"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: knn__v04
Cross validation score: 0.79501
Best params: {'feature_selection__k': 18, 'knn__n_neighbors': 33, 'knn__p': 1, 'knn__weights': 'uniform'}
Time to tune: 105862 seconds
Kaggle submission score: 0.78723


#### SVM

In [9]:
MODEL_NAME = "svm__v04"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: svm__v04
Cross validation score: 0.80019
Best params: {'feature_selection__k': 21, 'svm__C': 4.6415888336127775, 'svm__degree': 2, 'svm__kernel': 'poly'}
Time to tune: 226415 seconds
Kaggle submission score: 0.79448


#### GNB

In [10]:
MODEL_NAME = "gnb__v04"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: gnb__v04
Cross validation score: 0.76648
Best params: {'feature_selection__k': 4, 'gnb__var_smoothing': 0.5336699231206302}
Time to tune: 14452 seconds
Kaggle submission score: 0.77367


### Pipeline v5

#### Logistic regression

In [22]:
MODEL_NAME = "logistic_regression__v05"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: logistic_regression__v05
Cross validation score: 0.78075
Best params: {'feature_selection__k': 12, 'logistic__C': 2154.434690031878, 'logistic__l1_ratio': 0.5, 'logistic__penalty': 'elasticnet', 'logistic__solver': 'saga'}
Time to tune: 59880 seconds
Kaggle submission score: 0.77671


#### KNN

In [23]:
MODEL_NAME = "knn__v05"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: knn__v05
Cross validation score: 0.79248
Best params: {'feature_selection__k': 11, 'knn__n_neighbors': 37, 'knn__p': 1, 'knn__weights': 'uniform'}
Time to tune: 91760 seconds
Kaggle submission score: 0.78021


#### SVM

In [24]:
MODEL_NAME = "svm__v05"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: svm__v05
Cross validation score: 0.80191
Best params: {'feature_selection__k': 23, 'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}
Time to tune: 180134 seconds
Kaggle submission score: 0.79939


#### GNB

In [25]:
MODEL_NAME = "gnb__v05"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: gnb__v05
Cross validation score: 0.76648
Best params: {'feature_selection__k': 4, 'gnb__var_smoothing': 0.5336699231206302}
Time to tune: 14887 seconds
Kaggle submission score: 0.77367


### Pipeline v6

#### Logistic regression

In [26]:
MODEL_NAME = "logistic_regression__v06"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: logistic_regression__v06
Cross validation score: 0.78040
Best params: {'feature_selection__k': 15, 'logistic__C': 4641.588833612773, 'logistic__l1_ratio': 0.8, 'logistic__penalty': 'elasticnet', 'logistic__solver': 'saga'}
Time to tune: 61262 seconds
Kaggle submission score: 0.78255


#### KNN

In [27]:
MODEL_NAME = "knn__v06"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: knn__v06
Cross validation score: 0.79340
Best params: {'feature_selection__k': 13, 'knn__n_neighbors': 49, 'knn__p': 1, 'knn__weights': 'uniform'}
Time to tune: 110870 seconds
Kaggle submission score: 0.78559


#### SVM

In [28]:
MODEL_NAME = "svm__v06"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: svm__v06
Cross validation score: 0.80065
Best params: {'feature_selection__k': 24, 'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}
Time to tune: 209248 seconds
Kaggle submission score: 0.79939


#### GNB

In [29]:
MODEL_NAME = "gnb__v06"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: gnb__v06
Cross validation score: 0.76637
Best params: {'feature_selection__k': 4, 'gnb__var_smoothing': 0.6579332246575682}
Time to tune: 14559 seconds
Kaggle submission score: 0.77437


### Pipeline v7

#### Logistic regression

In [30]:
MODEL_NAME = "logistic_regression__v07"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: logistic_regression__v07
Cross validation score: 0.78074
Best params: {'feature_selection__k': 15, 'logistic__C': 10000.0, 'logistic__penalty': 'l2', 'logistic__solver': 'liblinear'}
Time to tune: 62178 seconds
Kaggle submission score: 0.78209


#### KNN

In [31]:
MODEL_NAME = "knn__v07"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: knn__v07
Cross validation score: 0.79363
Best params: {'feature_selection__k': 16, 'knn__n_neighbors': 51, 'knn__p': 1, 'knn__weights': 'uniform'}
Time to tune: 113486 seconds
Kaggle submission score: 0.78863


#### SVM

In [32]:
MODEL_NAME = "svm__v07"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: svm__v07
Cross validation score: 0.80122
Best params: {'feature_selection__k': 24, 'svm__C': 1.0, 'svm__degree': 3, 'svm__kernel': 'poly'}
Time to tune: 220124 seconds
Kaggle submission score: 0.80009


#### GNB

In [33]:
MODEL_NAME = "gnb__v07"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: gnb__v07
Cross validation score: 0.76648
Best params: {'feature_selection__k': 4, 'gnb__var_smoothing': 0.5336699231206302}
Time to tune: 15604 seconds
Kaggle submission score: 0.77367


### Pipeline v8

#### Logistic regression

In [34]:
MODEL_NAME = "logistic_regression__v08"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: logistic_regression__v08
Cross validation score: 0.77948
Best params: {'feature_selection__max_features': 23, 'logistic__C': 0.01, 'logistic__penalty': 'l2', 'logistic__solver': 'liblinear'}
Time to tune: 42744 seconds
Kaggle submission score: 0.78419


#### KNN

In [35]:
MODEL_NAME = "knn__v08"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: knn__v08
Cross validation score: 0.79121
Best params: {'feature_selection__max_features': 11, 'knn__n_neighbors': 63, 'knn__p': 1, 'knn__weights': 'uniform'}
Time to tune: 83657 seconds
Kaggle submission score: 0.78349


#### SVM

In [36]:
MODEL_NAME = "svm__v08"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: svm__v08
Cross validation score: 0.80168
Best params: {'feature_selection__max_features': 25, 'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}
Time to tune: 97146 seconds
Kaggle submission score: 0.79962


#### GNB

In [37]:
MODEL_NAME = "gnb__v08"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: gnb__v08
Cross validation score: 0.76717
Best params: {'feature_selection__max_features': 4, 'gnb__var_smoothing': 0.3511191734215127}
Time to tune: 12574 seconds
Kaggle submission score: 0.77484


### Pipeline v9

#### Logistic regression

In [38]:
MODEL_NAME = "logistic_regression__v09"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: logistic_regression__v09
Cross validation score: 0.77994
Best params: {'feature_selection__max_features': 19, 'logistic__C': 0.021544346900318822, 'logistic__l1_ratio': 0.4, 'logistic__penalty': 'elasticnet', 'logistic__solver': 'saga'}
Time to tune: 13259 seconds
Kaggle submission score: 0.78115


#### KNN

In [39]:
MODEL_NAME = "knn__v09"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: knn__v09
Cross validation score: 0.79018
Best params: {'feature_selection__max_features': 27, 'knn__n_neighbors': 47, 'knn__p': 2, 'knn__weights': 'uniform'}
Time to tune: 20887 seconds
Kaggle submission score: 0.7877


#### SVM

In [40]:
MODEL_NAME = "svm__v09"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: svm__v09
Cross validation score: 0.80053
Best params: {'feature_selection__max_features': 26, 'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}
Time to tune: 185765 seconds
Kaggle submission score: 0.79939


#### GNB

In [41]:
MODEL_NAME = "gnb__v09"

if not os.path.exists(os.path.join(REPO_MODEL, MODEL_NAME + ".json")):
    !python script_tuning.py $MODEL_NAME

analyze_model(
    model_name=MODEL_NAME, 
    repo_model=REPO_MODEL, 
    repo_submission=REPO_SUBMISSION, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    list_passenger_id=list_passenger_id
)

Model name: gnb__v09
Cross validation score: 0.77062
Best params: {'feature_selection__max_features': 7, 'gnb__var_smoothing': 0.811130830789689}
Time to tune: 1013 seconds
Kaggle submission score: 0.77718


## Analysis

### By model

In [42]:
# Zero-padded version tags "01".."09", as used in the "__vNN" suffix of the model names.
VERSIONS = ["{:02d}".format(number) for number in range(1, 10)]

# Column order of the per-model result tables.
COLUMNS = [
    "Version",
    "Description",
    "Best_score",
    "Time_to_tune",
    "Score_kaggle",
    "Best_params",
]

# Human-readable description of each pipeline version, keyed by version tag.
DESCRIPTION_VERSIONS = {
    "01": "Basic feature",
    "02": "Basic feature with feature selection f_classif",
    "03": "Basic feature with feature selection MI n=1",
    "04": "Basic feature with feature selection MI n=2",
    "05": "Basic feature with feature selection MI n=3",
    "06": "Basic feature with feature selection MI n=4",
    "07": "Basic feature with feature selection MI n=5",
    "08": "Basic feature with feature selection from rf",
    "09": "Basic feature with feature selection from ridge",
}

#### Logistic regression

In [43]:
NAME_MODEL = "logistic_regression"

# Gather one record per version that has a saved result file, then build the
# table in a single step instead of growing it with pd.concat inside the loop.
records = []
for version in VERSIONS:
    path_json = os.path.join(REPO_MODEL, NAME_MODEL + "__v" + version + ".json")
    if not os.path.exists(path_json):
        continue

    with open(path_json, "r") as file:
        data = json.load(file)

    records.append({
        "Version": version,
        "Description": DESCRIPTION_VERSIONS.get(version),
        "Best_score": data["best_score"],
        "Time_to_tune": int(data["time_to_tune"]),
        # .get: a result file without a "score_kaggle" entry yields an empty cell, not a KeyError.
        "Score_kaggle": data.get("score_kaggle"),
        "Best_params": data["best_params"],
    })

df_results = pd.DataFrame(records, columns=COLUMNS)

# Highlight in yellow the row(s) holding the best cross-validation score.
best_score = df_results["Best_score"].max()
df_results.style.apply(
    lambda row: [
        'background-color: yellow' if row.Best_score == best_score else 'background-color: white'
    ] * len(row),
    axis=1,
)

Unnamed: 0,Version,Description,Best_score,Time_to_tune,Score_kaggle,Best_params
0,1,Basic feature,0.779479,1476,0.78442,"{'logistic__C': 0.01, 'logistic__penalty': 'l2', 'logistic__solver': 'liblinear'}"
1,2,Basic feature with feature selection f_classif,0.781549,12903,0.78349,"{'feature_selection__k': 15, 'logistic__C': 0.021544346900318822, 'logistic__l1_ratio': 0.1, 'logistic__penalty': 'elasticnet', 'logistic__solver': 'saga'}"
2,3,Basic feature with feature selection MI n=1,0.781665,57133,0.77928,"{'feature_selection__k': 16, 'logistic__C': 0.21544346900318823, 'logistic__l1_ratio': 0.2, 'logistic__penalty': 'elasticnet', 'logistic__solver': 'saga'}"
3,4,Basic feature with feature selection MI n=2,0.780629,58302,0.78512,"{'feature_selection__k': 24, 'logistic__C': 0.01, 'logistic__penalty': 'l2', 'logistic__solver': 'liblinear'}"
4,5,Basic feature with feature selection MI n=3,0.780745,59880,0.77671,"{'feature_selection__k': 12, 'logistic__C': 2154.434690031878, 'logistic__l1_ratio': 0.5, 'logistic__penalty': 'elasticnet', 'logistic__solver': 'saga'}"
5,6,Basic feature with feature selection MI n=4,0.7804,61262,0.78255,"{'feature_selection__k': 15, 'logistic__C': 4641.588833612773, 'logistic__l1_ratio': 0.8, 'logistic__penalty': 'elasticnet', 'logistic__solver': 'saga'}"
6,7,Basic feature with feature selection MI n=5,0.780745,62178,0.78209,"{'feature_selection__k': 15, 'logistic__C': 10000.0, 'logistic__penalty': 'l2', 'logistic__solver': 'liblinear'}"
7,8,Basic feature with feature selection from rf,0.779479,42744,0.78419,"{'feature_selection__max_features': 23, 'logistic__C': 0.01, 'logistic__penalty': 'l2', 'logistic__solver': 'liblinear'}"
8,9,Basic feature with feature selection from ridge,0.779939,13259,0.78115,"{'feature_selection__max_features': 19, 'logistic__C': 0.021544346900318822, 'logistic__l1_ratio': 0.4, 'logistic__penalty': 'elasticnet', 'logistic__solver': 'saga'}"


#### KNN

In [44]:
NAME_MODEL = "knn"

# Gather one record per version that has a saved result file, then build the
# table in a single step instead of growing it with pd.concat inside the loop.
records = []
for version in VERSIONS:
    path_json = os.path.join(REPO_MODEL, NAME_MODEL + "__v" + version + ".json")
    if not os.path.exists(path_json):
        continue

    with open(path_json, "r") as file:
        data = json.load(file)

    records.append({
        "Version": version,
        "Description": DESCRIPTION_VERSIONS.get(version),
        "Best_score": data["best_score"],
        "Time_to_tune": int(data["time_to_tune"]),
        # .get: a result file without a "score_kaggle" entry yields an empty cell, not a KeyError.
        "Score_kaggle": data.get("score_kaggle"),
        "Best_params": data["best_params"],
    })

df_results = pd.DataFrame(records, columns=COLUMNS)

# Highlight in yellow the row(s) holding the best cross-validation score.
best_score = df_results["Best_score"].max()
df_results.style.apply(
    lambda row: [
        'background-color: yellow' if row.Best_score == best_score else 'background-color: white'
    ] * len(row),
    axis=1,
)

Unnamed: 0,Version,Description,Best_score,Time_to_tune,Score_kaggle,Best_params
0,1,Basic feature,0.788799,5311,0.78793,"{'knn__n_neighbors': 47, 'knn__p': 2, 'knn__weights': 'uniform'}"
1,2,Basic feature with feature selection f_classif,0.790294,64065,0.78442,"{'feature_selection__k': 18, 'knn__n_neighbors': 31, 'knn__p': 1, 'knn__weights': 'uniform'}"
2,3,Basic feature with feature selection MI n=1,0.795932,75335,0.78887,"{'feature_selection__k': 13, 'knn__n_neighbors': 48, 'knn__p': 1, 'knn__weights': 'uniform'}"
3,4,Basic feature with feature selection MI n=2,0.795009,105862,0.78723,"{'feature_selection__k': 18, 'knn__n_neighbors': 33, 'knn__p': 1, 'knn__weights': 'uniform'}"
4,5,Basic feature with feature selection MI n=3,0.792481,91760,0.78021,"{'feature_selection__k': 11, 'knn__n_neighbors': 37, 'knn__p': 1, 'knn__weights': 'uniform'}"
5,6,Basic feature with feature selection MI n=4,0.7934,110870,0.78559,"{'feature_selection__k': 13, 'knn__n_neighbors': 49, 'knn__p': 1, 'knn__weights': 'uniform'}"
6,7,Basic feature with feature selection MI n=5,0.793631,113486,0.78863,"{'feature_selection__k': 16, 'knn__n_neighbors': 51, 'knn__p': 1, 'knn__weights': 'uniform'}"
7,8,Basic feature with feature selection from rf,0.791214,83657,0.78349,"{'feature_selection__max_features': 11, 'knn__n_neighbors': 63, 'knn__p': 1, 'knn__weights': 'uniform'}"
8,9,Basic feature with feature selection from ridge,0.790179,20887,0.7877,"{'feature_selection__max_features': 27, 'knn__n_neighbors': 47, 'knn__p': 2, 'knn__weights': 'uniform'}"


#### SVM 

In [45]:
# Summarise every tuned SVM run: one row per version whose result file exists
# in REPO_MODEL. Versions without a saved JSON are skipped silently.
NAME_MODEL = "svm"

# Collect plain records and build the frame once. Concatenating onto an empty
# DataFrame inside the loop copies the frame on every iteration and makes
# recent pandas versions warn about empty entries in concat.
summary_rows = []
for version in VERSIONS:
    result_path = os.path.join(REPO_MODEL, NAME_MODEL + "__v" + version + ".json")
    if not os.path.exists(result_path):
        continue
    with open(result_path, "r") as file:
        data = json.load(file)
    summary_rows.append({
        "Version": version,
        "Description": DESCRIPTION_VERSIONS.get(version),
        "Best_score": data["best_score"],
        "Time_to_tune": int(data["time_to_tune"]),  # shown as a whole number
        "Score_kaggle": data["score_kaggle"],
        "Best_params": data["best_params"],
    })

# Building from a list of records already yields a clean 0..n-1 index, so no
# in-place reset_index is needed.
df_results = pd.DataFrame(summary_rows, columns=COLUMNS)

# Highlight the row(s) with the highest Best_score; the maximum is computed
# once instead of once per styled row.
top_score = df_results["Best_score"].max()
df_results.style.apply(
    lambda row: ['background-color: yellow' if row.Best_score == top_score
                 else 'background-color: white'] * df_results.shape[1],
    axis=1,
)

Unnamed: 0,Version,Description,Best_score,Time_to_tune,Score_kaggle,Best_params
0,1,Basic feature,0.800532,2623,0.80032,"{'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}"
1,2,Basic feature with feature selection f_classif,0.801222,307954,0.79892,"{'feature_selection__k': 20, 'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}"
2,4,Basic feature with feature selection MI n=2,0.800187,226415,0.79448,"{'feature_selection__k': 21, 'svm__C': 4.6415888336127775, 'svm__degree': 2, 'svm__kernel': 'poly'}"
3,5,Basic feature with feature selection MI n=3,0.801912,180134,0.79939,"{'feature_selection__k': 23, 'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}"
4,6,Basic feature with feature selection MI n=4,0.800647,209248,0.79939,"{'feature_selection__k': 24, 'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}"
5,7,Basic feature with feature selection MI n=5,0.801223,220124,0.80009,"{'feature_selection__k': 24, 'svm__C': 1.0, 'svm__degree': 3, 'svm__kernel': 'poly'}"
6,8,Basic feature with feature selection from rf,0.801682,97146,0.79962,"{'feature_selection__max_features': 25, 'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}"
7,9,Basic feature with feature selection from ridge,0.800532,185765,0.79939,"{'feature_selection__max_features': 26, 'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}"


#### GNB

In [46]:
# Summarise every tuned GNB run: one row per version whose result file exists
# in REPO_MODEL. Versions without a saved JSON are skipped silently.
NAME_MODEL = "gnb"

# Gather the rows first and create the DataFrame in a single step; growing it
# with pd.concat from an empty frame is quadratic and triggers the pandas
# warning about concatenating empty entries.
summary_rows = []
for version in VERSIONS:
    result_path = os.path.join(REPO_MODEL, NAME_MODEL + "__v" + version + ".json")
    if not os.path.exists(result_path):
        continue
    with open(result_path, "r") as file:
        data = json.load(file)
    summary_rows.append({
        "Version": version,
        "Description": DESCRIPTION_VERSIONS.get(version),
        "Best_score": data["best_score"],
        "Time_to_tune": int(data["time_to_tune"]),  # shown as a whole number
        "Score_kaggle": data["score_kaggle"],
        "Best_params": data["best_params"],
    })

# A frame built from records has a fresh RangeIndex, so reset_index is not
# required.
df_results = pd.DataFrame(summary_rows, columns=COLUMNS)

# Highlight the row(s) with the highest Best_score (maximum computed once).
top_score = df_results["Best_score"].max()
df_results.style.apply(
    lambda row: ['background-color: yellow' if row.Best_score == top_score
                 else 'background-color: white'] * df_results.shape[1],
    axis=1,
)

Unnamed: 0,Version,Description,Best_score,Time_to_tune,Score_kaggle,Best_params
0,1,Basic feature,0.757738,42,0.77016,{'gnb__var_smoothing': 0.811130830789689}
1,2,Basic feature with feature selection f_classif,0.769933,1177,0.76712,"{'feature_selection__k': 6, 'gnb__var_smoothing': 1.0}"
2,3,Basic feature with feature selection MI n=1,0.765792,13949,0.7739,"{'feature_selection__k': 4, 'gnb__var_smoothing': 1.0}"
3,4,Basic feature with feature selection MI n=2,0.766482,14452,0.77367,"{'feature_selection__k': 4, 'gnb__var_smoothing': 0.5336699231206302}"
4,5,Basic feature with feature selection MI n=3,0.766482,14887,0.77367,"{'feature_selection__k': 4, 'gnb__var_smoothing': 0.5336699231206302}"
5,6,Basic feature with feature selection MI n=4,0.766367,14559,0.77437,"{'feature_selection__k': 4, 'gnb__var_smoothing': 0.6579332246575682}"
6,7,Basic feature with feature selection MI n=5,0.766482,15604,0.77367,"{'feature_selection__k': 4, 'gnb__var_smoothing': 0.5336699231206302}"
7,8,Basic feature with feature selection from rf,0.767173,12574,0.77484,"{'feature_selection__max_features': 4, 'gnb__var_smoothing': 0.3511191734215127}"
8,9,Basic feature with feature selection from ridge,0.770624,1013,0.77718,"{'feature_selection__max_features': 7, 'gnb__var_smoothing': 0.811130830789689}"


### By version

In [47]:
# Column layout of the per-version comparison tables.
# NOTE: this rebinds COLUMNS, which the per-model summary cells also read, so
# those cells need their own COLUMNS definition re-run before being executed
# again after this one.
COLUMNS = [
    "Name_model",
    "Best_score",
    "Time_to_tune",
    "Score_kaggle",
    "Best_params",
]

# Model identifiers; each one is the prefix of a "<model>__v<version>.json"
# result file looked up by the cells below.
MODELS = [
    "logistic_regression",
    "knn",
    "svm",
    "gnb",
]

#### Version 1

In [48]:
# Compare all tuned models on version "01": one row per model whose result
# file exists in REPO_MODEL. Models without a saved JSON are skipped silently.
NAME_VERSION = "01"

# Collect plain records and build the frame once. Concatenating onto an empty
# DataFrame inside the loop copies the frame on every iteration and makes
# recent pandas versions warn about empty entries in concat.
summary_rows = []
for model in MODELS:
    result_path = os.path.join(REPO_MODEL, model + "__v" + NAME_VERSION + ".json")
    if not os.path.exists(result_path):
        continue
    with open(result_path, "r") as file:
        data = json.load(file)
    summary_rows.append({
        "Name_model": model,
        "Best_score": data["best_score"],
        "Time_to_tune": int(data["time_to_tune"]),  # shown as a whole number
        "Score_kaggle": data["score_kaggle"],
        "Best_params": data["best_params"],
    })

# Building from records already yields a clean 0..n-1 index.
df_results = pd.DataFrame(summary_rows, columns=COLUMNS)

# Highlight the row(s) with the highest Best_score (maximum computed once).
top_score = df_results["Best_score"].max()
df_results.style.apply(
    lambda row: ['background-color: yellow' if row.Best_score == top_score
                 else 'background-color: white'] * df_results.shape[1],
    axis=1,
)

Unnamed: 0,Name_model,Best_score,Time_to_tune,Score_kaggle,Best_params
0,logistic_regression,0.779479,1476,0.78442,"{'logistic__C': 0.01, 'logistic__penalty': 'l2', 'logistic__solver': 'liblinear'}"
1,knn,0.788799,5311,0.78793,"{'knn__n_neighbors': 47, 'knn__p': 2, 'knn__weights': 'uniform'}"
2,svm,0.800532,2623,0.80032,"{'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}"
3,gnb,0.757738,42,0.77016,{'gnb__var_smoothing': 0.811130830789689}


#### Version 2

In [49]:
# Compare all tuned models on version "02": one row per model whose result
# file exists in REPO_MODEL. Models without a saved JSON are skipped silently.
NAME_VERSION = "02"

# Gather the rows first and create the DataFrame in a single step; growing it
# with pd.concat from an empty frame is quadratic and triggers the pandas
# warning about concatenating empty entries.
summary_rows = []
for model in MODELS:
    result_path = os.path.join(REPO_MODEL, model + "__v" + NAME_VERSION + ".json")
    if not os.path.exists(result_path):
        continue
    with open(result_path, "r") as file:
        data = json.load(file)
    summary_rows.append({
        "Name_model": model,
        "Best_score": data["best_score"],
        "Time_to_tune": int(data["time_to_tune"]),  # shown as a whole number
        "Score_kaggle": data["score_kaggle"],
        "Best_params": data["best_params"],
    })

# A frame built from records has a fresh RangeIndex, so reset_index is not
# required.
df_results = pd.DataFrame(summary_rows, columns=COLUMNS)

# Highlight the row(s) with the highest Best_score (maximum computed once).
top_score = df_results["Best_score"].max()
df_results.style.apply(
    lambda row: ['background-color: yellow' if row.Best_score == top_score
                 else 'background-color: white'] * df_results.shape[1],
    axis=1,
)

Unnamed: 0,Name_model,Best_score,Time_to_tune,Score_kaggle,Best_params
0,logistic_regression,0.781549,12903,0.78349,"{'feature_selection__k': 15, 'logistic__C': 0.021544346900318822, 'logistic__l1_ratio': 0.1, 'logistic__penalty': 'elasticnet', 'logistic__solver': 'saga'}"
1,knn,0.790294,64065,0.78442,"{'feature_selection__k': 18, 'knn__n_neighbors': 31, 'knn__p': 1, 'knn__weights': 'uniform'}"
2,svm,0.801222,307954,0.79892,"{'feature_selection__k': 20, 'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}"
3,gnb,0.769933,1177,0.76712,"{'feature_selection__k': 6, 'gnb__var_smoothing': 1.0}"


#### Version 3

In [50]:
# Compare all tuned models on version "03": one row per model whose result
# file exists in REPO_MODEL. Models without a saved JSON are skipped silently.
NAME_VERSION = "03"

# Collect plain records and build the frame once. Concatenating onto an empty
# DataFrame inside the loop copies the frame on every iteration and makes
# recent pandas versions warn about empty entries in concat.
summary_rows = []
for model in MODELS:
    result_path = os.path.join(REPO_MODEL, model + "__v" + NAME_VERSION + ".json")
    if not os.path.exists(result_path):
        continue
    with open(result_path, "r") as file:
        data = json.load(file)
    summary_rows.append({
        "Name_model": model,
        "Best_score": data["best_score"],
        "Time_to_tune": int(data["time_to_tune"]),  # shown as a whole number
        "Score_kaggle": data["score_kaggle"],
        "Best_params": data["best_params"],
    })

# Building from records already yields a clean 0..n-1 index.
df_results = pd.DataFrame(summary_rows, columns=COLUMNS)

# Highlight the row(s) with the highest Best_score (maximum computed once).
top_score = df_results["Best_score"].max()
df_results.style.apply(
    lambda row: ['background-color: yellow' if row.Best_score == top_score
                 else 'background-color: white'] * df_results.shape[1],
    axis=1,
)

Unnamed: 0,Name_model,Best_score,Time_to_tune,Score_kaggle,Best_params
0,logistic_regression,0.781665,57133,0.77928,"{'feature_selection__k': 16, 'logistic__C': 0.21544346900318823, 'logistic__l1_ratio': 0.2, 'logistic__penalty': 'elasticnet', 'logistic__solver': 'saga'}"
1,knn,0.795932,75335,0.78887,"{'feature_selection__k': 13, 'knn__n_neighbors': 48, 'knn__p': 1, 'knn__weights': 'uniform'}"
2,gnb,0.765792,13949,0.7739,"{'feature_selection__k': 4, 'gnb__var_smoothing': 1.0}"


#### Version 4

In [51]:
# Compare all tuned models on version "04": one row per model whose result
# file exists in REPO_MODEL. Models without a saved JSON are skipped silently.
NAME_VERSION = "04"

# Gather the rows first and create the DataFrame in a single step; growing it
# with pd.concat from an empty frame is quadratic and triggers the pandas
# warning about concatenating empty entries.
summary_rows = []
for model in MODELS:
    result_path = os.path.join(REPO_MODEL, model + "__v" + NAME_VERSION + ".json")
    if not os.path.exists(result_path):
        continue
    with open(result_path, "r") as file:
        data = json.load(file)
    summary_rows.append({
        "Name_model": model,
        "Best_score": data["best_score"],
        "Time_to_tune": int(data["time_to_tune"]),  # shown as a whole number
        "Score_kaggle": data["score_kaggle"],
        "Best_params": data["best_params"],
    })

# A frame built from records has a fresh RangeIndex, so reset_index is not
# required.
df_results = pd.DataFrame(summary_rows, columns=COLUMNS)

# Highlight the row(s) with the highest Best_score (maximum computed once).
top_score = df_results["Best_score"].max()
df_results.style.apply(
    lambda row: ['background-color: yellow' if row.Best_score == top_score
                 else 'background-color: white'] * df_results.shape[1],
    axis=1,
)

Unnamed: 0,Name_model,Best_score,Time_to_tune,Score_kaggle,Best_params
0,logistic_regression,0.780629,58302,0.78512,"{'feature_selection__k': 24, 'logistic__C': 0.01, 'logistic__penalty': 'l2', 'logistic__solver': 'liblinear'}"
1,knn,0.795009,105862,0.78723,"{'feature_selection__k': 18, 'knn__n_neighbors': 33, 'knn__p': 1, 'knn__weights': 'uniform'}"
2,svm,0.800187,226415,0.79448,"{'feature_selection__k': 21, 'svm__C': 4.6415888336127775, 'svm__degree': 2, 'svm__kernel': 'poly'}"
3,gnb,0.766482,14452,0.77367,"{'feature_selection__k': 4, 'gnb__var_smoothing': 0.5336699231206302}"


#### Version 5

In [52]:
# Compare all tuned models on version "05": one row per model whose result
# file exists in REPO_MODEL. Models without a saved JSON are skipped silently.
NAME_VERSION = "05"

# Collect plain records and build the frame once. Concatenating onto an empty
# DataFrame inside the loop copies the frame on every iteration and makes
# recent pandas versions warn about empty entries in concat.
summary_rows = []
for model in MODELS:
    result_path = os.path.join(REPO_MODEL, model + "__v" + NAME_VERSION + ".json")
    if not os.path.exists(result_path):
        continue
    with open(result_path, "r") as file:
        data = json.load(file)
    summary_rows.append({
        "Name_model": model,
        "Best_score": data["best_score"],
        "Time_to_tune": int(data["time_to_tune"]),  # shown as a whole number
        "Score_kaggle": data["score_kaggle"],
        "Best_params": data["best_params"],
    })

# Building from records already yields a clean 0..n-1 index.
df_results = pd.DataFrame(summary_rows, columns=COLUMNS)

# Highlight the row(s) with the highest Best_score (maximum computed once).
top_score = df_results["Best_score"].max()
df_results.style.apply(
    lambda row: ['background-color: yellow' if row.Best_score == top_score
                 else 'background-color: white'] * df_results.shape[1],
    axis=1,
)

Unnamed: 0,Name_model,Best_score,Time_to_tune,Score_kaggle,Best_params
0,logistic_regression,0.780745,59880,0.77671,"{'feature_selection__k': 12, 'logistic__C': 2154.434690031878, 'logistic__l1_ratio': 0.5, 'logistic__penalty': 'elasticnet', 'logistic__solver': 'saga'}"
1,knn,0.792481,91760,0.78021,"{'feature_selection__k': 11, 'knn__n_neighbors': 37, 'knn__p': 1, 'knn__weights': 'uniform'}"
2,svm,0.801912,180134,0.79939,"{'feature_selection__k': 23, 'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}"
3,gnb,0.766482,14887,0.77367,"{'feature_selection__k': 4, 'gnb__var_smoothing': 0.5336699231206302}"


#### Version 6

In [53]:
# Compare all tuned models on version "06": one row per model whose result
# file exists in REPO_MODEL. Models without a saved JSON are skipped silently.
NAME_VERSION = "06"

# Gather the rows first and create the DataFrame in a single step; growing it
# with pd.concat from an empty frame is quadratic and triggers the pandas
# warning about concatenating empty entries.
summary_rows = []
for model in MODELS:
    result_path = os.path.join(REPO_MODEL, model + "__v" + NAME_VERSION + ".json")
    if not os.path.exists(result_path):
        continue
    with open(result_path, "r") as file:
        data = json.load(file)
    summary_rows.append({
        "Name_model": model,
        "Best_score": data["best_score"],
        "Time_to_tune": int(data["time_to_tune"]),  # shown as a whole number
        "Score_kaggle": data["score_kaggle"],
        "Best_params": data["best_params"],
    })

# A frame built from records has a fresh RangeIndex, so reset_index is not
# required.
df_results = pd.DataFrame(summary_rows, columns=COLUMNS)

# Highlight the row(s) with the highest Best_score (maximum computed once).
top_score = df_results["Best_score"].max()
df_results.style.apply(
    lambda row: ['background-color: yellow' if row.Best_score == top_score
                 else 'background-color: white'] * df_results.shape[1],
    axis=1,
)

Unnamed: 0,Name_model,Best_score,Time_to_tune,Score_kaggle,Best_params
0,logistic_regression,0.7804,61262,0.78255,"{'feature_selection__k': 15, 'logistic__C': 4641.588833612773, 'logistic__l1_ratio': 0.8, 'logistic__penalty': 'elasticnet', 'logistic__solver': 'saga'}"
1,knn,0.7934,110870,0.78559,"{'feature_selection__k': 13, 'knn__n_neighbors': 49, 'knn__p': 1, 'knn__weights': 'uniform'}"
2,svm,0.800647,209248,0.79939,"{'feature_selection__k': 24, 'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}"
3,gnb,0.766367,14559,0.77437,"{'feature_selection__k': 4, 'gnb__var_smoothing': 0.6579332246575682}"


#### Version 7

In [54]:
# Compare all tuned models on version "07": one row per model whose result
# file exists in REPO_MODEL. Models without a saved JSON are skipped silently.
NAME_VERSION = "07"

# Collect plain records and build the frame once. Concatenating onto an empty
# DataFrame inside the loop copies the frame on every iteration and makes
# recent pandas versions warn about empty entries in concat.
summary_rows = []
for model in MODELS:
    result_path = os.path.join(REPO_MODEL, model + "__v" + NAME_VERSION + ".json")
    if not os.path.exists(result_path):
        continue
    with open(result_path, "r") as file:
        data = json.load(file)
    summary_rows.append({
        "Name_model": model,
        "Best_score": data["best_score"],
        "Time_to_tune": int(data["time_to_tune"]),  # shown as a whole number
        "Score_kaggle": data["score_kaggle"],
        "Best_params": data["best_params"],
    })

# Building from records already yields a clean 0..n-1 index.
df_results = pd.DataFrame(summary_rows, columns=COLUMNS)

# Highlight the row(s) with the highest Best_score (maximum computed once).
top_score = df_results["Best_score"].max()
df_results.style.apply(
    lambda row: ['background-color: yellow' if row.Best_score == top_score
                 else 'background-color: white'] * df_results.shape[1],
    axis=1,
)

Unnamed: 0,Name_model,Best_score,Time_to_tune,Score_kaggle,Best_params
0,logistic_regression,0.780745,62178,0.78209,"{'feature_selection__k': 15, 'logistic__C': 10000.0, 'logistic__penalty': 'l2', 'logistic__solver': 'liblinear'}"
1,knn,0.793631,113486,0.78863,"{'feature_selection__k': 16, 'knn__n_neighbors': 51, 'knn__p': 1, 'knn__weights': 'uniform'}"
2,svm,0.801223,220124,0.80009,"{'feature_selection__k': 24, 'svm__C': 1.0, 'svm__degree': 3, 'svm__kernel': 'poly'}"
3,gnb,0.766482,15604,0.77367,"{'feature_selection__k': 4, 'gnb__var_smoothing': 0.5336699231206302}"


#### Version 8

In [55]:
# Compare all tuned models on version "08": one row per model whose result
# file exists in REPO_MODEL. Models without a saved JSON are skipped silently.
NAME_VERSION = "08"

# Gather the rows first and create the DataFrame in a single step; growing it
# with pd.concat from an empty frame is quadratic and triggers the pandas
# warning about concatenating empty entries.
summary_rows = []
for model in MODELS:
    result_path = os.path.join(REPO_MODEL, model + "__v" + NAME_VERSION + ".json")
    if not os.path.exists(result_path):
        continue
    with open(result_path, "r") as file:
        data = json.load(file)
    summary_rows.append({
        "Name_model": model,
        "Best_score": data["best_score"],
        "Time_to_tune": int(data["time_to_tune"]),  # shown as a whole number
        "Score_kaggle": data["score_kaggle"],
        "Best_params": data["best_params"],
    })

# A frame built from records has a fresh RangeIndex, so reset_index is not
# required.
df_results = pd.DataFrame(summary_rows, columns=COLUMNS)

# Highlight the row(s) with the highest Best_score (maximum computed once).
top_score = df_results["Best_score"].max()
df_results.style.apply(
    lambda row: ['background-color: yellow' if row.Best_score == top_score
                 else 'background-color: white'] * df_results.shape[1],
    axis=1,
)

Unnamed: 0,Name_model,Best_score,Time_to_tune,Score_kaggle,Best_params
0,logistic_regression,0.779479,42744,0.78419,"{'feature_selection__max_features': 23, 'logistic__C': 0.01, 'logistic__penalty': 'l2', 'logistic__solver': 'liblinear'}"
1,knn,0.791214,83657,0.78349,"{'feature_selection__max_features': 11, 'knn__n_neighbors': 63, 'knn__p': 1, 'knn__weights': 'uniform'}"
2,svm,0.801682,97146,0.79962,"{'feature_selection__max_features': 25, 'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}"
3,gnb,0.767173,12574,0.77484,"{'feature_selection__max_features': 4, 'gnb__var_smoothing': 0.3511191734215127}"


#### Version 9

In [56]:
# Compare all tuned models on version "09": one row per model whose result
# file exists in REPO_MODEL. Models without a saved JSON are skipped silently.
NAME_VERSION = "09"

# Collect plain records and build the frame once. Concatenating onto an empty
# DataFrame inside the loop copies the frame on every iteration and makes
# recent pandas versions warn about empty entries in concat.
summary_rows = []
for model in MODELS:
    result_path = os.path.join(REPO_MODEL, model + "__v" + NAME_VERSION + ".json")
    if not os.path.exists(result_path):
        continue
    with open(result_path, "r") as file:
        data = json.load(file)
    summary_rows.append({
        "Name_model": model,
        "Best_score": data["best_score"],
        "Time_to_tune": int(data["time_to_tune"]),  # shown as a whole number
        "Score_kaggle": data["score_kaggle"],
        "Best_params": data["best_params"],
    })

# Building from records already yields a clean 0..n-1 index.
df_results = pd.DataFrame(summary_rows, columns=COLUMNS)

# Highlight the row(s) with the highest Best_score (maximum computed once).
top_score = df_results["Best_score"].max()
df_results.style.apply(
    lambda row: ['background-color: yellow' if row.Best_score == top_score
                 else 'background-color: white'] * df_results.shape[1],
    axis=1,
)

Unnamed: 0,Name_model,Best_score,Time_to_tune,Score_kaggle,Best_params
0,logistic_regression,0.779939,13259,0.78115,"{'feature_selection__max_features': 19, 'logistic__C': 0.021544346900318822, 'logistic__l1_ratio': 0.4, 'logistic__penalty': 'elasticnet', 'logistic__solver': 'saga'}"
1,knn,0.790179,20887,0.7877,"{'feature_selection__max_features': 27, 'knn__n_neighbors': 47, 'knn__p': 2, 'knn__weights': 'uniform'}"
2,svm,0.800532,185765,0.79939,"{'feature_selection__max_features': 26, 'svm__C': 0.46415888336127775, 'svm__degree': 3, 'svm__kernel': 'poly'}"
3,gnb,0.770624,1013,0.77718,"{'feature_selection__max_features': 7, 'gnb__var_smoothing': 0.811130830789689}"


## ------------------------------------------------------------------------------------------------------------

In [57]:
from titanic_spaceship_package.preprocessor import preprocessor
from datetime import datetime

# Fit the package's preprocessor on the training features and keep the
# transformed result as X.
X = preprocessor.fit_transform(X_train)

In [None]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from sklearn.metrics import accuracy_score

In [None]:
def accuracy_metric(y_true, y_pred):
    """Return the accuracy of `y_pred` against `y_true` after thresholding.

    Entries of `y_pred` strictly greater than 0.5 are mapped to class 1 and
    all others to class 0 before being scored with `accuracy_score`.
    """
    predicted_labels = (y_pred > 0.5).astype(int)
    return accuracy_score(y_true, predicted_labels)

# Binary task that is scored with the thresholded-accuracy function
# `accuracy_metric` instead of a built-in metric name.
task = Task('binary', metric = accuracy_metric)

In [None]:
# Column-role mapping: 'Transported' is declared as the target column.
# NOTE(review): presumably passed to the AutoML fit call further down — confirm.
roles = {'target': 'Transported'}